runecomp(2)

This commit is contained in:
Jacob Moody 2023-03-26 01:02:20 +00:00
parent 2163aebcb8
commit 04759ec9af
13 changed files with 1894 additions and 2 deletions

85
lib/ucd/mkfile Normal file
View file

@ -0,0 +1,85 @@
</$objtype/mkfile
# Release of the Unicode Character Database to fetch, and the URL prefix
# that every target name below is appended to.
VERSION='15.0.0'
URL='https://www.unicode.org/Public/'$VERSION'/ucd/'
# File lists used by the group targets (txt, pdf, test, all) further down.
TXT=\
ArabicShaping.txt\
BidiBrackets.txt\
BidiMirroring.txt\
BidiTest.txt\
Blocks.txt\
CJKRadicals.txt\
CaseFolding.txt\
CompositionExclusions.txt\
DerivedAge.txt\
DerivedCoreProperties.txt\
DerivedNormalizationProps.txt\
EastAsianWidth.txt\
EmojiSources.txt\
EquivalentUnifiedIdeograph.txt\
HangulSyllableType.txt\
Index.txt\
IndicPositionalCategory.txt\
IndicSyllabicCategory.txt\
Jamo.txt\
LineBreak.txt\
NameAliases.txt\
NamedSequences.txt\
NamedSequencesProv.txt\
NamesList.txt\
NormalizationCorrections.txt\
NushuSources.txt\
PropList.txt\
PropertyAliases.txt\
PropertyValueAliases.txt\
ScriptExtensions.txt\
Scripts.txt\
SpecialCasing.txt\
StandardizedVariants.txt\
TangutSources.txt\
USourceData.txt\
UnicodeData.txt\
VerticalOrientation.txt\
TEST=\
NormalizationTest.txt\
BidiCharacterTest.txt\
PDF=\
USourceGlyphs.pdf\
USourceRSChart.pdf\
AUX=\
WordBreakProperty.txt\
GraphemeBreakProperty.txt\
# The ucd target depends only on UnicodeData.txt.
ucd:V: UnicodeData.txt
# Generic fetch rules: the target name is appended to $URL.
%.txt:
hget $URL^$target > $target >[2]/dev/null
%.pdf:
hget $URL^$target > $target
# Files fetched from a subdirectory of the release (emoji/ or auxiliary/).
emoji-data.txt:
hget $URL^emoji/^$target > $target
WordBreakProperty.txt:
hget $URL^'auxiliary/'^$target > $target
GraphemeBreakProperty.txt:
hget $URL^'auxiliary/'^$target > $target
WordBreakTest.txt:
hget $URL^'auxiliary/'^$target > $target
GraphemeBreakTest.txt:
hget $URL^'auxiliary/'^$target > $target
# Group targets. NOTE(review): $AUX is not part of any group target.
txt:V: $TXT
pdf:V: $PDF
test:V: $TEST
all:V: $TXT $PDF $TEST

View file

@ -77,6 +77,18 @@ extern Rune* runestrrchr(Rune*, Rune);
extern long runestrlen(Rune*);
extern Rune* runestrstr(Rune*, Rune*);
extern int runecomp(Rune*, Rune*, int);
extern int runedecomp(Rune*, Rune*, int);
extern int utfcomp(char*, char*, int);
extern int utfdecomp(char*, char*, int);
extern char* fullutfnorm(char*,int);
extern Rune* fullrunenorm(Rune*,int);
extern Rune* runewbreak(Rune*);
extern char* utfwbreak(char*);
extern Rune* runegbreak(Rune*);
extern char* utfgbreak(char*);
extern Rune tolowerrune(Rune);
extern Rune totitlerune(Rune);
extern Rune toupperrune(Rune);

View file

@ -48,7 +48,11 @@ The names are self-explanatory.
.PP
The case-conversion routines return the character unchanged if it has no case.
.SH SOURCE
.B /sys/src/libc/port/runetype.c
.B /sys/src/libc/port/mkrunetype.c
.br
.B /sys/src/libc/port/runeistype.c
.br
.B /sys/src/libc/port/runetotype.c
.SH "SEE ALSO
.IR ctype (2) ,
.IR "The Unicode Standard" .

116
sys/man/2/runecomp Normal file
View file

@ -0,0 +1,116 @@
.TH RUNECOMP 2
.SH NAME
runecomp, runedecomp, fullrunenorm, runegbreak, runewbreak, utfcomp, utfdecomp, fullutfnorm, utfgbreak, utfwbreak \- multi-rune graphemes
.SH SYNOPSIS
.ta \w'\fLchar*xx'u
.B #include <u.h>
.br
.B #include <libc.h>
.PP
.B
int runecomp(Rune *dst, Rune *src, int max)
.PP
.B
int runedecomp(Rune *dst, Rune *src, int max)
.PP
.B
Rune* fullrunenorm(Rune *s, int n)
.PP
.B
Rune* runegbreak(Rune *s)
.PP
.B
Rune* runewbreak(Rune *s)
.PP
.B
int utfcomp(char *dst, char *src, int max)
.PP
.B
int utfdecomp(char *dst, char *src, int max)
.PP
.B
char* fullutfnorm(char *s, int n)
.PP
.B
char* utfgbreak(char *s)
.PP
.B
char* utfwbreak(char *s)
.SH DESCRIPTION
These routines help in handling
graphemes that may span multiple runes.
.PP
.IR Runecomp ,
.IR runedecomp ,
.IR utfcomp ,
and
.I utfdecomp
perform Unicode® normalization on
.IR src ,
storing the result in
.IR dst .
No more than
.I max
elements will be written, and the resulting string
will always be null terminated. The return value
is always the total number of elements required to
store the transformation. If this value is larger
than the supplied
.I max
the caller can assume the result has been truncated.
.I Runecomp
and
.I utfcomp
perform NFC normalization while
.I runedecomp
and
.I utfdecomp
perform NFD normalization.
.PP
.I Fullrunenorm
and
.I fullutfnorm
determine if enough elements are present in
.I s
to perform normalization. If enough are present,
a pointer is returned to the first element that begins
the next context. Otherwise
.I s
is returned. No more than
.I n
elements will be read. In order to find the boundary, the
first element of the next context must be peeked.
.PP
.I Runegbreak
and
.I utfgbreak
search
.I s
for the next grapheme break opportunity.
If none is found before the end of the string,
.I s
is returned.
.PP
.I Runewbreak
and
.I utfwbreak
search
.I s
for the next word break opportunity.
If none is found before the end of the string,
.I s
is returned.
.SH SOURCE
.B /sys/src/libc/port/mkrunetype.c
.br
.B /sys/src/libc/port/runenorm.c
.br
.B /sys/src/libc/port/runebreak.c
.SH SEE ALSO
Unicode® Standard Annex #15
.br
Unicode® Standard Annex #29
.br
.IR rune (2) ,
.IR utf (6) ,
.IR tcs (1)

View file

@ -62,6 +62,9 @@ CFILES=\
rand.c\
readn.c\
rune.c\
runebreak.c\
runeistype.c\
runenorm.c\
runestrcat.c\
runestrchr.c\
runestrcmp.c\
@ -74,7 +77,7 @@ CFILES=\
runestrrchr.c\
runestrlen.c\
runestrstr.c\
runetype.c\
runetotype.c\
sin.c\
sinh.c\
sqrt.c\
@ -127,3 +130,26 @@ UPDATE=mkfile\
</sys/src/cmd/mksyslib
profile.$O: /sys/include/tos.h
# Each of these objects includes a generated data file, so it depends on it.
runenorm.$O: runenormdata runenorm.c
runetotype.$O: runetotypedata runetotype.c
runeistype.$O: runeistypedata runeistype.c
runebreak.$O: runebreakdata runebreak.c
# Unicode Character Database inputs; missing ones are fetched via /lib/ucd/mkfile.
UCD=\
/lib/ucd/WordBreakProperty.txt\
/lib/ucd/GraphemeBreakProperty.txt\
/lib/ucd/emoji-data.txt\
/lib/ucd/CompositionExclusions.txt\
/lib/ucd/UnicodeData.txt\
/lib/ucd/%:
cd /lib/ucd && mk $stem
# Build mkrunetype with the variables taken from /$cputype/mkfile and run it
# to produce all four data files.
runenormdata runetotypedata runeistypedata runebreakdata: mkrunetype.c $UCD
@{
eval `{grep '^[A-Z]' /$cputype/mkfile}
$CC $CFLAGS -o mkrunetype.$O mkrunetype.c
$LD $LDFLAGS -o $O.mkrunetype mkrunetype.$O
$O.mkrunetype
}

View file

@ -0,0 +1,748 @@
#include <u.h>
#include <libc.h>
#include <bio.h>
/* Number of code points covered by every per-rune table: 2^21. */
enum{
NRUNES = 1<<21
};
typedef struct Param Param;
typedef struct Lvl Lvl;
/* One level of the three-level lookup: max and mask are derived from bits. */
struct Lvl{
int bits;	/* number of code point bits consumed by this level */
int max;	/* 1<<bits */
int mask;	/* max-1 */
};
/* How the 21 code point bits are split between two index levels and the data level. */
struct Param{
Lvl idx1;
Lvl idx2;
Lvl data;
int round1max;	/* NRUNES/data.max: number of data chunks */
};
/* Fill in the size and bit mask implied by l->bits. */
static void
derive(Lvl *l)
{
	int n;

	n = 1 << l->bits;
	l->max = n;
	l->mask = n - 1;
}
/*
 * Set up p so that idx1 bits select the first index, idx2 bits the
 * second, and whatever is left of the 21 bits selects within a data chunk.
 */
static void
param(Param *p, int idx1, int idx2)
{
	assert(idx1 + idx2 < 21);

	p->idx1.bits = idx1;
	derive(&p->idx1);

	p->idx2.bits = idx2;
	derive(&p->idx2);

	p->data.bits = 21 - (idx1 + idx2);
	derive(&p->data);

	p->round1max = NRUNES / p->data.max;
}
/*
 * Look x up through the three-level table: the top bits index idx1,
 * the middle bits are added to that to index idx2, and the low bits
 * are added to that to index data.
 */
static int
lkup(Param *p, int *idx1, int *idx2, int *data, int x)
{
	int hi, mid, lo;

	hi = (x >> (p->data.bits + p->idx2.bits)) & p->idx1.mask;
	mid = (x >> p->data.bits) & p->idx2.mask;
	lo = x & p->data.mask;
	return data[idx2[idx1[hi] + mid] + lo];
}
/*
 * Emit d[0..len) to fd as a static C array called name, using the
 * narrowest element type that holds every value, and return the
 * array's size in bytes.  With fd < 0 nothing is written and only the
 * size is computed.
 *
 * The signed branch checks the minimum as well as the maximum, so a
 * large negative value can no longer be emitted into a char or short
 * array.  The unsigned limits are inclusive (0xFF fits a uchar).
 */
static int
mkarrvar(int fd, char *name, int *d, int len)
{
	int i, sz;
	int max, min;
	char *t;

	max = min = 0;
	for(i = 0; i < len; i++){
		if(d[i] > max)
			max = d[i];
		if(d[i] < min)
			min = d[i];
	}
	if(min == 0){
		if(max <= 0xFF)
			t = "uchar", sz = 1;
		else if(max <= 0xFFFF)
			t = "ushort", sz = 2;
		else
			t = "uint", sz = 4;
	} else {
		if(min >= -(1<<7) && max < 1<<7)
			t = "char", sz = 1;
		else if(min >= -(1<<15) && max < 1<<15)
			t = "short", sz = 2;
		else
			t = "int", sz = 4;
	}
	if(fd < 0)
		return sz * len;

	fprint(fd, "static\n%s\t%s[%d] =\n{\n\t", t, name, len);
	for(i = 0; i < len; i++){
		fprint(fd, "%d,", d[i]);
		/* 16 values per line */
		if((i+1) % 16 == 0)
			fprint(fd, "\n\t");
	}
	fprint(fd, "\n};\n");
	return sz * len;
}
/*
 * Emit the n triples in d as a static Rune table called name.
 * With all set, each row keeps all three values and rows whose first
 * value is 0 are left out; otherwise only the last two values of every
 * row are written.  Returns n * sizeof(Rune) * 2.
 */
static int
mkexceptarr(int fd, char *name, int *d, int n, int all)
{
	int k, ncol;
	int *row;

	ncol = all ? 3 : 2;
	fprint(fd, "static\nRune %s[][%d] =\n{\n\t", name, ncol);
	for(k = 0; k < n; k++){
		row = d + 3*k;
		if(all){
			if(row[0] != 0)
				fprint(fd, "{0x%X, 0x%X, 0x%X},", row[0], row[1], row[2]);
		} else
			fprint(fd, "{0x%X, 0x%X},", row[1], row[2]);
		/* line break after every eighth source row */
		if((k+1) % 8 == 0)
			fprint(fd, "\n\t");
	}
	fprint(fd, "\n};\n");
	return n * sizeof(Rune) * 2;
}
/*
 * Copy nidx chunks of chunksize ints from src into data, overlapping
 * each chunk with the next one wherever the tail of a chunk equals the
 * head of its successor.  idx[k] receives the offset in data at which
 * chunk k starts (idx[nidx] is the total).  Returns the number of ints
 * stored in data.
 */
static int
compact(int *data, int *idx, int nidx, int *src, int chunksize)
{
	int c, len, keep, overlap, used;
	int *cur, *next;

	used = 0;
	idx[0] = 0;
	cur = src;
	for(c = 0; c < nidx; c++){
		next = cur + chunksize;
		overlap = 0;
		/* the last chunk has no successor to overlap with */
		if(c + 1 < nidx){
			for(len = 1; len <= chunksize; len++)
				if(memcmp(next - len, next, len * sizeof data[0]) == 0)
					overlap = len;
		}
		keep = chunksize - overlap;
		memmove(data + used, cur, keep * sizeof data[0]);
		used += keep;
		idx[c + 1] = idx[c] + keep;
		cur = next;
	}
	return used;
}
static int
mklkup(int fd, char *label, int *map, Param *p)
{
static int data[NRUNES];
static int idx2[NRUNES];
static int idx2dest[NRUNES];
static int idx1[NRUNES];
int i, nidx2, ndata;
int size;
ndata = compact(data, idx2, p->round1max, map, p->data.max);
nidx2 = compact(idx2dest, idx1, p->idx1.max, idx2, p->idx2.max);
if(fd >= 0){
for(i = 0; i < NRUNES; i++)
if(map[i] != lkup(p, idx1, idx2dest, data, i))
sysfatal("mismatch in %s at %d %d %d\n", label, i, map[i], lkup(p, idx1, idx2dest, data, i));
}
size = mkarrvar(fd, smprint("_%sdata", label), data, ndata);
size += mkarrvar(fd, smprint("_%sidx2", label), idx2dest, nidx2);
size += mkarrvar(fd, smprint("_%sidx1", label), idx1, p->idx1.max);
if(fd >= 0){
fprint(fd, "\n");
fprint(fd, "#define %sindex1(x) (((x)>>(%d+%d))&0x%X)\n", label, p->data.bits, p->idx2.bits, p->idx1.mask);
fprint(fd, "#define %sindex2(x) (((x)>>%d)&0x%X)\n", label, p->data.bits, p->idx2.mask);
fprint(fd, "#define %soffset(x) ((x)&0x%X)\n", label, p->data.mask);
fprint(fd, "#define %slkup(x) (_%sdata[_%sidx2[_%sidx1[%sindex1(x)] + %sindex2(x)] + %soffset(x)] )\n\n",
label, label, label, label, label, label, label);
}
return size;
}
/*
 * Try every split with 4..12 first-index bits and at least 4
 * second-index bits (at most 19 index bits in total), report the one
 * giving the smallest tables on standard error, and leave p set to it.
 */
static void
mklkupmatrix(char *label, int *map, Param *p)
{
	int bestsize, size, bestx, besty;
	int x, y;

	bestx = -1;
	besty = -1;
	bestsize = -1;
	x = 4;
	while(x <= 12){
		for(y = 4; x + y <= 19; y++){
			param(p, x, y);
			size = mklkup(-1, label, map, p);
			if(bestsize != -1 && size >= bestsize)
				continue;
			bestx = x;
			besty = y;
			bestsize = size;
		}
		x++;
	}
	assert(bestsize != -1);
	fprint(2, "label: %s best: %d %d (%d)\n", label, bestx, besty, bestsize);
	param(p, bestx, besty);
}
/*
 * Per-code-point tables.  They are filled in by main() and markbreak()
 * and turned into compressed lookup tables by mktables().
 */
static int myismerged[NRUNES];	/* bit set of the is* properties */
static int mytoupper[NRUNES];	/* case mappings; main() converts them to offsets */
static int mytolower[NRUNES];
static int mytotitle[NRUNES];
/* word-break class (values below 0x10) ORed with a grapheme class (multiples of 0x10) */
static int mybreak[NRUNES];
/* a mydecomp entry of (DSTART+n)<<16 refers to row n of mydespecial */
enum{ DSTART = 0xEEEE };
static int mydecomp[NRUNES];	/* first<<16 | second, or a DSTART reference */
static int mydespecial[256*3];	/* rows of code, first, second */
static int nspecial;
static int myccc[NRUNES];	/* combining class field of UnicodeData.txt */
typedef struct KV KV;
/* Recomposition entry: key is first<<16|second, val the composed code point. */
struct KV{
uint key;
uint val;
ushort next;	/* 1-based index into the collision array, 0 for none */
};
static KV myrecomp[2000];
static int nrecomp;
static int recompext[256*3];	/* rows of code, first, second for pairs that do not fit in 16 bits */
static int nrecompext;
/*
 * Integer mixing function used to place recomposition keys.
 * _runerecomp in runenorm.c repeats the same steps and must stay in sync.
 */
static uint
hash(uint x)
{
	uint h;

	h = (x ^ (x >> 16)) * 0x21f0aaad;
	h = (h ^ (h >> 15)) * 0xd35a2d97;
	return h ^ (h >> 15);
}
/*
 * Build the recomposition hash table from myrecomp and write it to fd
 * as _recompdata: nelem(vals) primary slots followed by the collision
 * slots, each slot printed as two uints (key, then val | next<<16).
 * _recompcoll is emitted as a pointer to the first collision slot.
 */
static void
mkrecomp(int fd)
{
int i;
KV *p;
static KV vals[512];
static KV coll[1000];
int over;
int maxchain;
for(i = 0; i < nelem(vals); i++)
vals[i] = (KV){0, 0, 0};
for(i = 0; i < nelem(coll); i++)
coll[i] = (KV){0, 0, 0};
/* over is the 1-based index of the next free collision slot */
over = 1;
for(i = 0; i < nrecomp; i++){
p = vals + (hash(myrecomp[i].key) % nelem(vals));
maxchain = 0;
/* walk the chain to its end, appending a fresh collision slot if needed */
/* NOTE(review): over is not checked against nelem(coll) */
while(p->key != 0){
maxchain++;
if(p->next == 0){
p->next = over;
p = coll + over - 1;
over++;
} else
p = coll + p->next - 1;
}
p->key = myrecomp[i].key;
p->val = myrecomp[i].val;
}
fprint(2, "recomp map [%d][%d]: %d\n", nelem(vals), over-1, (nelem(vals) + over-1) * (4+2+2));
fprint(fd, "static\nuint\t_recompdata[] =\n{\n\t");
/* print all of vals, then the used part of coll */
/* NOTE(review): the stop test assumes at least one collision slot is in use (over >= 2) */
for(p = vals, i = 0;; i++){
assert(p->val < 0xFFFF);
assert(p->next < 0xFFFF);
fprint(fd, "%udU,%udU,", p->key, p->val | (p->next<<16));
if((i+1) % 8 == 0)
fprint(fd, "\n\t");
if(p == vals+nelem(vals)-1)
p = coll;
else if(p == coll + over - 2)
break;
else
p++;
}
fprint(fd, "\n};\n");
fprint(fd, "static uint *_recompcoll = _recompdata+%d*2;\n", nelem(vals));
}
/*
 * Write the four generated data files into the current directory:
 * runetotypedata (case mapping offsets), runeistypedata (merged
 * property bits and their L* names), runenormdata (decomposition,
 * combining class, exception and recomposition tables) and
 * runebreakdata (break classes).  The size of each table is reported
 * on standard error.  Every file is closed once written, including
 * runebreakdata, which was previously left open.
 */
static void
mktables(void)
{
	Param p;
	int tofd, isfd, normfd, breakfd;
	int size;

	tofd = create("runetotypedata", OWRITE, 0664);
	if(tofd < 0)
		sysfatal("could not create runetotypedata: %r");
	param(&p, 10, 7);
	size = mklkup(tofd, "upper", mytoupper, &p);
	fprint(2, "%s: %d\n", "upper", size);
	size = mklkup(tofd, "lower", mytolower, &p);
	fprint(2, "%s: %d\n", "lower", size);
	size = mklkup(tofd, "title", mytotitle, &p);
	fprint(2, "%s: %d\n", "title", size);
	close(tofd);

	isfd = create("runeistypedata", OWRITE, 0664);
	if(isfd < 0)
		sysfatal("could not create runeistypedata: %r");
	param(&p, 11, 6);
	size = mklkup(isfd, "merged", myismerged, &p);
	fprint(2, "%s: %d\n", "merged", size);
	/* bit names; the values must match the bits main() packs into myismerged */
	fprint(isfd, "static\nenum {\n");
	fprint(isfd, "\tL%s = %s,\n", "space", "1<<0");
	fprint(isfd, "\tL%s = %s,\n", "alpha", "1<<1");
	fprint(isfd, "\tL%s = %s,\n", "digit", "1<<2");
	fprint(isfd, "\tL%s = %s,\n", "upper", "1<<3");
	fprint(isfd, "\tL%s = %s,\n", "lower", "1<<4");
	fprint(isfd, "\tL%s = %s,\n", "title", "1<<5");
	fprint(isfd, "};\n");
	close(isfd);

	normfd = create("runenormdata", OWRITE, 0664);
	if(normfd < 0)
		sysfatal("could not create runenormdata: %r");
	param(&p, 10, 7);
	size = mklkup(normfd, "decomp", mydecomp, &p);
	fprint(2, "%s: %d\n", "decomp", size);
	param(&p, 9, 7);
	size = mklkup(normfd, "ccc", myccc, &p);
	fprint(2, "%s: %d\n", "ccc", size);
	mkexceptarr(normfd, "_decompexceptions", mydespecial, nspecial, 0);
	mkexceptarr(normfd, "_recompexceptions", recompext, nrecompext, 1);
	mkrecomp(normfd);
	close(normfd);

	param(&p, 10, 6);
	breakfd = create("runebreakdata", OWRITE, 0644);
	if(breakfd < 0)
		sysfatal("could not create runebreakdata: %r");
	size = mklkup(breakfd, "break", mybreak, &p);
	fprint(2, "%s: %d\n", "break", size);
	close(breakfd);
}
/* Indices of the semicolon-separated fields of a UnicodeData.txt line. */
enum {
FIELD_CODE,
FIELD_NAME,
FIELD_CATEGORY,
FIELD_COMBINING,
FIELD_BIDIR,
FIELD_DECOMP,
FIELD_DECIMAL_DIG,
FIELD_DIG,
FIELD_NUMERIC_VAL,
FIELD_MIRRORED,
FIELD_UNICODE_1_NAME,
FIELD_COMMENT,
FIELD_UPPER,
FIELD_LOWER,
FIELD_TITLE,
NFIELDS,	/* number of fields; getunicodeline() insists on exactly this many */
};
/*
 * Read the next line from in and split it in place at ';' into fields.
 * Returns 0 at end of input and 1 otherwise; a line that does not have
 * exactly NFIELDS fields is fatal.
 */
static int
getunicodeline(Biobuf *in, char **fields)
{
	char *line;
	int nf;

	line = Brdline(in, '\n');
	if(line == nil)
		return 0;
	/* replace the newline with a terminator */
	line[Blinelen(in)-1] = '\0';
	nf = getfields(line, fields, NFIELDS + 1, 0, ";");
	if(nf != NFIELDS)
		sysfatal("bad number of fields");
	return 1;
}
/* Convert the number at the start of s in the given base; fatal if there are no digits. */
static int
estrtoul(char *s, int base)
{
	char *end;
	Rune r;

	r = strtoul(s, &end, base);
	if(end == s)
		sysfatal("bad code point hex string");
	return r;
}
/*
 * Break classes stored in mybreak.  The first group (values 0..12) is
 * assigned from WordBreakProperty.txt; the second group (multiples of
 * 0x10) is ORed in from GraphemeBreakProperty.txt and emoji-data.txt.
 * runebreak.c repeats these values and must be kept in sync.
 */
enum {
OTHER,
Hebrew_Letter, Newline, Extend, Format,
Katakana, ALetter, MidLetter, MidNum,
MidNumLet, Numeric, ExtendNumLet, WSegSpace,
PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40,
L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,
EMOJIEX = 0xB0,
};
static void
markbreak(void)
{
Biobuf *b;
char *p, *dot;
int i, s, e;
uchar v;
b = Bopen("/lib/ucd/WordBreakProperty.txt", OREAD);
if(b == nil)
sysfatal("could not load word breaks: %r");
while((p = Brdline(b, '\n')) != nil){
p[Blinelen(b)-1] = 0;
if(p[0] == 0 || p[0] == '#')
continue;
if((dot = strstr(p, "..")) != nil){
*dot = 0;
dot += 2;
s = estrtoul(p, 16);
e = estrtoul(dot, 16);
} else {
s = e = estrtoul(p, 16);
dot = p;
}
v = 0;
if(strstr(dot, "ExtendNumLet") != nil)
v = ExtendNumLet;
else if(strstr(dot, "Hebrew_Letter") != nil)
v = Hebrew_Letter;
else if(strstr(dot, "Newline") != nil)
v = Newline;
else if(strstr(dot, "Extend") != nil)
v = Extend;
else if(strstr(dot, "Format") != nil)
v = Format;
else if(strstr(dot, "Katakana") != nil)
v = Katakana;
else if(strstr(dot, "ALetter") != nil)
v = ALetter;
else if(strstr(dot, "MidLetter") != nil)
v = MidLetter;
else if(strstr(dot, "MidNum") != nil)
v = MidNum;
else if(strstr(dot, "Numeric") != nil)
v = Numeric;
else if(strstr(dot, "WSegSpace") != nil)
v = WSegSpace;
for(i = s; i <= e; i++)
mybreak[i] = v;
}
Bterm(b);
b = Bopen("/lib/ucd/GraphemeBreakProperty.txt", OREAD);
if(b == nil)
sysfatal("could not load Grapheme breaks: %r");
while((p = Brdline(b, '\n')) != nil){
p[Blinelen(b)-1] = 0;
if(p[0] == 0 || p[0] == '#')
continue;
if((dot = strstr(p, "..")) != nil){
*dot = 0;
dot += 2;
s = estrtoul(p, 16);
e = estrtoul(dot, 16);
} else {
s = e = estrtoul(p, 16);
dot = p;
}
v = 0;
if(strstr(dot, "; Prepend #") != nil)
v = PREPEND;
else if(strstr(dot, "; Control #") != nil)
v = CONTROL;
else if(strstr(dot, "; Extend #") != nil)
v = EXTEND;
else if(strstr(dot, "; Regional_Indicator #") != nil)
v = REGION;
else if(strstr(dot, "; SpacingMark #") != nil)
v = SPACEMK;
else if(strstr(dot, "; L #") != nil)
v = L;
else if(strstr(dot, "; V #") != nil)
v = V;
else if(strstr(dot, "; T #") != nil)
v = T;
else if(strstr(dot, "; LV #") != nil)
v = LV;
else if(strstr(dot, "; LVT #") != nil)
v = LVT;
for(i = s; i <= e; i++)
mybreak[i] |= v;
}
Bterm(b);
b = Bopen("/lib/ucd/emoji-data.txt", OREAD);
if(b == nil)
sysfatal("could not load emoji-data: %r");
while((p = Brdline(b, '\n')) != nil){
p[Blinelen(b)-1] = 0;
if(p[0] == 0 || p[0] == '#')
continue;
if((dot = strstr(p, "..")) != nil){
*dot = 0;
dot += 2;
s = estrtoul(p, 16);
e = estrtoul(dot, 16);
} else {
s = e = estrtoul(p, 16);
dot = p;
}
v = 0;
if(strstr(dot, "; Extended_Pictographic") != nil)
v = EMOJIEX;
for(i = s; i <= e; i++)
mybreak[i] |= v;
}
Bterm(b);
}
/*
 * Read CompositionExclusions.txt and disable recomposition for every
 * code point listed: the matching myrecomp entry gets val 0, or failing
 * that the matching recompext row gets its code zeroed.
 */
static void
markexclusions(void)
{
	Biobuf *b;
	char *line;
	int i, found;
	uint x;

	b = Bopen("/lib/ucd/CompositionExclusions.txt", OREAD);
	if(b == nil)
		sysfatal("could not load composition exclusions: %r");
	for(;;){
		line = Brdline(b, '\n');
		if(line == nil)
			break;
		line[Blinelen(b)-1] = 0;
		if(line[0] == 0 || line[0] == '#')
			continue;
		x = estrtoul(line, 16);
		found = 0;
		/* only the first matching entry is cleared */
		for(i = 0; i < nrecomp && !found; i++)
			if(myrecomp[i].val == x){
				myrecomp[i].val = 0;
				found = 1;
			}
		for(i = 0; i < nrecompext && !found; i++)
			if(recompext[i*3] == x){
				recompext[i*3] = 0;
				found = 1;
			}
	}
	Bterm(b);
}
/*
 * Read /lib/ucd/UnicodeData.txt, build the per-code-point property,
 * case, decomposition and combining class tables, apply the composition
 * exclusions and break properties, and write the generated data files.
 * NOTE(review): the second parameter is declared char, not char**;
 * neither argument is used.
 */
void
main(int, char)
{
static char myisspace[NRUNES];
static char myisalpha[NRUNES];
static char myisdigit[NRUNES];
static char myisupper[NRUNES];
static char myislower[NRUNES];
static char myistitle[NRUNES];
Biobuf *in;
char *fields[NFIELDS + 1], *fields2[NFIELDS + 1];
char *p, *d;
int i, code, last;
int decomp[2], *ip;
in = Bopen("/lib/ucd/UnicodeData.txt", OREAD);
if(in == nil)
sysfatal("can't open UnicodeData.txt: %r");
/* -1 marks a case mapping as not defined */
for(i = 0; i < NRUNES; i++){
mytoupper[i] = -1;
mytolower[i] = -1;
mytotitle[i] = -1;
mydecomp[i] = 0;
myccc[i] = 0;
mybreak[i] = 0;
}
/* characters treated as space regardless of their category */
myisspace['\t'] = 1;
myisspace['\n'] = 1;
myisspace['\r'] = 1;
myisspace['\f'] = 1;
myisspace['\v'] = 1;
myisspace[0x85] = 1; /* control char, "next line" */
myisspace[0xfeff] = 1; /* zero-width non-break space */
last = -1;
nspecial = nrecomp = nrecompext = 0;
while(getunicodeline(in, fields)){
code = estrtoul(fields[FIELD_CODE], 16);
if (code >= NRUNES)
sysfatal("code-point value too big: %x", code);
if(code <= last)
sysfatal("bad code sequence: %x then %x", last, code);
last = code;
p = fields[FIELD_CATEGORY];
/* a "<..., First>" line is paired with a "<..., Last>" line; last becomes the end of the range */
if(strstr(fields[FIELD_NAME], ", First>") != nil){
if(!getunicodeline(in, fields2))
sysfatal("range start at eof");
if (strstr(fields2[FIELD_NAME], ", Last>") == nil)
sysfatal("range start not followed by range end");
last = estrtoul(fields2[FIELD_CODE], 16);
if(last <= code)
sysfatal("range out of sequence: %x then %x", code, last);
if(strcmp(p, fields2[FIELD_CATEGORY]) != 0)
sysfatal("range with mismatched category");
}
/* only decompositions without a "<tag>" are recorded */
d = fields[FIELD_DECOMP];
if(strlen(d) > 0 && strstr(d, "<") == nil){
decomp[0] = estrtoul(d, 16);
d = strstr(d, " ");
if(d == nil){
/* singleton recompositions are verboden */
decomp[1] = 0;
/* values above 16 bits go to mydespecial and are referenced through DSTART */
/* NOTE(review): nspecial, nrecompext and nrecomp are not checked against their array sizes */
if(decomp[0] > 0xFFFF){
ip = mydespecial + nspecial*3;
ip[0] = code;
ip[1] = decomp[0];
ip[2] = 0;
mydecomp[code] = (DSTART+nspecial)<<16;
nspecial++;
} else
mydecomp[code] = decomp[0]<<16;
} else {
d++;
decomp[1] = estrtoul(d, 16);
if(decomp[0] > 0xFFFF || decomp[1] > 0xFFFF){
ip = mydespecial + nspecial*3;
ip[0] = code;
ip[1] = decomp[0];
ip[2] = decomp[1];
mydecomp[code] = (DSTART+nspecial)<<16;
nspecial++;
ip = recompext + nrecompext*3;
ip[0] = code;
ip[1] = decomp[0];
ip[2] = decomp[1];
nrecompext++;
} else {
/* both halves fit in 16 bits: pack them and remember the pair for recomposition */
mydecomp[code] = decomp[0]<<16 | decomp[1];
myrecomp[nrecomp++] = (KV){decomp[0]<<16 | decomp[1], code, 0};
}
}
}
/* apply category, case mappings and combining class to code..last */
for (; code <= last; code++){
if(p[0] == 'L')
myisalpha[code] = 1;
if(p[0] == 'Z')
myisspace[code] = 1;
if(strcmp(p, "Lu") == 0)
myisupper[code] = 1;
if(strcmp(p, "Ll") == 0)
myislower[code] = 1;
if(strcmp(p, "Lt") == 0)
myistitle[code] = 1;
if(strcmp(p, "Nd") == 0)
myisdigit[code] = 1;
if(fields[FIELD_UPPER][0] != '\0')
mytoupper[code] = estrtoul(fields[FIELD_UPPER], 16);
if(fields[FIELD_LOWER][0] != '\0')
mytolower[code] = estrtoul(fields[FIELD_LOWER], 16);
if(fields[FIELD_TITLE][0] != '\0')
mytotitle[code] = estrtoul(fields[FIELD_TITLE], 16);
myccc[code] = estrtoul(fields[FIELD_COMBINING], 10);
}
}
Bterm(in);
markexclusions();
/*
* according to standard, if totitle(x) is not defined in ucd
* but toupper(x) is, then totitle is defined to be toupper(x)
*/
for(i = 0; i < NRUNES; i++){
if(mytotitle[i] == -1
&& mytoupper[i] != -1
&& !myistitle[i])
mytotitle[i] = mytoupper[i];
}
/*
* A couple corrections:
* is*(to*(x)) should be true.
* restore undefined transformations.
* store offset instead of value, makes them sparse.
*/
for(i = 0; i < NRUNES; i++){
if(mytoupper[i] != -1)
myisupper[mytoupper[i]] = 1;
else
mytoupper[i] = i;
if(mytolower[i] != -1)
myislower[mytolower[i]] = 1;
else
mytolower[i] = i;
if(mytotitle[i] != -1)
myistitle[mytotitle[i]] = 1;
else
mytotitle[i] = i;
mytoupper[i] = mytoupper[i] - i;
mytolower[i] = mytolower[i] - i;
mytotitle[i] = mytotitle[i] - i;
}
/* pack the six is* flags into one byte; bit order matches the L* enum written by mktables() */
uchar b;
for(i = 0; i < NRUNES; i++){
b = 0;
if(myisspace[i])
b |= 1<<0;
if(myisalpha[i])
b |= 1<<1;
if(myisdigit[i])
b |= 1<<2;
if(myisupper[i])
b |= 1<<3;
if(myislower[i])
b |= 1<<4;
if(myistitle[i])
b |= 1<<5;
myismerged[i] = b;
}
markbreak();
mktables();
exits(nil);
}

View file

@ -0,0 +1,293 @@
#include <u.h>
#include <libc.h>
#include "runebreakdata"
/*
 * Break classes as packed into the generated break table: IS() tests
 * the low nibble against the first group (values 0..12) and ISG() tests
 * the high nibble against the second group (multiples of 0x10).
 * ZWJ and LINETAB are code points, not classes.
 */
enum {
OTHER,
Hebrew_Letter, Newline, Extend, Format,
Katakana, ALetter, MidLetter, MidNum,
MidNumLet, Numeric, ExtendNumLet, WSegSpace,
PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40,
L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,
EMOJIEX = 0xB0,
ZWJ = 0x200DU,
LINETAB = 0xB,
};
/*
 * IS tests the low nibble of break byte x against y; ISG tests the high
 * nibble.  Arguments are parenthesized so that expressions such as
 * a|b can be passed safely.
 */
#define IS(x, y) (((x)&0xf) == (y))
#define ISG(x, y) (((x)&0xf0) == (y))
/*
 * Decide whether a grapheme break may occur between the first two runes
 * of s.  If so, a pointer to the second rune is returned.  If the pair
 * must stay together, the pointer just past the runes consumed is
 * returned.  s itself is returned whenever the string ends before a
 * break position is reached.
 */
Rune*
runegbreak(Rune *s)
{
	Rune l, r;
	uchar lt, rt;
	Rune *p;

	p = s;
	if((l = *p++) == 0)
		return s;
	if((r = *p) == 0)
		return s;
	lt = breaklkup(l);
	rt = breaklkup(r);

	/* CR LF stays together; any other control or line ending breaks on both sides */
	if(l == '\r' && r == '\n')
		goto Joined;
	if(ISG(lt, CONTROL) || l == '\r' || l == '\n')
		return p;
	if(ISG(rt, CONTROL) || r == '\r' || r == '\n')
		return p;

	/* Hangul syllable sequences */
	if(ISG(lt, L) && (ISG(rt, L) || ISG(rt, V) || ISG(rt, LV) || ISG(rt, LVT)))
		goto Joined;
	if((ISG(lt, LV) || ISG(lt, V)) && (ISG(rt, V) || ISG(rt, T)))
		goto Joined;
	if((ISG(lt, LVT) || ISG(lt, T)) && ISG(rt, T))
		goto Joined;

	if(ISG(rt, SPACEMK) || ISG(lt, PREPEND))
		goto Joined;

	/* pictographic, any number of Extend, ZWJ, pictographic */
	if(ISG(lt, EMOJIEX) && (ISG(rt, EXTEND) || r == ZWJ)){
		while(ISG(rt, EXTEND)){
			p++;
			if((r = *p) == 0)
				return s;
			rt = breaklkup(r);
		}
		if(r != ZWJ)
			return p;
		p++;
		if((r = *p) == 0)
			return s;
		rt = breaklkup(r);
		if(ISG(rt, EMOJIEX))
			goto Joined;
		return p;
	}

	if(ISG(rt, EXTEND) || r == ZWJ)
		goto Joined;
	if(ISG(lt, REGION) && ISG(rt, REGION))
		goto Joined;
	return p;

Joined:
	/* step over the right-hand rune; nothing after it means no break was found */
	if(p[1] == 0)
		return s;
	return p + 1;
}
/*
 * UTF-8 counterpart of runegbreak: the same pairwise decision, made on
 * runes decoded from s with chartorune.  Returns a pointer to the byte
 * that starts the rune after the position found, or s if the string
 * ends first.
 */
char*
utfgbreak(char *s)
{
	Rune l, r;
	uchar lt, rt;
	char *p;

	p = s;
	p += chartorune(&l, p);
	if(l == 0)
		return s;
	chartorune(&r, p);
	if(r == 0)
		return s;
	lt = breaklkup(l);
	rt = breaklkup(r);

	/* CR LF stays together; any other control or line ending breaks on both sides */
	if(l == '\r' && r == '\n')
		goto Joined;
	if(ISG(lt, CONTROL) || l == '\r' || l == '\n')
		return p;
	if(ISG(rt, CONTROL) || r == '\r' || r == '\n')
		return p;

	/* Hangul syllable sequences */
	if(ISG(lt, L) && (ISG(rt, L) || ISG(rt, V) || ISG(rt, LV) || ISG(rt, LVT)))
		goto Joined;
	if((ISG(lt, LV) || ISG(lt, V)) && (ISG(rt, V) || ISG(rt, T)))
		goto Joined;
	if((ISG(lt, LVT) || ISG(lt, T)) && ISG(rt, T))
		goto Joined;

	if(ISG(rt, SPACEMK) || ISG(lt, PREPEND))
		goto Joined;

	/* pictographic, any number of Extend, ZWJ, pictographic */
	if(ISG(lt, EMOJIEX) && (ISG(rt, EXTEND) || r == ZWJ)){
		while(ISG(rt, EXTEND)){
			/* skip the rune at p, then decode the one after it */
			p += chartorune(&r, p);
			chartorune(&r, p);
			if(r == 0)
				return s;
			rt = breaklkup(r);
		}
		if(r != ZWJ)
			return p;
		p += chartorune(&r, p);
		chartorune(&r, p);
		if(r == 0)
			return s;
		rt = breaklkup(r);
		if(ISG(rt, EMOJIEX))
			goto Joined;
		return p;
	}

	if(ISG(rt, EXTEND) || r == ZWJ)
		goto Joined;
	if(ISG(lt, REGION) && ISG(rt, REGION))
		goto Joined;
	return p;

Joined:
	/* step over the right-hand rune; nothing after it means no break was found */
	p += chartorune(&r, p);
	chartorune(&r, p);
	if(r == 0)
		return s;
	return p;
}
/* AH: break byte x is ALetter or Hebrew_Letter. */
#define AH(x) (IS(x, ALetter) || IS(x, Hebrew_Letter))
/*
 * MNLQ: x is MidNumLet, or x equals the apostrophe character.
 * NOTE(review): the first half expects a break byte and the second half
 * a rune, so no single argument satisfies both; confirm what is intended.
 */
#define MNLQ(x) (IS(x, MidNumLet) || x == '\'')
/*
 * Decide whether a word break may occur between the first two runes of
 * s.  If so, a pointer to the second rune is returned.  If the pair
 * must stay together, the pointer just past the runes consumed is
 * returned.  s itself is returned whenever the string ends before a
 * break position is reached.
 *
 * Two fixes over the previous version:
 *  - the break-before test checks r, not l, against LINETAB (l had
 *    already been tested on the line above, so r was never checked);
 *  - the MidNumLet-or-apostrophe test compares the rune r with '\'',
 *    not its break class byte, which made the apostrophe half dead.
 */
Rune*
runewbreak(Rune *s)
{
	Rune l, r;
	uchar lt, rt;
	Rune *p;
	int mnlq;

	p = s;
	if((l = *p++) == 0)
		return s;
	if((r = *p) == 0)
		return s;
	lt = breaklkup(l);
	rt = breaklkup(r);
	/* MidNumLet or a single quote (cf. MidNumLetQ in UAX #29) */
	mnlq = IS(rt, MidNumLet) || r == '\'';

	if(l == '\r' && r == '\n')
		goto Done;
	/* always break after and before a line ending */
	if(l == '\r' || l == '\n' || l == LINETAB)
		return p;
	if(r == '\r' || r == '\n' || r == LINETAB)
		return p;
	if(IS(lt, WSegSpace) && IS(rt, WSegSpace))
		goto Done;
	if(IS(rt, Format) || IS(rt, Extend))
		goto Done;
	if(AH(lt)){
		if(AH(rt))
			goto Done;
		/* letter, mid-word punctuation, letter: needs one rune of lookahead */
		if((IS(rt, MidLetter) || mnlq) && p[1] != 0 && AH(breaklkup(p[1])))
			goto Done;
		if(IS(lt, Hebrew_Letter) && r == '\'')
			goto Done;
		if(IS(lt, Hebrew_Letter) && r == '"' && p[1] != 0 && IS(breaklkup(p[1]), Hebrew_Letter))
			goto Done;
		if(IS(rt, Numeric))
			goto Done;
	}
	if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric)))
		goto Done;
	/* digit, mid-number punctuation, digit */
	if(IS(lt, Numeric) && (IS(rt, MidNum) || mnlq) && p[1] != 0 && IS(breaklkup(p[1]), Numeric))
		goto Done;
	if(IS(lt, Katakana) && IS(rt, Katakana))
		goto Done;
	if(AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet))
		if(IS(rt, ExtendNumLet))
			goto Done;
	if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana)))
		goto Done;
	if(ISG(lt, REGION)){
		if(ISG(rt, REGION))
			goto Done;
		if(r != ZWJ)
			return p;
		p++;
		if((r = *p) == 0)
			return s;
		rt = breaklkup(r);
		if(ISG(rt, REGION))
			goto Done;
	}
	return p;

Done:
	/* step over the right-hand rune; nothing after it means no break was found */
	if(p[1] == 0)
		return s;
	return p + 1;
}
char*
utfwbreak(char *s)
{
Rune l, r;
Rune peek;
uchar lt, rt;
char *p;
p = s;
p += chartorune(&l, p);
if(l == 0)
return s;
chartorune(&peek, p+chartorune(&r, p));
if(r == 0)
return s;
lt = breaklkup(l);
rt = breaklkup(r);
if(l == '\r' && r == '\n')
goto Done;
if(l == '\r' || l == '\n' || l == LINETAB)
return p;
if(r == '\r' || r == '\n' || l == LINETAB)
return p;
if(IS(lt, WSegSpace) && IS(rt, WSegSpace))
goto Done;
if(IS(rt, Format) || IS(rt, Extend))
goto Done;
if(AH(lt)){
if(AH(rt))
goto Done;
if(IS(rt, MidLetter) || MNLQ(rt))
if(peek != 0 && AH(breaklkup(peek)))
goto Done;
if(IS(lt, Hebrew_Letter) && r == '\'')
goto Done;
if(IS(lt, Hebrew_Letter) && r == '"')
if(peek != 0 && IS(breaklkup(peek), Hebrew_Letter))
goto Done;
if(IS(rt, Numeric))
goto Done;
}
if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric)))
goto Done;
if(IS(lt, Numeric) && (IS(rt, MidNum) || MNLQ(rt)) && peek != 0 && IS(breaklkup(peek), Numeric))
goto Done;
if(IS(lt, Katakana) && IS(rt, Katakana))
goto Done;
if(AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet))
if(IS(rt, ExtendNumLet))
goto Done;
if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana)))
goto Done;
if(ISG(lt, REGION)){
if(ISG(rt, REGION))
goto Done;
if(r != ZWJ)
return p;
p += chartorune(&r, p);
chartorune(&r, p);
if(r == 0)
return s;
rt = breaklkup(r);
if(ISG(rt, REGION))
goto Done;
}
return p;
Done:
p += chartorune(&r, p);
chartorune(&r, p);
if(r == 0)
return s;
return p;
}

View file

@ -0,0 +1,40 @@
#include <u.h>
#include <libc.h>
#include "runeistypedata"
/* Report whether the Lspace bit is set for c in the generated property table. */
int
isspacerune(Rune c)
{
	return (mergedlkup(c) & Lspace) != 0;
}
/* Report whether the Lalpha bit is set for c in the generated property table. */
int
isalpharune(Rune c)
{
	return (mergedlkup(c) & Lalpha) != 0;
}
/* Report whether the Ldigit bit is set for c in the generated property table. */
int
isdigitrune(Rune c)
{
	return (mergedlkup(c) & Ldigit) != 0;
}
/* Report whether the Lupper bit is set for c in the generated property table. */
int
isupperrune(Rune c)
{
	return (mergedlkup(c) & Lupper) != 0;
}
/* Report whether the Llower bit is set for c in the generated property table. */
int
islowerrune(Rune c)
{
	return (mergedlkup(c) & Llower) != 0;
}
/* Report whether the Ltitle bit is set for c in the generated property table. */
int
istitlerune(Rune c)
{
	return (mergedlkup(c) & Ltitle) != 0;
}

View file

@ -0,0 +1,334 @@
#include <u.h>
#include <libc.h>
#include "runenormdata"
//Unicode Standard: Section 3.12 Conjoining Jamo Behavior
//Constants for the arithmetic Hangul syllable (de)composition below:
//bases and counts of the precomposed syllables (S), leading consonants (L),
//vowels (V) and trailing consonants (T), plus the last code point of each range.
//TBase itself is not a trailing consonant; the code tests for > TBase.
enum {
SBase = 0xAC00,
LBase = 0x1100,
VBase = 0x1161,
TBase = 0x11A7,
LCount = 19,
VCount = 21,
TCount = 28,
NCount = VCount * TCount,
SCount = LCount * NCount,
LLast = LBase + LCount - 1,
SLast = SBase + SCount - 1,
VLast = VBase + VCount - 1,
TLast = TBase + TCount - 1,
};
//Decompose c by one step into dst[0] and dst[1].
//dst[1] is 0 when the decomposition is a single rune, and both are 0
//when the table holds no decomposition for c.
//Hangul syllables are split arithmetically: LVT into LV and T, LV into L and V.
static void
_runedecomp(Rune dst[2], Rune c)
{
	uint packed, hi, lo, sidx, tidx;

	if(c >= SBase && c <= SLast){
		sidx = c - SBase;
		tidx = sidx % TCount;
		if(tidx != 0){
			dst[0] = SBase + (sidx - tidx);
			dst[1] = TBase + tidx;
		}else{
			dst[0] = LBase + sidx / NCount;
			dst[1] = VBase + (sidx % NCount) / TCount;
		}
		return;
	}

	packed = decomplkup(c);
	hi = packed >> 16;
	lo = packed & 0xFFFF;
	if(lo != 0){
		dst[0] = hi;
		dst[1] = lo;
		return;
	}
	//values from 0xEEEE up to 0xF8FF are indices into the exception table
	if(hi >= 0xEEEE && hi < 0xF8FF){
		memmove(dst, _decompexceptions[hi - 0xEEEE], sizeof(Rune)*2);
		return;
	}
	dst[0] = hi;
	dst[1] = 0;
}
//Return the rune that r[0] followed by r[1] composes to, or 0 if the table has none.
//Hangul is composed arithmetically, pairs with a rune above 0xFFFF are
//searched linearly in _recompexceptions, and everything else goes
//through the generated hash table.
static Rune
_runerecomp(Rune r[2])
{
	uint key, h, i, link, *slot;
	Rune a, b;

	a = r[0];
	b = r[1];
	if(a >= LBase && a <= LLast){
		if(b < VBase || b > VLast)
			return 0;
		return SBase + ((a - LBase) * NCount + (b - VBase) * TCount);
	}
	if(a >= SBase && a <= SLast && (a - SBase) % TCount == 0){
		if(b > TBase && b <= TLast)
			return a + (b - TBase);
		return 0;
	}
	if(a > 0xFFFF || b > 0xFFFF){
		for(i = 0; i < nelem(_recompexceptions); i++)
			if(a == _recompexceptions[i][1] && b == _recompexceptions[i][2])
				return _recompexceptions[i][0];
		return 0;
	}

	key = a<<16 | b;
	//same mixing steps and bucket count as the table generator
	h = key;
	h ^= h >> 16;
	h *= 0x21f0aaad;
	h ^= h >> 15;
	h *= 0xd35a2d97;
	h ^= h >> 15;
	slot = _recompdata + (h%512)*2;
	for(;;){
		if(slot[0] == key)
			return slot[1] & 0xFFFF;
		//the high half of the second word links to the next collision slot, 1-based
		link = slot[1]>>16;
		if(link == 0)
			return 0;
		slot = _recompcoll + (link-1)*2;
	}
}
//Put a[0..len) into canonical order: adjacent runes are exchanged while
//the left one has a higher combining class than the right one and the
//right one's class is not 0.
//The old test was written "x > y > 0", which C parses as (x > y) > 0,
//so the "right class is not 0" half was never applied and a rune with
//class 0 could be moved.
static void
runecccsort(Rune *a, int len)
{
	Rune r;
	int i;
	int fail;
	int lc, rc;

	do {
		fail = 0;
		for(i = 0; i < len - 1; i++){
			lc = ccclkup(a[i]);
			rc = ccclkup(a[i+1]);
			if(lc > rc && rc > 0){
				r = a[i];
				a[i] = a[i+1];
				a[i + 1] = r;
				fail = 1;
			}
		}
	} while(fail);
}
//Determine whether the first n bytes of s hold a complete normalization
//context.  If they do, a pointer to the first byte of the rune that
//begins the next context is returned; otherwise s is returned.
//
//Fixes over the previous version:
// - the returned pointer now addresses the rune that starts the next
//   context, as fullrunenorm does, not the byte after that rune;
// - the jamo test is parenthesized so the byte limit applies to both ranges.
char*
fullutfnorm(char *s, int n)
{
	Rune r, peek;
	char *p;
	int w, hangul;

	if(fullrune(s, n) == 0)
		return s;
	w = chartorune(&r, s);
	p = s + w;
	n -= w;
	hangul = (r >= LBase && r <= LLast) || (r >= SBase && r <= SLast);
	while(n > 0){
		if(fullrune(p, n) == 0)
			return s;
		w = chartorune(&peek, p);
		if(hangul){
			//a Hangul context continues over vowel and trailing consonant jamo
			if(!((peek >= VBase && peek <= VLast) || (peek > TBase && peek <= TLast)))
				return p;
		} else if(ccclkup(peek) == 0)
			return p;
		p += w;
		n -= w;
	}
	//ran out of input before seeing the start of the next context
	return s;
}
//Determine whether the first n runes of r hold a complete normalization
//context.  If they do, a pointer to the rune that begins the next
//context is returned; otherwise r is returned.
//
//Fixes over the previous version:
// - the jamo loop is parenthesized so that "p < e" guards both ranges;
//   before, a trailing consonant at or beyond e was still dereferenced
//   and stepped over;
// - nothing is read when n is not positive (the result, r, is unchanged).
Rune*
fullrunenorm(Rune *r, int n)
{
	Rune *e, *p;

	if(n <= 0)
		return r;
	p = r;
	e = p + n;
	if((*p >= LBase && *p <= LLast) || (*p >= SBase && *p <= SLast)){
		p++;
		while(p < e && ((*p >= VBase && *p <= VLast) || (*p > TBase && *p <= TLast)))
			p++;
		if(p >= e)
			return r;
		return p;
	}
	for(; p + 1 < e; p++)
		if(ccclkup(p[1]) == 0)
			return p + 1;
	return r;
}
//Normalize either the rune string src into dst (when src is not nil) or
//the UTF-8 string ssrc into sdst.  At most max elements are stored,
//including the terminator.  compose selects composition after
//decomposition and reordering.  Returns the number of elements the
//whole result needs, not counting the terminator.
//
//Fixes over the previous version:
// - UTF-8 output could not use the last byte before the terminator
//   (the fit test was "<" where "<=" is needed), so results were
//   truncated although the returned size did not exceed max;
// - once a rune does not fit, no later (shorter) rune is stored either,
//   so a truncated UTF-8 result is always a prefix of the full result;
// - the combining mark loop no longer runs when the work buffer is
//   already full, which used to write past the end of _stack;
// - the jamo test is parenthesized as intended.
static int
runenorm(Rune *dst, Rune *src, char *sdst, char *ssrc, int max, int compose)
{
	Rune c, r[2], _stack[32];
	Rune *p, *stack, *sp, *tp;
	char *strp, *strstop;
	Rune *rp, *rrp;
	Rune *stop;
	Rune peek;
	int w, w2, size;
	int mode;

	//mode 1 works on runes, mode 0 on UTF-8; the unused input is an empty string
	if(src){
		mode = 1;
		p = src;
		stop = dst + (max - 1);
		strp = "";
		strstop = nil;
	} else {
		mode = 0;
		p = L"";
		stop = nil;
		strp = ssrc;
		strstop = sdst + (max - 1);
	}

	//the decomposition of the current rune grows downwards from the
	//middle of _stack, the runes that follow it grow upwards
	stack = _stack + nelem(_stack)/2;
	size = 0;
	w = w2 = 0;
	while(*strp || *p){
		if(mode)
			c = *p;
		else
			w = chartorune(&c, strp);

		//fully decompose c; second halves are stacked in reverse
		sp = stack - 1;
		tp = stack;
		_runedecomp(r, c);
		while(r[0] != 0){
			c = r[0];
			if(r[1] != 0){
				*sp-- = r[1];
				if(sp == _stack)
					break;
			}
			_runedecomp(r, c);
		}
		*sp = c;

		if(mode)
			peek = p[1];
		else
			w2 = chartorune(&peek, strp+w);

		//collect the jamo that follow a Hangul lead or syllable
		if((*sp >= LBase && *sp <= LLast) || (*sp >= SBase && *sp <= SLast)){
			while(peek != 0 && ((peek >= VBase && peek <= VLast) || (peek > TBase && peek <= TLast))){
				*tp++ = peek;
				if(mode){
					p++;
					peek = p[1];
				} else {
					strp += w;
					w = w2;
					w2 = chartorune(&peek, strp+w);
				}
				if(tp == _stack + nelem(_stack))
					break;
			}
		}

		//collect the following runes with a non-zero combining class, decomposed one step
		while(peek != 0 && tp < _stack + nelem(_stack) && ccclkup(peek) != 0){
			_runedecomp(r, peek);
			if(r[1] != 0){
				if(tp+1 >= _stack + nelem(_stack))
					break;
				*tp++ = r[0];
				*tp++ = r[1];
			} else if(r[0] != 0)
				*tp++ = r[0];
			else
				*tp++ = peek;
			if(mode){
				p++;
				peek = p[1];
			} else {
				strp += w;
				w = w2;
				w2 = chartorune(&peek, strp+w);
			}
			if(tp == _stack + nelem(_stack))
				break;
		}

		runecccsort(sp, tp - sp);

		if(compose && ccclkup(*sp) == 0){
			for(rp = sp + 1; rp < tp; rp++){
				r[0] = *sp;
				r[1] = *rp;
				c = _runerecomp(r);
				if(c != 0){
					//close the gap left by the consumed rune; the composed rune ends up at the new sp
					*sp = c;
					for(rrp = rp; rrp > sp; rrp--)
						*rrp = rrp[-1];
					sp++;
				} else while(rp + 1 < tp && ccclkup(*rp) == ccclkup(*(rp+1)))
					rp++;
			}
		}

		for(; sp < tp; sp++){
			if(mode){
				if(dst < stop)
					*dst++ = *sp;
				size++;
			} else {
				w2 = runelen(*sp);
				if(sdst+w2 <= strstop)
					sdst += runetochar(sdst, sp);
				else
					strstop = sdst;	//truncated: store nothing further
				size += w2;
			}
		}
		if(mode)
			p++;
		else
			strp += w;
	}
	if(mode)
		*dst = 0;
	else
		*sdst = 0;
	return size;
}
//Normalize the rune string src into dst with composition enabled.
int
runecomp(Rune *dst, Rune *src, int max)
{
	int need;

	need = runenorm(dst, src, nil, nil, max, 1);
	return need;
}
//Normalize the rune string src into dst without composition.
int
runedecomp(Rune *dst, Rune *src, int max)
{
	int need;

	need = runenorm(dst, src, nil, nil, max, 0);
	return need;
}
//Normalize the UTF-8 string src into dst with composition enabled.
int
utfcomp(char *dst, char *src, int max)
{
	int need;

	need = runenorm(nil, nil, dst, src, max, 1);
	return need;
}
//Normalize the UTF-8 string src into dst without composition.
int
utfdecomp(char *dst, char *src, int max)
{
	int need;

	need = runenorm(nil, nil, dst, src, max, 0);
	return need;
}

View file

@ -0,0 +1,22 @@
#include <u.h>
#include <libc.h>
#include "runetotypedata"
/*
 * toupperrune: return c adjusted by the value upperlkup yields for
 * it.  upperlkup comes from the included runetotypedata; its result
 * is used as an offset added to c.
 */
Rune
toupperrune(Rune c)
{
	c += upperlkup(c);
	return c;
}
/*
 * tolowerrune: return c adjusted by the value lowerlkup yields for
 * it.  lowerlkup comes from the included runetotypedata; its result
 * is used as an offset added to c.
 */
Rune
tolowerrune(Rune c)
{
	c += lowerlkup(c);
	return c;
}
/*
 * totitlerune: return c adjusted by the value titlelkup yields for
 * it.  titlelkup comes from the included runetotypedata; its result
 * is used as an offset added to c.
 */
Rune
totitlerune(Rune c)
{
	c += titlelkup(c);
	return c;
}

View file

@ -3,6 +3,14 @@
TEST=\
date\
pow\
runebreak\
runenorm\
strchr\
</sys/src/cmd/mktest
# Any file under /lib/ucd is produced by running mk for that
# file name ($stem) inside /lib/ucd.
/lib/ucd/%:
cd /lib/ucd && mk $stem
# The runebreak and runenorm tests depend on the Unicode data
# files listed here, so those are made before the tests.
runebreak.test: /lib/ucd/GraphemeBreakTest.txt /lib/ucd/WordBreakTest.txt
runenorm.test: /lib/ucd/NormalizationTest.txt

View file

@ -0,0 +1,112 @@
#include <u.h>
#include <libc.h>
#include <bio.h>
/*
 * Convert the hexadecimal digits at the start of s into a code
 * point.  Calls sysfatal when strtoul could not consume any
 * character of s.
 */
static int
estrtoul(char *s)
{
	char *end;
	Rune value;

	value = strtoul(s, &end, 16);
	if(end != s)
		return value;
	sysfatal("bad code point hex string");
	return value;
}
/*
 * Apply the Rune based break finder fn to r and the UTF-8 based
 * finder fn2 to the same text formatted as UTF-8, and print a
 * diagnostic when the segments the two found differ.
 *
 * r:   zero terminated Rune string to examine.
 * fn:  returns a pointer into its argument marking the end of the
 *      first segment.
 * fn2: the same, for a UTF-8 string.
 *
 * Returns the pointer produced by fn(r).
 */
static Rune*
check(Rune *r, Rune* (*fn)(Rune*), char* (*fn2)(char*))
{
	Rune *r2, *tmp;
	char *p, *p2;
	long n;

	p = smprint("%S", r);
	if(p == nil)
		sysfatal("smprint: %r");
	r2 = fn(r);
	p2 = fn2(p);
	/* the UTF-8 segment [p, p2) converted back into Runes */
	tmp = runesmprint("%.*s", (int)(p2-p), p);
	if(tmp == nil)
		sysfatal("runesmprint: %r");
	n = r2 - r;
	/*
	 * The segments agree only when they have the same number of
	 * Runes and the same contents.  The length is tested first so
	 * memcmp never looks past the end of tmp; memcmp counts
	 * bytes, hence the scaling by sizeof(Rune).
	 */
	if(runestrlen(tmp) != n || memcmp(r, tmp, n*sizeof(Rune)) != 0)
		print("utf mismstach\n");
	free(p);
	free(tmp);
	return r2;
}
/*
 * Read a break test file and check fn (Rune strings) and fn2
 * (UTF-8 strings) against every line of it.
 *
 * Empty lines and lines starting with '#' are ignored, and anything
 * from the first '#' on is cut off.  The rest is split on spaces:
 * tokens starting with ÷ or × are collected in ops, every other
 * token is parsed as a hexadecimal code point and collected in
 * stack.  Mismatches are reported with print; nothing is returned.
 */
static void
run(char *file, Rune* (*fn)(Rune*), char* (*fn2)(char*))
{
	Biobuf *b;
	char *p, *dot;
	char *pieces[16];
	int i, j, n;
	/* one spare slot each: a terminating 0 follows every append */
	Rune stack[nelem(pieces)+1], ops[nelem(pieces)+1];
	int nstack, nops;
	Rune r, *rp, *rp2;
	char *line;

	b = Bopen(file, OREAD);
	if(b == nil)
		sysfatal("could not open %s: %r", file);
	for(;(p = Brdline(b, '\n')) != nil; free(line)){
		p[Blinelen(b)-1] = 0;
		/* untouched copy for diagnostics; p is cut up below */
		line = strdup(p);
		if(p[0] == 0 || p[0] == '#')
			continue;
		if((dot = strstr(p, "#")) != nil)
			*dot = 0;
		n = getfields(p, pieces, nelem(pieces), 0, " ");
		nstack = nops = 0;
		/* both lists start out empty and terminated */
		stack[0] = ops[0] = 0;
		for(i = 0; i < n; i++){
			chartorune(&r, pieces[i]);
			if(r != L'÷' && r != L'×'){
				r = estrtoul(pieces[i]);
				stack[nstack++] = r;
				stack[nstack] = 0;
			} else {
				ops[nops++] = r;
				ops[nops] = 0;
			}
		}
		/*
		 * ops[0] and the last operator are not examined;
		 * ops[i] is looked at while rp points at the code
		 * point collected just before it.
		 */
		rp = stack;
		for(i = 1; i < nops-1;){
			rp2 = check(rp, fn, fn2);
			switch(ops[i]){
			case L'÷':
				/* the segment must end after exactly one Rune */
				if(rp2 != rp+1){
					print("break fail %X %X || %s\n", rp[0], rp[1], line);
					goto Break;
				}
				rp++;
				i++;
				break;
			case L'×':
				/* fn made no progress: report what is left and give up */
				if(rp2 - rp == 0){
					for(j = i; j < nops - 1; j++)
						if(ops[j] != L'×')
							print("skipped %d %d %s\n", i, nops, line);
					goto Break;
				}
				/* every operator inside the segment has to be × */
				for(; rp < (rp2-1); rp++, i++){
					if(ops[i] != L'×')
						print("skipped %d %d %s\n", i, nops, line);
				}
				/*
				 * NOTE(review): the operator at the end of
				 * the segment is stepped over without being
				 * compared against ÷ - confirm intended.
				 */
				rp = rp2;
				i++;
				break;
			}
		}
Break:
		;
	}
	/* release the buffer and its file before returning */
	Bterm(b);
}
/*
 * Check the grapheme and word break routines, in both their Rune
 * and UTF-8 forms, against the test data files under /lib/ucd.
 * The arguments are not used.
 */
void
main(int, char**)
{
	run("/lib/ucd/GraphemeBreakTest.txt", runegbreak, utfgbreak);
	run("/lib/ucd/WordBreakTest.txt", runewbreak, utfwbreak);
	exits(nil);
}

View file

@ -0,0 +1,92 @@
#include <u.h>
#include <libc.h>
#include <bio.h>
/*
 * Parse the leading hexadecimal digits of s as a code point.
 * Calls sysfatal when strtoul consumed nothing from s.
 */
static int
estrtoul(char *s)
{
	char *rest;
	Rune cp;

	cp = strtoul(s, &rest, 16);
	if(rest != s)
		return cp;
	sysfatal("bad code point hex string");
	return cp;
}
/*
 * Parse a space separated list of hexadecimal code points from
 * field into dst, which has room for ndst Runes.  The result is
 * always zero terminated; running out of room is fatal.  field is
 * modified by getfields.  Returns the number of code points stored.
 */
static int
parserunes(char *field, Rune *dst, int ndst)
{
	char *tok[32];
	int i, n;

	n = getfields(field, tok, nelem(tok), 0, " ");
	/* dst[n] receives the terminator, so n itself must fit */
	if(n >= ndst)
		sysfatal("too many code points in test field");
	for(i = 0; i < n; i++)
		dst[i] = estrtoul(tok[i]);
	dst[n] = 0;
	return n;
}

/*
 * Check runecomp, runedecomp, utfcomp and utfdecomp against
 * /lib/ucd/NormalizationTest.txt.  For every data line the first
 * three ';' separated columns are taken as the source string, the
 * expected composed form and the expected decomposed form.
 * Mismatches are printed together with a bit mask:
 *	1<<0 runecomp, 1<<1 runedecomp, 1<<2 utfcomp, 1<<3 utfdecomp.
 * The returned lengths are verified with assert.  The arguments are
 * not used.
 */
void
main(int, char**)
{
	Rune buffer1[64];
	Rune buffer2[64];
	char utfbuff1[128];
	char utfbuff2[128];
	char srctmp[128], tmp1[128], tmp2[128];
	char *fields[10];
	char *p;
	int n, n2;
	int i;
	uint fail;
	Biobuf *b;
	struct {
		Rune src[32];
		Rune nfc[32];
		Rune nfd[32];
	} test;

	b = Bopen("/lib/ucd/NormalizationTest.txt", OREAD);
	if(b == nil)
		sysfatal("could not open /lib/ucd/NormalizationTest.txt: %r");
	while((p = Brdline(b, '\n')) != nil){
		p[Blinelen(b)-1] = 0;
		/* skip blank lines, comments and '@' part headers */
		if(p[0] == 0 || p[0] == '#' || p[0] == '@')
			continue;
		/* fields[0..2] are read below, so all three must exist */
		if(getfields(p, fields, 6 + 1, 0, ";") < 3)
			sysfatal("malformed test line");
		parserunes(fields[0], test.src, nelem(test.src));
		parserunes(fields[1], test.nfc, nelem(test.nfc));
		/* i is kept only because the failure reports print it */
		i = parserunes(fields[2], test.nfd, nelem(test.nfd));

		/* Rune interface */
		n = runecomp(buffer1, test.src, nelem(buffer1));
		n2 = runedecomp(buffer2, test.src, nelem(buffer2));
		fail = 0;
		if(runestrcmp(buffer1, test.nfc) != 0)
			fail |= 1<<0;
		if(runestrcmp(buffer2, test.nfd) != 0)
			fail |= 1<<1;
		if(fail)
			print("%d %d %S %S %S %S %S\n", fail, i, test.src, test.nfd, test.nfc, buffer2, buffer1);
		assert(n == runestrlen(test.nfc));
		assert(n2 == runestrlen(test.nfd));

		/* UTF-8 interface, fed the same data formatted as UTF-8 */
		snprint(srctmp, sizeof srctmp, "%S", test.src);
		snprint(tmp1, sizeof tmp1, "%S", test.nfc);
		snprint(tmp2, sizeof tmp2, "%S", test.nfd);
		n = utfcomp(utfbuff1, srctmp, nelem(utfbuff1));
		n2 = utfdecomp(utfbuff2, srctmp, nelem(utfbuff2));
		if(strcmp(utfbuff1, tmp1) != 0)
			fail |= 1<<2;
		if(strcmp(utfbuff2, tmp2) != 0)
			fail |= 1<<3;
		if(fail)
			print("%d %d %s %s %s %s %s\n", fail, i, srctmp, tmp2, tmp1, utfbuff2, utfbuff1);
		assert(n == strlen(tmp1));
		assert(n2 == strlen(tmp2));
	}
	Bterm(b);
	exits(nil);
}