mirror of
git://git.9front.org/plan9front/plan9front
synced 2025-01-12 11:10:06 +00:00
runecomp(2)
This commit is contained in:
parent
2163aebcb8
commit
04759ec9af
13 changed files with 1894 additions and 2 deletions
85
lib/ucd/mkfile
Normal file
85
lib/ucd/mkfile
Normal file
|
@ -0,0 +1,85 @@
|
|||
</$objtype/mkfile

# Fetches the Unicode Character Database files for $VERSION into this
# directory.  The default target only fetches UnicodeData.txt.
VERSION='15.0.0'
URL='https://www.unicode.org/Public/'$VERSION'/ucd/'

TXT=\
	ArabicShaping.txt\
	BidiBrackets.txt\
	BidiMirroring.txt\
	BidiTest.txt\
	Blocks.txt\
	CJKRadicals.txt\
	CaseFolding.txt\
	CompositionExclusions.txt\
	DerivedAge.txt\
	DerivedCoreProperties.txt\
	DerivedNormalizationProps.txt\
	EastAsianWidth.txt\
	EmojiSources.txt\
	EquivalentUnifiedIdeograph.txt\
	HangulSyllableType.txt\
	Index.txt\
	IndicPositionalCategory.txt\
	IndicSyllabicCategory.txt\
	Jamo.txt\
	LineBreak.txt\
	NameAliases.txt\
	NamedSequences.txt\
	NamedSequencesProv.txt\
	NamesList.txt\
	NormalizationCorrections.txt\
	NushuSources.txt\
	PropList.txt\
	PropertyAliases.txt\
	PropertyValueAliases.txt\
	ScriptExtensions.txt\
	Scripts.txt\
	SpecialCasing.txt\
	StandardizedVariants.txt\
	TangutSources.txt\
	USourceData.txt\
	UnicodeData.txt\
	VerticalOrientation.txt\

TEST=\
	NormalizationTest.txt\
	BidiCharacterTest.txt\

PDF=\
	USourceGlyphs.pdf\
	USourceRSChart.pdf\

# files that live under auxiliary/ on the server
AUX=\
	WordBreakProperty.txt\
	GraphemeBreakProperty.txt\

ucd:V: UnicodeData.txt

# The D attribute makes mk delete the target when the recipe fails,
# so a failed hget does not leave an empty file that looks up to date.
%.txt:D:
	hget $URL^$target > $target >[2]/dev/null
%.pdf:D:
	hget $URL^$target > $target

emoji-data.txt:D:
	hget $URL^emoji/^$target > $target

WordBreakProperty.txt:D:
	hget $URL^'auxiliary/'^$target > $target

GraphemeBreakProperty.txt:D:
	hget $URL^'auxiliary/'^$target > $target

WordBreakTest.txt:D:
	hget $URL^'auxiliary/'^$target > $target

GraphemeBreakTest.txt:D:
	hget $URL^'auxiliary/'^$target > $target

txt:V: $TXT

pdf:V: $PDF

test:V: $TEST

aux:V: $AUX

all:V: $TXT $PDF $TEST $AUX
|
|
@ -77,6 +77,18 @@ extern Rune* runestrrchr(Rune*, Rune);
|
|||
extern long runestrlen(Rune*);
|
||||
extern Rune* runestrstr(Rune*, Rune*);
|
||||
|
||||
extern int runecomp(Rune*, Rune*, int);
|
||||
extern int runedecomp(Rune*, Rune*, int);
|
||||
extern int utfcomp(char*, char*, int);
|
||||
extern int utfdecomp(char*, char*, int);
|
||||
extern char* fullutfnorm(char*,int);
|
||||
extern Rune* fullrunenorm(Rune*,int);
|
||||
|
||||
extern Rune* runewbreak(Rune*);
|
||||
extern char* utfwbreak(char*);
|
||||
extern Rune* runegbreak(Rune*);
|
||||
extern char* utfgbreak(char*);
|
||||
|
||||
extern Rune tolowerrune(Rune);
|
||||
extern Rune totitlerune(Rune);
|
||||
extern Rune toupperrune(Rune);
|
||||
|
|
|
@ -48,7 +48,11 @@ The names are self-explanatory.
|
|||
.PP
|
||||
The case-conversion routines return the character unchanged if it has no case.
|
||||
.SH SOURCE
|
||||
.B /sys/src/libc/port/runetype.c
|
||||
.B /sys/src/libc/port/mkrunetype.c
|
||||
.br
|
||||
.B /sys/src/libc/port/runeistype.c
|
||||
.br
|
||||
.B /sys/src/libc/port/runetotype.c
|
||||
.SH "SEE ALSO
|
||||
.IR ctype (2) ,
|
||||
.IR "The Unicode Standard" .
|
||||
|
|
116
sys/man/2/runecomp
Normal file
116
sys/man/2/runecomp
Normal file
|
@ -0,0 +1,116 @@
|
|||
.TH RUNECOMP 2
|
||||
.SH NAME
|
||||
runecomp, runedecomp, fullrunenorm, runegbreak, runewbreak, utfcomp, utfdecomp, fullutfnorm, utfgbreak, utfwbreak \- multi-rune graphemes
|
||||
.SH SYNOPSIS
|
||||
.ta \w'\fLchar*xx'u
|
||||
.B #include <u.h>
|
||||
.br
|
||||
.B #include <libc.h>
|
||||
.PP
|
||||
.B
|
||||
int runecomp(Rune *dst, Rune *src, int max)
|
||||
.PP
|
||||
.B
|
||||
int runedecomp(Rune *dst, Rune *src, int max)
|
||||
.PP
|
||||
.B
|
||||
Rune* fullrunenorm(Rune *s, int n)
|
||||
.PP
|
||||
.B
|
||||
Rune* runegbreak(Rune *s)
|
||||
.PP
|
||||
.B
|
||||
Rune* runewbreak(Rune *s)
|
||||
.PP
|
||||
.B
|
||||
int utfcomp(char *dst, char *src, int max)
|
||||
.PP
|
||||
.B
|
||||
int utfdecomp(char *dst, char *src, int max)
|
||||
.PP
|
||||
.B
|
||||
char* fullutfnorm(char *s, int n)
|
||||
.PP
|
||||
.B
|
||||
char* utfgbreak(char *s)
|
||||
.PP
|
||||
.B
|
||||
char* utfwbreak(char *s)
|
||||
.SH DESCRIPTION
|
||||
These routines help in handling
|
||||
graphemes that may span multiple runes.
|
||||
.PP
|
||||
.IR Runecomp ,
|
||||
.IR runedecomp ,
|
||||
.IR utfcomp ,
|
||||
and
|
||||
.I utfdecomp
|
||||
perform Unicode® normalization on
|
||||
.IR src ,
|
||||
storing the result in
|
||||
.IR dst .
|
||||
No more than
|
||||
.I max
|
||||
elements will be written, and the resulting string
|
||||
will always be null terminated. The return value
|
||||
is always the total number of elements required to
|
||||
store the transformation. If this value is larger
|
||||
than the supplied
|
||||
.I max
|
||||
the caller can assume the result has been truncated.
|
||||
.I Runecomp
|
||||
and
|
||||
.I utfcomp
|
||||
perform NFC normalization while
|
||||
.I runedecomp
|
||||
and
|
||||
.I utfdecomp
|
||||
perform NFD normalization.
|
||||
.PP
|
||||
.I Fullrunenorm
|
||||
and
|
||||
.I fullutfnorm
|
||||
determine if enough elements are present in
|
||||
.I s
|
||||
to perform normalization. If enough are present,
|
||||
a pointer is returned to the first element that begins
|
||||
the next context. Otherwise
|
||||
.I s
|
||||
is returned. No more than
|
||||
.I n
|
||||
elements will be read. In order to find the boundary, the
|
||||
first element of the next context must be peeked.
|
||||
.PP
|
||||
.I Runegbreak
|
||||
and
|
||||
.I utfgbreak
|
||||
search
|
||||
.I s
|
||||
for the next grapheme break opportunity.
|
||||
If none is found before the end of the string,
|
||||
.I s
|
||||
is returned.
|
||||
.PP
|
||||
.I Runewbreak
|
||||
and
|
||||
.I utfwbreak
|
||||
search
|
||||
.I s
|
||||
for the next word break opportunity.
|
||||
If none is found before the end of the string,
|
||||
.I s
|
||||
is returned.
|
||||
.SH SOURCE
|
||||
.B /sys/src/libc/port/mkrunetype.c
|
||||
.br
|
||||
.B /sys/src/libc/port/runenorm.c
|
||||
.br
|
||||
.B /sys/src/libc/port/runebreak.c
|
||||
.SH SEE ALSO
|
||||
Unicode® Standard Annex #15
|
||||
.br
|
||||
Unicode® Standard Annex #29
|
||||
.br
|
||||
.IR rune (2),
|
||||
.IR utf (6),
|
||||
.IR tcs (1)
|
|
@ -62,6 +62,9 @@ CFILES=\
|
|||
rand.c\
|
||||
readn.c\
|
||||
rune.c\
|
||||
runebreak.c\
|
||||
runeistype.c\
|
||||
runenorm.c\
|
||||
runestrcat.c\
|
||||
runestrchr.c\
|
||||
runestrcmp.c\
|
||||
|
@ -74,7 +77,7 @@ CFILES=\
|
|||
runestrrchr.c\
|
||||
runestrlen.c\
|
||||
runestrstr.c\
|
||||
runetype.c\
|
||||
runetotype.c\
|
||||
sin.c\
|
||||
sinh.c\
|
||||
sqrt.c\
|
||||
|
@ -127,3 +130,26 @@ UPDATE=mkfile\
|
|||
</sys/src/cmd/mksyslib
|
||||
|
||||
profile.$O: /sys/include/tos.h
|
||||
|
||||
runenorm.$O: runenormdata runenorm.c
|
||||
runetotype.$O: runetotypedata runetotype.c
|
||||
runeistype.$O: runeistypedata runeistype.c
|
||||
runebreak.$O: runebreakdata runebreak.c
|
||||
|
||||
UCD=\
|
||||
/lib/ucd/WordBreakProperty.txt\
|
||||
/lib/ucd/GraphemeBreakProperty.txt\
|
||||
/lib/ucd/emoji-data.txt\
|
||||
/lib/ucd/CompositionExclusions.txt\
|
||||
/lib/ucd/UnicodeData.txt\
|
||||
|
||||
/lib/ucd/%:
|
||||
cd /lib/ucd && mk $stem
|
||||
|
||||
runenormdata runetotypedata runeistypedata runebreakdata: mkrunetype.c $UCD
|
||||
@{
|
||||
eval `{grep '^[A-Z]' /$cputype/mkfile}
|
||||
$CC $CFLAGS -o mkrunetype.$O mkrunetype.c
|
||||
$LD $LDFLAGS -o $O.mkrunetype mkrunetype.$O
|
||||
$O.mkrunetype
|
||||
}
|
||||
|
|
748
sys/src/libc/port/mkrunetype.c
Normal file
748
sys/src/libc/port/mkrunetype.c
Normal file
|
@ -0,0 +1,748 @@
|
|||
#include <u.h>
|
||||
#include <libc.h>
|
||||
#include <bio.h>
|
||||
|
||||
/* number of slots in every per-code-point table: one per 21-bit value */
enum{
	NRUNES = 1<<21
};

typedef struct Lvl Lvl;
typedef struct Param Param;

/* one level of the three-level lookup table */
struct Lvl{
	int bits;	/* code point bits consumed by this level */
	int max;	/* 1<<bits, see derive() */
	int mask;	/* max-1, see derive() */
};

/* how the 21 code point bits are split between the three levels */
struct Param{
	Lvl idx1;
	Lvl idx2;
	Lvl data;

	int round1max;	/* NRUNES/data.max, see param() */
};
|
||||
|
||||
static void
|
||||
derive(Lvl *l)
|
||||
{
|
||||
l->max = 1 << l->bits;
|
||||
l->mask = l->max - 1;
|
||||
}
|
||||
|
||||
/*
 * Set up p so that the first index consumes idx1 bits, the second
 * idx2 bits and the data level whatever remains of the 21 bits.
 */
static void
param(Param *p, int idx1, int idx2)
{
	assert(idx1 + idx2 < 21);

	p->idx1.bits = idx1;
	derive(&p->idx1);

	p->idx2.bits = idx2;
	derive(&p->idx2);

	p->data.bits = 21 - idx1 - idx2;
	derive(&p->data);

	p->round1max = NRUNES/p->data.max;
}
|
||||
|
||||
static int
|
||||
lkup(Param *p, int *idx1, int *idx2, int *data, int x)
|
||||
{
|
||||
int y, z;
|
||||
|
||||
y = (((x)>>(p->data.bits+p->idx2.bits))&p->idx1.mask);
|
||||
z = (((x)>>p->data.bits)&p->idx2.mask);
|
||||
return data[idx2[idx1[y] + z] + (x&p->data.mask)];
|
||||
}
|
||||
|
||||
/*
 * Emit d[0..len-1] to fd as a static C array called name, using the
 * narrowest element type that holds every value.  With fd < 0 nothing
 * is written.  Returns the size in bytes the array will occupy.
 */
static int
mkarrvar(int fd, char *name, int *d, int len)
{
	int i, sz;
	int max, min;
	char *t;

	max = min = 0;
	for(i = 0; i < len; i++){
		if(d[i] > max)
			max = d[i];
		if(d[i] < min)
			min = d[i];
	}
	if(min == 0){
		if(max < 0xFF)
			t = "uchar", sz = 1;
		else if(max < 0xFFFF)
			t = "ushort", sz = 2;
		else
			t = "uint", sz = 4;
	} else {
		/* signed: both ends of the range have to fit */
		if(min >= -(1<<7) && max < 1<<7)
			t = "char", sz = 1;
		else if(min >= -(1<<15) && max < 1<<15)
			t = "short", sz = 2;
		else
			t = "int", sz = 4;
	}
	if(fd < 0)
		return sz * len;

	fprint(fd, "static\n%s\t%s[%d] =\n{\n\t", t, name, len);
	for(i = 0; i < len; i++){
		fprint(fd, "%d,", d[i]);
		if((i+1) % 16 == 0)
			fprint(fd, "\n\t");
	}
	fprint(fd, "\n};\n");

	return sz * len;
}
|
||||
|
||||
/*
 * Emit the n triples in d as a static Rune table called name.
 * With all set, rows are {d0, d1, d2} and triples whose first
 * element is 0 are left out; otherwise rows are {d1, d2}.
 */
static int
mkexceptarr(int fd, char *name, int *d, int n, int all)
{
	int k, *e;

	fprint(fd, "static\nRune %s[][%d] =\n{\n\t", name, all ? 3 : 2);
	for(k = 0, e = d; k < n; k++, e += 3){
		if(!all)
			fprint(fd, "{0x%X, 0x%X},", e[1], e[2]);
		else if(e[0] != 0)
			fprint(fd, "{0x%X, 0x%X, 0x%X},", e[0], e[1], e[2]);
		/* line break after every eighth triple */
		if((k+1) % 8 == 0)
			fprint(fd, "\n\t");
	}
	fprint(fd, "\n};\n");
	return n * sizeof(Rune) * 2;
}
|
||||
|
||||
/*
 * Copy nidx chunks of chunksize ints from src into data, letting the
 * tail of each chunk overlap the head of the chunk that follows it
 * when the two are identical.  idx[i] receives the offset in data at
 * which chunk i starts; idx[nidx] is also written.  Returns the
 * number of ints stored in data.
 */
static int
compact(int *data, int *idx, int nidx, int *src, int chunksize)
{
	int c, len, keep, overlap, ndata;
	int *cur, *next;

	ndata = 0;
	idx[0] = 0;
	for(c = 0; c < nidx; c++){
		cur = src + c*chunksize;
		next = cur + chunksize;

		/* longest tail of cur equal to the head of next; the last chunk has no next */
		overlap = 0;
		if(c != nidx-1){
			for(len = 1; len <= chunksize; len++)
				if(memcmp(next - len, next, len * sizeof data[0]) == 0)
					overlap = len;
		}

		keep = chunksize - overlap;
		memmove(data + ndata, cur, keep * sizeof data[0]);
		ndata += keep;
		idx[c+1] = idx[c] + keep;
	}
	return ndata;
}
|
||||
|
||||
|
||||
/* emit one array of the lookup table as _<label><suffix> */
static int
mklabelarr(int fd, char *label, char *suffix, int *d, int len)
{
	char *name;
	int size;

	name = smprint("_%s%s", label, suffix);
	if(name == nil)
		sysfatal("smprint: %r");
	size = mkarrvar(fd, name, d, len);
	free(name);
	return size;
}

/*
 * Build the three-level lookup table for map using the split in p.
 * With fd >= 0 the table is verified against map and written to fd
 * together with the <label>lkup() access macros; with fd < 0 only
 * the size is computed.  Returns the table size in bytes.
 */
static int
mklkup(int fd, char *label, int *map, Param *p)
{
	static int data[NRUNES];
	static int idx2[NRUNES];
	static int idx2dest[NRUNES];
	static int idx1[NRUNES];
	int i, v, nidx2, ndata;
	int size;

	ndata = compact(data, idx2, p->round1max, map, p->data.max);
	nidx2 = compact(idx2dest, idx1, p->idx1.max, idx2, p->idx2.max);

	if(fd >= 0){
		for(i = 0; i < NRUNES; i++){
			v = lkup(p, idx1, idx2dest, data, i);
			if(map[i] != v)
				sysfatal("mismatch in %s at %d %d %d\n", label, i, map[i], v);
		}
	}

	size = mklabelarr(fd, label, "data", data, ndata);
	size += mklabelarr(fd, label, "idx2", idx2dest, nidx2);
	size += mklabelarr(fd, label, "idx1", idx1, p->idx1.max);
	if(fd >= 0){
		fprint(fd, "\n");
		fprint(fd, "#define %sindex1(x) (((x)>>(%d+%d))&0x%X)\n", label, p->data.bits, p->idx2.bits, p->idx1.mask);
		fprint(fd, "#define %sindex2(x) (((x)>>%d)&0x%X)\n", label, p->data.bits, p->idx2.mask);
		fprint(fd, "#define %soffset(x) ((x)&0x%X)\n", label, p->data.mask);
		fprint(fd, "#define %slkup(x) (_%sdata[_%sidx2[_%sidx1[%sindex1(x)] + %sindex2(x)] + %soffset(x)] )\n\n",
			label, label, label, label, label, label, label);
	}
	return size;
}
|
||||
|
||||
/*
 * Try every idx1/idx2 bit split in the searched range, report the
 * one giving the smallest table for map on standard error, and
 * leave p set to it.
 */
static void
mklkupmatrix(char *label, int *map, Param *p)
{
	int i1, i2, sz;
	int best1, best2, bestsz;

	best1 = best2 = bestsz = -1;
	for(i1 = 4; i1 <= 12; i1++){
		for(i2 = 4; i2 <= (19 - i1); i2++){
			param(p, i1, i2);
			sz = mklkup(-1, label, map, p);
			if(bestsz != -1 && sz >= bestsz)
				continue;
			best1 = i1;
			best2 = i2;
			bestsz = sz;
		}
	}

	assert(bestsz != -1);
	fprint(2, "label: %s best: %d %d (%d)\n", label, best1, best2, bestsz);
	param(p, best1, best2);
}
|
||||
|
||||
/* per code point tables, filled in by main() and markbreak() */
static int myismerged[NRUNES];	/* bit mask of the is* classes */
static int mytoupper[NRUNES];	/* case mappings, stored as offset from the code point */
static int mytolower[NRUNES];
static int mytotitle[NRUNES];
static int mybreak[NRUNES];	/* word break class | grapheme break class */

/* first value of the high half of mydecomp[] that refers to mydespecial[] */
enum{ DSTART = 0xEEEE };

static int mydecomp[NRUNES];	/* first<<16 | second, or (DSTART+n)<<16 */
static int mydespecial[256*3];	/* {code, first, second} where a part exceeds 16 bits */
static int nspecial;
static int myccc[NRUNES];	/* combining class field of UnicodeData.txt */

typedef struct KV KV;

/* recomposition pair; next chains colliding entries in mkrecomp() */
struct KV{
	uint key;	/* first<<16 | second */
	uint val;	/* composed code point, 0 once excluded */
	ushort next;
};

static KV myrecomp[2000];
static int nrecomp;

static int recompext[256*3];	/* {code, first, second} where a part exceeds 16 bits */
static int nrecompext;
|
||||
|
||||
static uint
|
||||
hash(uint x)
|
||||
{
|
||||
x ^= x >> 16;
|
||||
x *= 0x21f0aaad;
|
||||
x ^= x >> 15;
|
||||
x *= 0xd35a2d97;
|
||||
x ^= x >> 15;
|
||||
return x;
|
||||
}
|
||||
|
||||
/*
 * Write the recomposition hash table to fd as _recompdata: the
 * nelem(vals) primary buckets followed by the collision entries that
 * are in use.  Each entry is two uints, the key and val|next<<16,
 * where next is the 1-based index of the following collision entry
 * or 0 at the end of a chain.  _recompcoll points at the first
 * collision entry.
 */
static void
mkrecomp(int fd)
{
	int i, total;
	KV *p;
	static KV vals[512];
	static KV coll[1000];
	int over;

	for(i = 0; i < nelem(vals); i++)
		vals[i] = (KV){0, 0, 0};
	for(i = 0; i < nelem(coll); i++)
		coll[i] = (KV){0, 0, 0};

	/* over is the 1-based index of the next free collision entry */
	over = 1;
	for(i = 0; i < nrecomp; i++){
		p = vals + (hash(myrecomp[i].key) % nelem(vals));
		while(p->key != 0){
			if(p->next == 0){
				if(over > nelem(coll))
					sysfatal("recomposition collision table full");
				p->next = over;
				p = coll + over - 1;
				over++;
			} else
				p = coll + p->next - 1;
		}
		p->key = myrecomp[i].key;
		p->val = myrecomp[i].val;
	}
	fprint(2, "recomp map [%d][%d]: %d\n", nelem(vals), over-1, (nelem(vals) + over-1) * (4+2+2));

	fprint(fd, "static\nuint\t_recompdata[] =\n{\n\t");
	total = nelem(vals) + over - 1;
	for(i = 0; i < total; i++){
		if(i < nelem(vals))
			p = vals + i;
		else
			p = coll + (i - nelem(vals));
		assert(p->val < 0xFFFF);
		assert(p->next < 0xFFFF);
		fprint(fd, "%udU,%udU,", p->key, p->val | (p->next<<16));
		if((i+1) % 8 == 0)
			fprint(fd, "\n\t");
	}
	fprint(fd, "\n};\n");
	fprint(fd, "static uint *_recompcoll = _recompdata+%d*2;\n", nelem(vals));
}
|
||||
|
||||
/* create file for writing or die */
static int
ecreate(char *file, ulong perm)
{
	int fd;

	fd = create(file, OWRITE, perm);
	if(fd < 0)
		sysfatal("could not create %s: %r", file);
	return fd;
}

/* write the lookup table for map to fd and report its size on standard error */
static void
emitlkup(int fd, char *label, int *map, Param *p)
{
	int size;

	size = mklkup(fd, label, map, p);
	fprint(2, "%s: %d\n", label, size);
}

/*
 * Write the four generated data files into the current directory:
 * runetotypedata, runeistypedata, runenormdata and runebreakdata.
 */
static void
mktables(void)
{
	Param p;
	int tofd, isfd, normfd, breakfd;

	tofd = ecreate("runetotypedata", 0664);
	param(&p, 10, 7);
	emitlkup(tofd, "upper", mytoupper, &p);
	emitlkup(tofd, "lower", mytolower, &p);
	emitlkup(tofd, "title", mytotitle, &p);
	close(tofd);

	isfd = ecreate("runeistypedata", 0664);
	param(&p, 11, 6);
	emitlkup(isfd, "merged", myismerged, &p);
	/* bit assignments match the mask built in main() */
	fprint(isfd, "static\nenum {\n");
	fprint(isfd, "\tL%s = %s,\n", "space", "1<<0");
	fprint(isfd, "\tL%s = %s,\n", "alpha", "1<<1");
	fprint(isfd, "\tL%s = %s,\n", "digit", "1<<2");
	fprint(isfd, "\tL%s = %s,\n", "upper", "1<<3");
	fprint(isfd, "\tL%s = %s,\n", "lower", "1<<4");
	fprint(isfd, "\tL%s = %s,\n", "title", "1<<5");
	fprint(isfd, "};\n");
	close(isfd);

	normfd = ecreate("runenormdata", 0664);
	param(&p, 10, 7);
	emitlkup(normfd, "decomp", mydecomp, &p);
	param(&p, 9, 7);
	emitlkup(normfd, "ccc", myccc, &p);
	mkexceptarr(normfd, "_decompexceptions", mydespecial, nspecial, 0);
	mkexceptarr(normfd, "_recompexceptions", recompext, nrecompext, 1);
	mkrecomp(normfd);
	close(normfd);

	breakfd = ecreate("runebreakdata", 0644);
	param(&p, 10, 6);
	emitlkup(breakfd, "break", mybreak, &p);
	close(breakfd);
}
|
||||
|
||||
/* the semicolon separated fields of a UnicodeData.txt line, in order */
enum {
	FIELD_CODE,
	FIELD_NAME,
	FIELD_CATEGORY,
	FIELD_COMBINING,
	FIELD_BIDIR,
	FIELD_DECOMP,
	FIELD_DECIMAL_DIG,
	FIELD_DIG,
	FIELD_NUMERIC_VAL,
	FIELD_MIRRORED,
	FIELD_UNICODE_1_NAME,
	FIELD_COMMENT,
	FIELD_UPPER,
	FIELD_LOWER,
	FIELD_TITLE,
	NFIELDS,
};

/*
 * Read the next line of in and split it in place at each ';'.
 * fields must have room for NFIELDS+1 pointers; they point into the
 * line returned by Brdline.  Returns 0 when Brdline returns nil and
 * 1 otherwise; a line without exactly NFIELDS fields is fatal.
 */
static int
getunicodeline(Biobuf *in, char **fields)
{
	char *line;
	int nf;

	line = Brdline(in, '\n');
	if(line == nil)
		return 0;

	/* replace the newline with a terminator */
	line[Blinelen(in)-1] = '\0';

	nf = getfields(line, fields, NFIELDS + 1, 0, ";");
	if (nf != NFIELDS)
		sysfatal("bad number of fields");

	return 1;
}
|
||||
|
||||
/*
 * Parse the number at the start of s in the given base.
 * Text after the number is ignored; no digits at all is fatal.
 */
static int
estrtoul(char *s, int base)
{
	char *end;
	Rune r;

	r = strtoul(s, &end, base);
	if(end == s)
		sysfatal("bad code point hex string");
	return r;
}
|
||||
|
||||
/*
 * Break classes stored in mybreak[]: the word break class occupies
 * the low nibble, the grapheme break class the high nibble.
 * runebreak.c carries a copy of this enum and must agree with it.
 */
enum {
	OTHER,
	Hebrew_Letter, Newline, Extend, Format,
	Katakana, ALetter, MidLetter, MidNum,
	MidNumLet, Numeric, ExtendNumLet, WSegSpace,
	PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40,
	L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,
	EMOJIEX = 0xB0,
};

typedef struct Breakprop Breakprop;

/* substring to look for in a property line and the class it selects */
struct Breakprop{
	char *pat;
	uchar v;
};

/*
 * Patterns are tried in order and matched as substrings, so a name
 * must come before any name that is a prefix of it.
 */
static Breakprop wordprops[] = {
	{"ExtendNumLet", ExtendNumLet},
	{"Hebrew_Letter", Hebrew_Letter},
	{"Newline", Newline},
	{"Extend", Extend},
	{"Format", Format},
	{"Katakana", Katakana},
	{"ALetter", ALetter},
	{"MidLetter", MidLetter},
	{"MidNumLet", MidNumLet},	/* before MidNum, which would match it too */
	{"MidNum", MidNum},
	{"Numeric", Numeric},
	{"WSegSpace", WSegSpace},
};

static Breakprop graphemeprops[] = {
	{"; Prepend #", PREPEND},
	{"; Control #", CONTROL},
	{"; Extend #", EXTEND},
	{"; Regional_Indicator #", REGION},
	{"; SpacingMark #", SPACEMK},
	{"; L #", L},
	{"; V #", V},
	{"; T #", T},
	{"; LV #", LV},
	{"; LVT #", LVT},
};

static Breakprop emojiprops[] = {
	{"; Extended_Pictographic", EMOJIEX},
};

/*
 * Read a property file whose lines start with a code point or a
 * first..last range.  Each line is given the class of the first
 * pattern in tab found in it, or 0 if none is.  With merge set the
 * class is or'ed into mybreak[], otherwise it replaces the entry.
 * what names the file in the error message when it cannot be opened.
 */
static void
loadbreaks(char *file, char *what, Breakprop *tab, int ntab, int merge)
{
	Biobuf *b;
	char *p, *dot;
	int i, s, e;
	uchar v;

	b = Bopen(file, OREAD);
	if(b == nil)
		sysfatal("could not load %s: %r", what);

	while((p = Brdline(b, '\n')) != nil){
		p[Blinelen(b)-1] = 0;
		if(p[0] == 0 || p[0] == '#')
			continue;
		if((dot = strstr(p, "..")) != nil){
			*dot = 0;
			dot += 2;
			s = estrtoul(p, 16);
			e = estrtoul(dot, 16);
		} else {
			s = e = estrtoul(p, 16);
			dot = p;
		}
		/* mybreak[] has NRUNES entries */
		if(s < 0 || e >= NRUNES)
			sysfatal("%s: code point out of range: %x..%x", file, s, e);

		v = 0;
		for(i = 0; i < ntab; i++){
			if(strstr(dot, tab[i].pat) != nil){
				v = tab[i].v;
				break;
			}
		}
		for(i = s; i <= e; i++){
			if(merge)
				mybreak[i] |= v;
			else
				mybreak[i] = v;
		}
	}
	Bterm(b);
}

/*
 * Fill mybreak[] from the word break, grapheme break and emoji
 * property files in /lib/ucd.
 */
static void
markbreak(void)
{
	loadbreaks("/lib/ucd/WordBreakProperty.txt", "word breaks",
		wordprops, nelem(wordprops), 0);
	loadbreaks("/lib/ucd/GraphemeBreakProperty.txt", "Grapheme breaks",
		graphemeprops, nelem(graphemeprops), 1);
	loadbreaks("/lib/ucd/emoji-data.txt", "emoji-data",
		emojiprops, nelem(emojiprops), 1);
}
|
||||
|
||||
/*
 * For every code point listed in CompositionExclusions.txt clear the
 * composed value of its entry in myrecomp[] or, when it has none
 * there, the code of its entry in recompext[].
 */
static void
markexclusions(void)
{
	Biobuf *b;
	char *line;
	int k, found;
	uint r;

	b = Bopen("/lib/ucd/CompositionExclusions.txt", OREAD);
	if(b == nil)
		sysfatal("could not load composition exclusions: %r");

	while((line = Brdline(b, '\n')) != nil){
		line[Blinelen(b)-1] = 0;
		if(line[0] == 0 || line[0] == '#')
			continue;
		r = estrtoul(line, 16);

		found = 0;
		for(k = 0; k < nrecomp && !found; k++){
			if(myrecomp[k].val == r){
				myrecomp[k].val = 0;
				found = 1;
			}
		}
		if(found)
			continue;

		for(k = 0; k < nrecompext; k++){
			if(recompext[k*3] == r){
				recompext[k*3] = 0;
				break;
			}
		}
	}
	Bterm(b);
}
|
||||
|
||||
/*
 * Record a decomposition of code that does not fit the packed
 * 16:16 form; mydecomp[code] then refers to the mydespecial[] slot.
 */
static void
addspecial(int code, int d0, int d1)
{
	int *ip;

	if(nspecial >= nelem(mydespecial)/3)
		sysfatal("too many decomposition exceptions");
	ip = mydespecial + nspecial*3;
	ip[0] = code;
	ip[1] = d0;
	ip[2] = d1;
	mydecomp[code] = (DSTART+nspecial)<<16;
	nspecial++;
}

/* record a recomposition pair with a part wider than 16 bits */
static void
addrecompext(int code, int d0, int d1)
{
	int *ip;

	if(nrecompext >= nelem(recompext)/3)
		sysfatal("too many recomposition exceptions");
	ip = recompext + nrecompext*3;
	ip[0] = code;
	ip[1] = d0;
	ip[2] = d1;
	nrecompext++;
}

/* record a recomposition pair whose parts both fit in 16 bits */
static void
addrecomp(int code, int d0, int d1)
{
	if(nrecomp >= nelem(myrecomp))
		sysfatal("too many recomposition pairs");
	myrecomp[nrecomp++] = (KV){d0<<16 | d1, code, 0};
}

/*
 * Read /lib/ucd/UnicodeData.txt, build the classification, case
 * mapping, decomposition and combining class tables, then load the
 * composition exclusions and break properties and write the
 * generated data files.
 */
void
main(int, char**)
{
	static char myisspace[NRUNES];
	static char myisalpha[NRUNES];
	static char myisdigit[NRUNES];
	static char myisupper[NRUNES];
	static char myislower[NRUNES];
	static char myistitle[NRUNES];
	Biobuf *in;
	char *fields[NFIELDS + 1], *fields2[NFIELDS + 1];
	char **f, *d;
	char cat[16];
	int i, code, last;
	int decomp[2];
	uchar b;

	in = Bopen("/lib/ucd/UnicodeData.txt", OREAD);
	if(in == nil)
		sysfatal("can't open UnicodeData.txt: %r");

	for(i = 0; i < NRUNES; i++){
		mytoupper[i] = -1;
		mytolower[i] = -1;
		mytotitle[i] = -1;
		mydecomp[i] = 0;
		myccc[i] = 0;
		mybreak[i] = 0;
	}

	myisspace['\t'] = 1;
	myisspace['\n'] = 1;
	myisspace['\r'] = 1;
	myisspace['\f'] = 1;
	myisspace['\v'] = 1;
	myisspace[0x85] = 1;	/* control char, "next line" */
	myisspace[0xfeff] = 1;	/* zero-width non-break space */

	last = -1;
	nspecial = nrecomp = nrecompext = 0;
	while(getunicodeline(in, fields)){
		code = estrtoul(fields[FIELD_CODE], 16);
		if (code >= NRUNES)
			sysfatal("code-point value too big: %x", code);
		if(code <= last)
			sysfatal("bad code sequence: %x then %x", last, code);
		last = code;

		/*
		 * fields[] points into the Biobuf's buffer, which reading
		 * the second line of a range may refill.  Keep a copy of
		 * the category and take the remaining fields of a range
		 * from its last line.
		 */
		snprint(cat, sizeof cat, "%s", fields[FIELD_CATEGORY]);
		f = fields;
		if(strstr(fields[FIELD_NAME], ", First>") != nil){
			if(!getunicodeline(in, fields2))
				sysfatal("range start at eof");
			if (strstr(fields2[FIELD_NAME], ", Last>") == nil)
				sysfatal("range start not followed by range end");
			last = estrtoul(fields2[FIELD_CODE], 16);
			if(last <= code)
				sysfatal("range out of sequence: %x then %x", code, last);
			if(last >= NRUNES)
				sysfatal("code-point value too big: %x", last);
			if(strcmp(cat, fields2[FIELD_CATEGORY]) != 0)
				sysfatal("range with mismatched category");
			f = fields2;
		}

		/* only canonical decompositions, i.e. those without a <tag> */
		d = f[FIELD_DECOMP];
		if(strlen(d) > 0 && strstr(d, "<") == nil){
			decomp[0] = estrtoul(d, 16);
			d = strstr(d, " ");
			if(d == nil){
				/* singleton recompositions are verboden */
				decomp[1] = 0;
				if(decomp[0] > 0xFFFF)
					addspecial(code, decomp[0], 0);
				else
					mydecomp[code] = decomp[0]<<16;
			} else {
				d++;
				decomp[1] = estrtoul(d, 16);
				if(decomp[0] > 0xFFFF || decomp[1] > 0xFFFF){
					addspecial(code, decomp[0], decomp[1]);
					addrecompext(code, decomp[0], decomp[1]);
				} else {
					mydecomp[code] = decomp[0]<<16 | decomp[1];
					addrecomp(code, decomp[0], decomp[1]);
				}
			}
		}

		for (; code <= last; code++){
			if(cat[0] == 'L')
				myisalpha[code] = 1;
			if(cat[0] == 'Z')
				myisspace[code] = 1;

			if(strcmp(cat, "Lu") == 0)
				myisupper[code] = 1;
			if(strcmp(cat, "Ll") == 0)
				myislower[code] = 1;

			if(strcmp(cat, "Lt") == 0)
				myistitle[code] = 1;

			if(strcmp(cat, "Nd") == 0)
				myisdigit[code] = 1;

			if(f[FIELD_UPPER][0] != '\0')
				mytoupper[code] = estrtoul(f[FIELD_UPPER], 16);

			if(f[FIELD_LOWER][0] != '\0')
				mytolower[code] = estrtoul(f[FIELD_LOWER], 16);

			if(f[FIELD_TITLE][0] != '\0')
				mytotitle[code] = estrtoul(f[FIELD_TITLE], 16);

			myccc[code] = estrtoul(f[FIELD_COMBINING], 10);
		}
	}

	Bterm(in);
	markexclusions();

	/*
	 * according to standard, if totitle(x) is not defined in ucd
	 * but toupper(x) is, then totitle is defined to be toupper(x)
	 */
	for(i = 0; i < NRUNES; i++){
		if(mytotitle[i] == -1
		&& mytoupper[i] != -1
		&& !myistitle[i])
			mytotitle[i] = mytoupper[i];
	}

	/*
	 * A couple corrections:
	 * is*(to*(x)) should be true.
	 * restore undefined transformations.
	 * store offset instead of value, makes them sparse.
	 */
	for(i = 0; i < NRUNES; i++){
		if(mytoupper[i] != -1)
			myisupper[mytoupper[i]] = 1;
		else
			mytoupper[i] = i;

		if(mytolower[i] != -1)
			myislower[mytolower[i]] = 1;
		else
			mytolower[i] = i;

		if(mytotitle[i] != -1)
			myistitle[mytotitle[i]] = 1;
		else
			mytotitle[i] = i;

		mytoupper[i] = mytoupper[i] - i;
		mytolower[i] = mytolower[i] - i;
		mytotitle[i] = mytotitle[i] - i;
	}

	/* bit positions match the enum mktables() writes to runeistypedata */
	for(i = 0; i < NRUNES; i++){
		b = 0;
		if(myisspace[i])
			b |= 1<<0;
		if(myisalpha[i])
			b |= 1<<1;
		if(myisdigit[i])
			b |= 1<<2;
		if(myisupper[i])
			b |= 1<<3;
		if(myislower[i])
			b |= 1<<4;
		if(myistitle[i])
			b |= 1<<5;

		myismerged[i] = b;
	}

	markbreak();
	mktables();
	exits(nil);
}
|
293
sys/src/libc/port/runebreak.c
Normal file
293
sys/src/libc/port/runebreak.c
Normal file
|
@ -0,0 +1,293 @@
|
|||
#include <u.h>
|
||||
#include <libc.h>
|
||||
|
||||
#include "runebreakdata"
|
||||
|
||||
/*
 * Break classes as stored by mkrunetype.c in the generated table:
 * the word break class occupies the low nibble of an entry, the
 * grapheme break class the high nibble.
 */
enum {
	OTHER,
	Hebrew_Letter, Newline, Extend, Format,
	Katakana, ALetter, MidLetter, MidNum,
	MidNumLet, Numeric, ExtendNumLet, WSegSpace,
	PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40,
	L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,
	EMOJIEX = 0xB0,

	ZWJ = 0x200DU,
	LINETAB = 0xB,
};

/* does table entry x have word break class y? */
#define IS(x, y) ((((x))&0xf) == (y))
/* does table entry x have grapheme break class y? */
#define ISG(x, y) ((((x))&0xf0) == (y))
|
||||
|
||||
/*
 * Examine the first two runes of s and decide whether a grapheme
 * break may fall between them.  If it may, a pointer to the second
 * rune is returned.  If not, the result points past the runes that
 * were joined, or is s when the string ends there.  s is also
 * returned when fewer than two runes are present.
 */
Rune*
runegbreak(Rune *s)
{
	Rune l, r, *p;
	uchar lt, rt;
	int lg, rg, join;

	l = s[0];
	if(l == 0)
		return s;
	p = s + 1;
	r = *p;
	if(r == 0)
		return s;
	lt = breaklkup(l);
	rt = breaklkup(r);
	lg = lt & 0xf0;
	rg = rt & 0xf0;

	join = 0;
	if(l == '\r' && r == '\n')
		join = 1;
	else if(lg == CONTROL || l == '\r' || l == '\n')
		return p;
	else if(rg == CONTROL || r == '\r' || r == '\n')
		return p;
	else if(lg == L && (rg == L || rg == V || rg == LV || rg == LVT))
		join = 1;
	else if((lg == LV || lg == V) && (rg == V || rg == T))
		join = 1;
	else if((lg == LVT || lg == T) && rg == T)
		join = 1;
	else if(rg == SPACEMK || lg == PREPEND)
		join = 1;
	else if(lg == EMOJIEX && (rg == EXTEND || r == ZWJ)){
		/* pictograph, any number of extenders, ZWJ, pictograph */
		while(rg == EXTEND){
			r = *++p;
			if(r == 0)
				return s;
			rt = breaklkup(r);
			rg = rt & 0xf0;
		}
		if(r != ZWJ)
			return p;
		r = *++p;
		if(r == 0)
			return s;
		rt = breaklkup(r);
		if((rt & 0xf0) != EMOJIEX)
			return p;
		join = 1;
	} else if(rg == EXTEND || r == ZWJ)
		join = 1;
	else if(lg == REGION && rg == REGION)
		join = 1;

	if(!join)
		return p;
	if(p[1] == 0)
		return s;
	return p + 1;
}
|
||||
|
||||
/*
 * UTF-8 counterpart of runegbreak: the same decisions are made on
 * the first two runes decoded from s, and the result points into s.
 */
char*
utfgbreak(char *s)
{
	Rune l, r;
	uchar lt, rt;
	char *p;
	int lg, rg, join;

	p = s + chartorune(&l, s);
	if(l == 0)
		return s;
	chartorune(&r, p);
	if(r == 0)
		return s;
	lt = breaklkup(l);
	rt = breaklkup(r);
	lg = lt & 0xf0;
	rg = rt & 0xf0;

	join = 0;
	if(l == '\r' && r == '\n')
		join = 1;
	else if(lg == CONTROL || l == '\r' || l == '\n')
		return p;
	else if(rg == CONTROL || r == '\r' || r == '\n')
		return p;
	else if(lg == L && (rg == L || rg == V || rg == LV || rg == LVT))
		join = 1;
	else if((lg == LV || lg == V) && (rg == V || rg == T))
		join = 1;
	else if((lg == LVT || lg == T) && rg == T)
		join = 1;
	else if(rg == SPACEMK || lg == PREPEND)
		join = 1;
	else if(lg == EMOJIEX && (rg == EXTEND || r == ZWJ)){
		/* pictograph, any number of extenders, ZWJ, pictograph */
		while(rg == EXTEND){
			/* step over the current rune and decode the one after it */
			p += chartorune(&r, p);
			chartorune(&r, p);
			if(r == 0)
				return s;
			rt = breaklkup(r);
			rg = rt & 0xf0;
		}
		if(r != ZWJ)
			return p;

		p += chartorune(&r, p);
		chartorune(&r, p);
		if(r == 0)
			return s;
		rt = breaklkup(r);
		if((rt & 0xf0) != EMOJIEX)
			return p;
		join = 1;
	} else if(rg == EXTEND || r == ZWJ)
		join = 1;
	else if(lg == REGION && rg == REGION)
		join = 1;

	if(!join)
		return p;

	p += chartorune(&r, p);
	chartorune(&r, p);
	if(r == 0)
		return s;
	return p;
}
|
||||
|
||||
/* AH(x): x is of class ALetter or Hebrew_Letter. */
#define AH(x) (IS(x, ALetter) || IS(x, Hebrew_Letter))
/*
 * MNLQ(x): x is of class MidNumLet, or x equals the apostrophe character.
 * The argument is parenthesized so that an expression may be passed safely.
 * NOTE(review): the second operand compares x with a character literal, so
 * passing a class byte rather than the rune itself looks suspect -- confirm.
 */
#define MNLQ(x) (IS(x, MidNumLet) || (x) == '\'')
|
||||
|
||||
/*
 * runewbreak examines the first two runes of the NUL-terminated rune
 * string s and decides whether a word boundary falls between them
 * (the checks mirror the WB rules of UAX #29).
 *
 * Returns:
 *	s	if the string ends before a decision can be reported
 *		(fewer than two runes, or nothing follows the joined runes);
 *	a pointer to the second rune if there is a break between the two;
 *	a pointer just past the joined runes otherwise.
 */
Rune*
runewbreak(Rune *s)
{
	Rune l, r;
	uchar lt, rt;
	Rune *p;

	p = s;
	if((l = *p++) == 0)
		return s;
	if((r = *p) == 0)
		return s;
	lt = breaklkup(l);
	rt = breaklkup(r);

	/* CR LF is never split */
	if(l == '\r' && r == '\n')
		goto Done;
	/* otherwise always break after and before newlines */
	if(l == '\r' || l == '\n' || l == LINETAB)
		return p;
	/* fixed: this guard tested l == LINETAB a second time instead of r */
	if(r == '\r' || r == '\n' || r == LINETAB)
		return p;

	if(IS(lt, WSegSpace) && IS(rt, WSegSpace))
		goto Done;
	if(IS(rt, Format) || IS(rt, Extend))
		goto Done;

	if(AH(lt)){
		if(AH(rt))
			goto Done;
		/*
		 * letter (MidLetter | MidNumLet | ') letter.
		 * fixed: the apostrophe is matched against the rune r; the
		 * MNLQ(rt) form compared the class byte with '\''.
		 */
		if((IS(rt, MidLetter) || IS(rt, MidNumLet) || r == '\'') && p[1] != 0 && AH(breaklkup(p[1])))
			goto Done;
		if(IS(lt, Hebrew_Letter) && r == '\'')
			goto Done;
		if(IS(lt, Hebrew_Letter) && r == '"' && p[1] != 0 && IS(breaklkup(p[1]), Hebrew_Letter))
			goto Done;
		if(IS(rt, Numeric))
			goto Done;
	}

	if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric)))
		goto Done;
	/* digit (MidNum | MidNumLet | ') digit */
	if(IS(lt, Numeric) && (IS(rt, MidNum) || IS(rt, MidNumLet) || r == '\'') && p[1] != 0 && IS(breaklkup(p[1]), Numeric))
		goto Done;
	if(IS(lt, Katakana) && IS(rt, Katakana))
		goto Done;
	if((AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet)) && IS(rt, ExtendNumLet))
		goto Done;
	if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana)))
		goto Done;

	if(ISG(lt, REGION)){
		if(ISG(rt, REGION))
			goto Done;
		if(r != ZWJ)
			return p;
		/* REGION ZWJ: look at the rune after the joiner */
		p++;
		if((r = *p) == 0)
			return s;
		rt = breaklkup(r);
		if(ISG(rt, REGION))
			goto Done;
	}

	return p;

Done:
	/* no break before *p: report the position after it, if any */
	if(p[1] == 0)
		return s;
	return p + 1;
}
|
||||
|
||||
char*
|
||||
utfwbreak(char *s)
|
||||
{
|
||||
Rune l, r;
|
||||
Rune peek;
|
||||
uchar lt, rt;
|
||||
char *p;
|
||||
|
||||
p = s;
|
||||
p += chartorune(&l, p);
|
||||
if(l == 0)
|
||||
return s;
|
||||
chartorune(&peek, p+chartorune(&r, p));
|
||||
if(r == 0)
|
||||
return s;
|
||||
lt = breaklkup(l);
|
||||
rt = breaklkup(r);
|
||||
if(l == '\r' && r == '\n')
|
||||
goto Done;
|
||||
if(l == '\r' || l == '\n' || l == LINETAB)
|
||||
return p;
|
||||
if(r == '\r' || r == '\n' || l == LINETAB)
|
||||
return p;
|
||||
if(IS(lt, WSegSpace) && IS(rt, WSegSpace))
|
||||
goto Done;
|
||||
if(IS(rt, Format) || IS(rt, Extend))
|
||||
goto Done;
|
||||
if(AH(lt)){
|
||||
if(AH(rt))
|
||||
goto Done;
|
||||
if(IS(rt, MidLetter) || MNLQ(rt))
|
||||
if(peek != 0 && AH(breaklkup(peek)))
|
||||
goto Done;
|
||||
|
||||
if(IS(lt, Hebrew_Letter) && r == '\'')
|
||||
goto Done;
|
||||
|
||||
if(IS(lt, Hebrew_Letter) && r == '"')
|
||||
if(peek != 0 && IS(breaklkup(peek), Hebrew_Letter))
|
||||
goto Done;
|
||||
|
||||
if(IS(rt, Numeric))
|
||||
goto Done;
|
||||
}
|
||||
if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric)))
|
||||
goto Done;
|
||||
if(IS(lt, Numeric) && (IS(rt, MidNum) || MNLQ(rt)) && peek != 0 && IS(breaklkup(peek), Numeric))
|
||||
goto Done;
|
||||
if(IS(lt, Katakana) && IS(rt, Katakana))
|
||||
goto Done;
|
||||
if(AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet))
|
||||
if(IS(rt, ExtendNumLet))
|
||||
goto Done;
|
||||
if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana)))
|
||||
goto Done;
|
||||
if(ISG(lt, REGION)){
|
||||
if(ISG(rt, REGION))
|
||||
goto Done;
|
||||
if(r != ZWJ)
|
||||
return p;
|
||||
p += chartorune(&r, p);
|
||||
chartorune(&r, p);
|
||||
if(r == 0)
|
||||
return s;
|
||||
rt = breaklkup(r);
|
||||
if(ISG(rt, REGION))
|
||||
goto Done;
|
||||
}
|
||||
|
||||
return p;
|
||||
|
||||
Done:
|
||||
p += chartorune(&r, p);
|
||||
chartorune(&r, p);
|
||||
if(r == 0)
|
||||
return s;
|
||||
return p;
|
||||
}
|
40
sys/src/libc/port/runeistype.c
Normal file
40
sys/src/libc/port/runeistype.c
Normal file
|
@ -0,0 +1,40 @@
|
|||
#include <u.h>
|
||||
#include <libc.h>
|
||||
|
||||
#include "runeistypedata"
|
||||
|
||||
int
|
||||
isspacerune(Rune c)
|
||||
{
|
||||
return (mergedlkup(c) & Lspace) == Lspace;
|
||||
}
|
||||
|
||||
int
|
||||
isalpharune(Rune c)
|
||||
{
|
||||
return (mergedlkup(c) & Lalpha) == Lalpha;
|
||||
}
|
||||
|
||||
int
|
||||
isdigitrune(Rune c)
|
||||
{
|
||||
return (mergedlkup(c) & Ldigit) == Ldigit;
|
||||
}
|
||||
|
||||
int
|
||||
isupperrune(Rune c)
|
||||
{
|
||||
return (mergedlkup(c) & Lupper) == Lupper;
|
||||
}
|
||||
|
||||
int
|
||||
islowerrune(Rune c)
|
||||
{
|
||||
return (mergedlkup(c) & Llower) == Llower;
|
||||
}
|
||||
|
||||
int
|
||||
istitlerune(Rune c)
|
||||
{
|
||||
return (mergedlkup(c) & Ltitle) == Ltitle;
|
||||
}
|
334
sys/src/libc/port/runenorm.c
Normal file
334
sys/src/libc/port/runenorm.c
Normal file
|
@ -0,0 +1,334 @@
|
|||
#include <u.h>
|
||||
#include <libc.h>
|
||||
|
||||
#include "runenormdata"
|
||||
|
||||
//Unicode Standard: Section 3.12 Conjoining Jamo Behavior
|
||||
enum {
	/* first code point of each Hangul range used below */
	SBase = 0xAC00,		/* precomposed syllables */
	LBase = 0x1100,		/* leading consonants */
	VBase = 0x1161,		/* vowels */
	TBase = 0x11A7,		/* one below the first trailing consonant; users test "> TBase" */

	/* number of code points per range; TCount includes the "no trailing consonant" slot */
	LCount = 19,
	VCount = 21,
	TCount = 28,
	NCount = VCount * TCount,	/* syllables per leading consonant */
	SCount = LCount * NCount,	/* precomposed syllables in total */

	/* last code point of each range, inclusive */
	LLast = LBase + LCount - 1,
	SLast = SBase + SCount - 1,
	VLast = VBase + VCount - 1,
	TLast = TBase + TCount - 1,
};
|
||||
|
||||
static void
|
||||
_runedecomp(Rune dst[2], Rune c)
|
||||
{
|
||||
uint x;
|
||||
|
||||
if(c >= SBase && c <= SLast){
|
||||
c -= SBase;
|
||||
x = c % TCount;
|
||||
if(x){
|
||||
dst[0] = SBase + ((c / TCount) * TCount);
|
||||
dst[1] = TBase + x;
|
||||
return;
|
||||
}
|
||||
dst[0] = LBase + (c / NCount);
|
||||
dst[1] = VBase + ((c % NCount) / TCount);
|
||||
return;
|
||||
}
|
||||
x = decomplkup(c);
|
||||
if((x & 0xFFFF) != 0){
|
||||
dst[0] = x>>16;
|
||||
dst[1] = x & 0xFFFF;
|
||||
return;
|
||||
}
|
||||
x >>= 16;
|
||||
if(x >= 0xEEEE && x <0xF8FF){
|
||||
memmove(dst, _decompexceptions[x - 0xEEEE], sizeof(Rune)*2);
|
||||
return;
|
||||
}
|
||||
dst[0] = x;
|
||||
dst[1] = 0;
|
||||
}
|
||||
|
||||
static Rune
|
||||
_runerecomp(Rune r[2])
|
||||
{
|
||||
uint x, y, *p, next;
|
||||
|
||||
if(r[0] >= LBase && r[0] <= LLast){
|
||||
if(r[1] < VBase || r[1] > VLast)
|
||||
return 0;
|
||||
x = (r[0] - LBase) * NCount + (r[1] - VBase) * TCount;
|
||||
return SBase + x;
|
||||
}
|
||||
if(r[0] >= SBase && r[0] <= SLast && (r[0] - SBase) % TCount == 0){
|
||||
if(r[1] > TBase && r[1] <= TLast)
|
||||
return r[0] + (r[1] - TBase);
|
||||
return 0;
|
||||
}
|
||||
if(r[0] > 0xFFFF || r[1] > 0xFFFF){
|
||||
for(x = 0; x < nelem(_recompexceptions); x++)
|
||||
if(r[0] == _recompexceptions[x][1] && r[1] == _recompexceptions[x][2])
|
||||
return _recompexceptions[x][0];
|
||||
return 0;
|
||||
}
|
||||
y = x = r[0]<<16 | r[1];
|
||||
x ^= x >> 16;
|
||||
x *= 0x21f0aaad;
|
||||
x ^= x >> 15;
|
||||
x *= 0xd35a2d97;
|
||||
x ^= x >> 15;
|
||||
p = _recompdata + (x%512)*2;
|
||||
while(p[0] != y){
|
||||
next = p[1]>>16;
|
||||
if(!next)
|
||||
return 0;
|
||||
p = _recompcoll + (next-1)*2;
|
||||
}
|
||||
return p[1] & 0xFFFF;
|
||||
}
|
||||
|
||||
static void
|
||||
runecccsort(Rune *a, int len)
|
||||
{
|
||||
Rune r;
|
||||
int i;
|
||||
int fail;
|
||||
|
||||
do {
|
||||
fail = 0;
|
||||
for(i = 0; i < len - 1; i++){
|
||||
if(ccclkup(a[i]) > ccclkup(a[i+1]) > 0){
|
||||
r = a[i];
|
||||
a[i] = a[i+1];
|
||||
a[i + 1] = r;
|
||||
fail = 1;
|
||||
}
|
||||
}
|
||||
} while(fail);
|
||||
}
|
||||
|
||||
char*
|
||||
fullutfnorm(char *s, int n)
|
||||
{
|
||||
Rune r, peek;
|
||||
char *p, *p2;
|
||||
|
||||
p = s;
|
||||
if(fullrune(p, n) == 0)
|
||||
return s;
|
||||
|
||||
p += chartorune(&r, p);
|
||||
n -= (p - s);
|
||||
|
||||
if((r >= LBase && r <= LLast) || (r >= SBase && r <= SLast)){
|
||||
do {
|
||||
if(fullrune(p, n) == 0)
|
||||
return s;
|
||||
p2 = p + chartorune(&peek, p);
|
||||
n -= (p2 - p);
|
||||
p = p2;
|
||||
} while(n > 0 && (peek >= VBase && peek <= VLast) || (peek > TBase && peek <= TLast));
|
||||
if(n <= 0)
|
||||
return s;
|
||||
return p;
|
||||
}
|
||||
|
||||
do {
|
||||
if(fullrune(p, n) == 0)
|
||||
return s;
|
||||
p2 = p + chartorune(&peek, p);
|
||||
n -= (p2 - p);
|
||||
p = p2;
|
||||
if(ccclkup(peek) == 0)
|
||||
return p;
|
||||
} while(n > 0);
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
/*
 * fullrunenorm reports whether the first n runes of r hold a complete
 * normalization context.  It returns a pointer to the rune that starts
 * the next context, or r when the n runes end before one is found.
 */
Rune*
fullrunenorm(Rune *r, int n)
{
	Rune *e, *p;

	/* nothing to look at; the original dereferenced r regardless */
	if(n <= 0)
		return r;

	p = r;
	e = p + n;

	/* Hangul leading consonant or syllable: skip the vowels/trailing consonants */
	if((*p >= LBase && *p <= LLast) || (*p >= SBase && *p <= SLast)){
		p++;
		/*
		 * fixed: without the outer parentheses the trailing-consonant
		 * test was not guarded by p < e and could read past the input.
		 */
		while(p < e && ((*p >= VBase && *p <= VLast) || (*p > TBase && *p <= TLast)))
			p++;

		if(p >= e)
			return r;
		return p;
	}

	/* otherwise the context ends at the next rune of combining class 0 */
	for(; p < e && p + 1 < e; p++)
		if(ccclkup(p[1]) == 0)
			return p + 1;

	return r;
}
|
||||
|
||||
static int
|
||||
runenorm(Rune *dst, Rune *src, char *sdst, char *ssrc, int max, int compose)
|
||||
{
|
||||
Rune c, r[2], _stack[32];
|
||||
Rune *p, *stack, *sp, *tp;
|
||||
char *strp, *strstop;
|
||||
Rune *rp, *rrp;
|
||||
Rune *stop;
|
||||
Rune peek;
|
||||
int w, w2, size;
|
||||
int mode;
|
||||
|
||||
if(src){
|
||||
mode = 1;
|
||||
p = src;
|
||||
stop = dst + (max - 1);
|
||||
strp = "";
|
||||
strstop = nil;
|
||||
} else {
|
||||
mode = 0;
|
||||
p = L"";
|
||||
stop = nil;
|
||||
strp = ssrc;
|
||||
strstop = sdst + (max - 1);
|
||||
}
|
||||
|
||||
stack = _stack + nelem(_stack)/2;
|
||||
size = 0;
|
||||
w = w2 = 0;
|
||||
while(*strp || *p){
|
||||
if(mode)
|
||||
c = *p;
|
||||
else
|
||||
w = chartorune(&c, strp);
|
||||
|
||||
sp = stack - 1;
|
||||
tp = stack;
|
||||
_runedecomp(r, c);
|
||||
while(r[0] != 0){
|
||||
c = r[0];
|
||||
if(r[1] != 0){
|
||||
*sp-- = r[1];
|
||||
if(sp == _stack)
|
||||
break;
|
||||
}
|
||||
_runedecomp(r, c);
|
||||
}
|
||||
|
||||
*sp = c;
|
||||
if(mode)
|
||||
peek = p[1];
|
||||
else
|
||||
w2 = chartorune(&peek, strp+w);
|
||||
|
||||
if((*sp >= LBase && *sp <= LLast) || (*sp >= SBase && *sp <= SLast)){
|
||||
while(peek != 0 && (peek >= VBase && peek <= VLast) || (peek > TBase && peek <= TLast)){
|
||||
*tp++ = peek;
|
||||
if(mode){
|
||||
p++;
|
||||
peek = p[1];
|
||||
} else {
|
||||
strp += w;
|
||||
w = w2;
|
||||
w2 = chartorune(&peek, strp+w);
|
||||
}
|
||||
if(tp == _stack + nelem(_stack))
|
||||
break;
|
||||
}
|
||||
}
|
||||
while(peek != 0 && ccclkup(peek) != 0){
|
||||
_runedecomp(r, peek);
|
||||
if(r[1] != 0){
|
||||
if(tp+1 >= _stack + nelem(_stack))
|
||||
break;
|
||||
*tp++ = r[0];
|
||||
*tp++ = r[1];
|
||||
} else if(r[0] != 0)
|
||||
*tp++ = r[0];
|
||||
else
|
||||
*tp++ = peek;
|
||||
|
||||
if(mode){
|
||||
p++;
|
||||
peek = p[1];
|
||||
} else {
|
||||
strp += w;
|
||||
w = w2;
|
||||
w2 = chartorune(&peek, strp+w);
|
||||
}
|
||||
if(tp == _stack + nelem(_stack))
|
||||
break;
|
||||
}
|
||||
runecccsort(sp, tp - sp);
|
||||
|
||||
if(compose && ccclkup(*sp) == 0){
|
||||
for(rp = sp + 1; rp < tp; rp++){
|
||||
r[0] = *sp;
|
||||
r[1] = *rp;
|
||||
c = _runerecomp(r);
|
||||
if(c != 0){
|
||||
*sp = c;
|
||||
for(rrp = rp; rrp > sp; rrp--)
|
||||
*rrp = rrp[-1];
|
||||
sp++;
|
||||
} else while(rp + 1 < tp && ccclkup(*rp) == ccclkup(*(rp+1)))
|
||||
rp++;
|
||||
}
|
||||
}
|
||||
|
||||
for(; sp < tp; sp++){
|
||||
if(mode){
|
||||
if(dst < stop)
|
||||
*dst++ = *sp;
|
||||
size++;
|
||||
} else {
|
||||
w2 = runelen(*sp);
|
||||
if(sdst+w2 < strstop)
|
||||
sdst += runetochar(sdst, sp);
|
||||
size += w2;
|
||||
}
|
||||
}
|
||||
if(mode)
|
||||
p++;
|
||||
else
|
||||
strp += w;
|
||||
}
|
||||
if(mode)
|
||||
*dst = 0;
|
||||
else
|
||||
*sdst = 0;
|
||||
return size;
|
||||
}
|
||||
|
||||
int
|
||||
runecomp(Rune *dst, Rune *src, int max)
|
||||
{
|
||||
return runenorm(dst, src, nil, nil, max, 1);
|
||||
}
|
||||
|
||||
int
|
||||
runedecomp(Rune *dst, Rune *src, int max)
|
||||
{
|
||||
return runenorm(dst, src, nil, nil, max, 0);
|
||||
}
|
||||
|
||||
int
|
||||
utfcomp(char *dst, char *src, int max)
|
||||
{
|
||||
return runenorm(nil, nil, dst, src, max, 1);
|
||||
}
|
||||
|
||||
int
|
||||
utfdecomp(char *dst, char *src, int max)
|
||||
{
|
||||
return runenorm(nil, nil, dst, src, max, 0);
|
||||
}
|
22
sys/src/libc/port/runetotype.c
Normal file
22
sys/src/libc/port/runetotype.c
Normal file
|
@ -0,0 +1,22 @@
|
|||
#include <u.h>
|
||||
#include <libc.h>
|
||||
|
||||
#include "runetotypedata"
|
||||
|
||||
Rune
|
||||
toupperrune(Rune c)
|
||||
{
|
||||
return c + upperlkup(c);
|
||||
}
|
||||
|
||||
Rune
|
||||
tolowerrune(Rune c)
|
||||
{
|
||||
return c + lowerlkup(c);
|
||||
}
|
||||
|
||||
Rune
|
||||
totitlerune(Rune c)
|
||||
{
|
||||
return c + titlelkup(c);
|
||||
}
|
|
@ -3,6 +3,14 @@
|
|||
TEST=\
|
||||
date\
|
||||
pow\
|
||||
runebreak\
|
||||
runenorm\
|
||||
strchr\
|
||||
|
||||
</sys/src/cmd/mktest
|
||||
|
||||
/lib/ucd/%:
|
||||
cd /lib/ucd && mk $stem
|
||||
|
||||
runebreak.test: /lib/ucd/GraphemeBreakTest.txt /lib/ucd/WordBreakTest.txt
|
||||
runenorm.test: /lib/ucd/NormalizationTest.txt
|
||||
|
|
112
sys/src/libc/test/runebreak.c
Normal file
112
sys/src/libc/test/runebreak.c
Normal file
|
@ -0,0 +1,112 @@
|
|||
#include <u.h>
|
||||
#include <libc.h>
|
||||
#include <bio.h>
|
||||
|
||||
static int
|
||||
estrtoul(char *s)
|
||||
{
|
||||
char *epr;
|
||||
Rune code;
|
||||
|
||||
code = strtoul(s, &epr, 16);
|
||||
if(s == epr)
|
||||
sysfatal("bad code point hex string");
|
||||
return code;
|
||||
}
|
||||
|
||||
/*
 * check runs the rune break function fn on r and the UTF-8 break function
 * fn2 on the same text, and reports when the two disagree about the
 * segment they skip.  Returns the result of fn.
 */
static Rune*
check(Rune *r, Rune* (*fn)(Rune*), char* (*fn2)(char*))
{
	Rune *r2, *tmp;
	char *p, *p2;

	p = smprint("%S", r);
	if(p == nil)
		sysfatal("smprint: %r");
	r2 = fn(r);
	p2 = fn2(p);

	/* convert the bytes skipped by fn2 back to runes for comparison */
	tmp = runesmprint("%.*s", (int)(p2-p), p);
	if(tmp == nil)
		sysfatal("runesmprint: %r");
	/*
	 * fixed: both segments must have the same number of runes, and
	 * memcmp takes a byte count, not a rune count.
	 */
	if(runestrlen(tmp) != r2-r || memcmp(r, tmp, (r2-r)*sizeof(Rune)) != 0)
		print("utf mismatch\n");

	free(p);
	free(tmp);
	return r2;
}
|
||||
|
||||
static void
|
||||
run(char *file, Rune* (*fn)(Rune*), char* (*fn2)(char*))
|
||||
{
|
||||
Biobuf *b;
|
||||
char *p, *dot;
|
||||
char *pieces[16];
|
||||
int i, j, n;
|
||||
Rune stack[16], ops[16];
|
||||
int nstack, nops;
|
||||
Rune r, *rp, *rp2;
|
||||
char *line;
|
||||
|
||||
b = Bopen(file, OREAD);
|
||||
if(b == nil)
|
||||
sysfatal("could not load composition exclusions: %r");
|
||||
|
||||
for(;(p = Brdline(b, '\n')) != nil; free(line)){
|
||||
p[Blinelen(b)-1] = 0;
|
||||
line = strdup(p);
|
||||
if(p[0] == 0 || p[0] == '#')
|
||||
continue;
|
||||
if((dot = strstr(p, "#")) != nil)
|
||||
*dot = 0;
|
||||
n = getfields(p, pieces, nelem(pieces), 0, " ");
|
||||
nstack = nops = 0;
|
||||
for(i = 0; i < n; i++){
|
||||
chartorune(&r, pieces[i]);
|
||||
if(r != L'÷' && r != L'×'){
|
||||
r = estrtoul(pieces[i]);
|
||||
stack[nstack++] = r;
|
||||
stack[nstack] = 0;
|
||||
} else {
|
||||
ops[nops++] = r;
|
||||
ops[nops] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
rp = stack;
|
||||
for(i = 1; i < nops-1;){
|
||||
rp2 = check(rp, fn, fn2);
|
||||
switch(ops[i]){
|
||||
case L'÷':
|
||||
if(rp2 != rp+1){
|
||||
print("break fail %X %X || %s\n", rp[0], rp[1], line);
|
||||
goto Break;
|
||||
}
|
||||
rp++;
|
||||
i++;
|
||||
break;
|
||||
case L'×':
|
||||
if(rp2 - rp == 0){
|
||||
for(j = i; j < nops - 1; j++)
|
||||
if(ops[j] != L'×')
|
||||
print("skipped %d %d %s\n", i, nops, line);
|
||||
goto Break;
|
||||
}
|
||||
for(; rp < (rp2-1); rp++, i++){
|
||||
if(ops[i] != L'×')
|
||||
print("skipped %d %d %s\n", i, nops, line);
|
||||
}
|
||||
rp = rp2;
|
||||
i++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
Break:
|
||||
;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
main(int, char)
|
||||
{
|
||||
run("/lib/ucd/GraphemeBreakTest.txt", runegbreak, utfgbreak);
|
||||
run("/lib/ucd/WordBreakTest.txt", runewbreak, utfwbreak);
|
||||
exits(nil);
|
||||
}
|
92
sys/src/libc/test/runenorm.c
Normal file
92
sys/src/libc/test/runenorm.c
Normal file
|
@ -0,0 +1,92 @@
|
|||
#include <u.h>
|
||||
#include <libc.h>
|
||||
#include <bio.h>
|
||||
|
||||
static int
|
||||
estrtoul(char *s)
|
||||
{
|
||||
char *epr;
|
||||
Rune code;
|
||||
|
||||
code = strtoul(s, &epr, 16);
|
||||
if(s == epr)
|
||||
sysfatal("bad code point hex string");
|
||||
return code;
|
||||
}
|
||||
|
||||
void
|
||||
main(int, char)
|
||||
{
|
||||
Rune buffer1[64];
|
||||
Rune buffer2[64];
|
||||
char utfbuff1[128];
|
||||
char utfbuff2[128];
|
||||
char srctmp[128], tmp1[128], tmp2[128];
|
||||
char *fields[10];
|
||||
char *runes[32];
|
||||
char *p;
|
||||
int n, n2;
|
||||
int i;
|
||||
uint fail;
|
||||
Biobuf *b;
|
||||
|
||||
b = Bopen("/lib/ucd/NormalizationTest.txt", OREAD);
|
||||
if(b == nil)
|
||||
sysfatal("could not load composition exclusions: %r");
|
||||
|
||||
struct {
|
||||
Rune src[32];
|
||||
Rune nfc[32];
|
||||
Rune nfd[32];
|
||||
} test;
|
||||
while((p = Brdline(b, '\n')) != nil){
|
||||
p[Blinelen(b)-1] = 0;
|
||||
if(p[0] == 0 || p[0] == '#' || p[0] == '@')
|
||||
continue;
|
||||
getfields(p, fields, 6 + 1, 0, ";");
|
||||
n = getfields(fields[0], runes, nelem(runes), 0, " ");
|
||||
for(i = 0; i < n; i++)
|
||||
test.src[i] = estrtoul(runes[i]);
|
||||
test.src[i] = 0;
|
||||
|
||||
n = getfields(fields[1], runes, nelem(runes), 0, " ");
|
||||
for(i = 0; i < n; i++)
|
||||
test.nfc[i] = estrtoul(runes[i]);
|
||||
test.nfc[i] = 0;
|
||||
|
||||
n = getfields(fields[2], runes, nelem(runes), 0, " ");
|
||||
for(i = 0; i < n; i++)
|
||||
test.nfd[i] = estrtoul(runes[i]);
|
||||
test.nfd[i] = 0;
|
||||
|
||||
n = runecomp(buffer1, test.src, nelem(buffer1));
|
||||
n2 = runedecomp(buffer2, test.src, nelem(buffer2));
|
||||
fail = 0;
|
||||
|
||||
if(runestrcmp(buffer1, test.nfc) != 0)
|
||||
fail |= 1<<0;
|
||||
if(runestrcmp(buffer2, test.nfd) != 0)
|
||||
fail |= 1<<1;
|
||||
if(fail)
|
||||
print("%d %d %S %S %S %S %S\n", fail, i, test.src, test.nfd, test.nfc, buffer2, buffer1);
|
||||
assert(n == runestrlen(test.nfc));
|
||||
assert(n2 == runestrlen(test.nfd));
|
||||
|
||||
snprint(srctmp, sizeof tmp1, "%S", test.src);
|
||||
snprint(tmp1, sizeof tmp1, "%S", test.nfc);
|
||||
snprint(tmp2, sizeof tmp2, "%S", test.nfd);
|
||||
|
||||
n = utfcomp(utfbuff1, srctmp, nelem(utfbuff1));
|
||||
n2 = utfdecomp(utfbuff2, srctmp, nelem(utfbuff2));
|
||||
|
||||
if(strcmp(utfbuff1, tmp1) != 0)
|
||||
fail |= 1<<2;
|
||||
if(strcmp(utfbuff2, tmp2) != 0)
|
||||
fail |= 1<<3;
|
||||
if(fail)
|
||||
print("%d %d %s %s %s %s %s\n", fail, i, srctmp, tmp2, tmp1, utfbuff2, utfbuff1);
|
||||
assert(n == strlen(tmp1));
|
||||
assert(n2 == strlen(tmp2));
|
||||
}
|
||||
exits(nil);
|
||||
}
|
Loading…
Reference in a new issue