runecomp(2)

2025-01-12 11:10:06 +00:00 · 2023-03-26 01:02:20 +00:00 · 2023-03-26 01:02:20 +00:00 · 04759ec9af
commit 04759ec9af
parent 2163aebcb8
13 changed files with 1894 additions and 2 deletions
--- a/lib/ucd/mkfile
+++ b/lib/ucd/mkfile
@ -0,0 +1,85 @@
+</$objtype/mkfile
+
+VERSION='15.0.0'
+URL='https://www.unicode.org/Public/'$VERSION'/ucd/'
+
+TXT=\
+	ArabicShaping.txt\
+	BidiBrackets.txt\
+	BidiMirroring.txt\
+	BidiTest.txt\
+	Blocks.txt\
+	CJKRadicals.txt\
+	CaseFolding.txt\
+	CompositionExclusions.txt\
+	DerivedAge.txt\
+	DerivedCoreProperties.txt\
+	DerivedNormalizationProps.txt\
+	EastAsianWidth.txt\
+	EmojiSources.txt\
+	EquivalentUnifiedIdeograph.txt\
+	HangulSyllableType.txt\
+	Index.txt\
+	IndicPositionalCategory.txt\
+	IndicSyllabicCategory.txt\
+	Jamo.txt\
+	LineBreak.txt\
+	NameAliases.txt\
+	NamedSequences.txt\
+	NamedSequencesProv.txt\
+	NamesList.txt\
+	NormalizationCorrections.txt\
+	NushuSources.txt\
+	PropList.txt\
+	PropertyAliases.txt\
+	PropertyValueAliases.txt\
+	ScriptExtensions.txt\
+	Scripts.txt\
+	SpecialCasing.txt\
+	StandardizedVariants.txt\
+	TangutSources.txt\
+	USourceData.txt\
+	UnicodeData.txt\
+	VerticalOrientation.txt\
+
+TEST=\
+	NormalizationTest.txt\
+	BidiCharacterTest.txt\
+
+PDF=\
+	USourceGlyphs.pdf\
+	USourceRSChart.pdf\
+
+AUX=\
+	WordBreakProperty.txt\
+	GraphemeBreakProperty.txt\
+
+# first target in the file: only UnicodeData.txt is required
+ucd:V: UnicodeData.txt
+
+# generic rules: fetch the file named by the target from $URL with hget
+%.txt:
+	hget $URL^$target > $target >[2]/dev/null
+%.pdf:
+	hget $URL^$target > $target
+
+# emoji-data.txt is fetched from the emoji/ subdirectory of $URL
+emoji-data.txt:
+	hget $URL^emoji/^$target > $target
+
+# the break property and break test files are fetched from auxiliary/
+WordBreakProperty.txt:
+	hget $URL^'auxiliary/'^$target > $target
+
+GraphemeBreakProperty.txt:
+	hget $URL^'auxiliary/'^$target > $target
+
+WordBreakTest.txt:
+	hget $URL^'auxiliary/'^$target > $target
+
+GraphemeBreakTest.txt:
+	hget $URL^'auxiliary/'^$target > $target
+
+# convenience targets for each file list declared above
+txt:V: $TXT
+
+pdf:V: $PDF
+
+test:V: $TEST
+
+# NOTE(review): $AUX and emoji-data.txt are not part of all
+all:V: $TXT $PDF $TEST
--- a/sys/include/libc.h
+++ b/sys/include/libc.h
@ -77,6 +77,18 @@ extern	Rune*	runestrrchr(Rune*, Rune);
 extern	long	runestrlen(Rune*);
 extern	Rune*	runestrstr(Rune*, Rune*);

+extern	int	runecomp(Rune*, Rune*, int);
+extern	int	runedecomp(Rune*, Rune*, int);
+extern	int	utfcomp(char*, char*, int);
+extern	int	utfdecomp(char*, char*, int);
+extern	char*	fullutfnorm(char*,int);
+extern	Rune*	fullrunenorm(Rune*,int);
+
+extern	Rune*	runewbreak(Rune*);
+extern	char*	utfwbreak(char*);
+extern	Rune*	runegbreak(Rune*);
+extern	char*	utfgbreak(char*);
+
 extern	Rune	tolowerrune(Rune);
 extern	Rune	totitlerune(Rune);
 extern	Rune	toupperrune(Rune);
--- a/sys/man/2/isalpharune
+++ b/sys/man/2/isalpharune
@ -48,7 +48,11 @@ The names are self-explanatory.
 .PP
 The case-conversion routines return the character unchanged if it has no case.
 .SH SOURCE
-.B /sys/src/libc/port/runetype.c
+.B /sys/src/libc/port/mkrunetype.c
+.br
+.B /sys/src/libc/port/runeistype.c
+.br
+.B /sys/src/libc/port/runetotype.c
 .SH "SEE ALSO
 .IR ctype (2) ,
 .IR "The Unicode Standard" .
--- a/sys/man/2/runecomp
+++ b/sys/man/2/runecomp
@ -0,0 +1,116 @@
+.TH RUNECOMP 2
+.SH NAME
+runecomp, runedecomp, fullrunenorm, runegbreak, runewbreak, utfcomp, utfdecomp, fullutfnorm, utfgbreak, utfwbreak \- multi-rune graphemes
+.SH SYNOPSIS
+.ta \w'\fLchar*xx'u
+.B #include <u.h>
+.br
+.B #include <libc.h>
+.PP
+.B
+int	runecomp(Rune *dst, Rune *src, int max)
+.PP
+.B
+int	runedecomp(Rune *dst, Rune *src, int max)
+.PP
+.B
+Rune*	fullrunenorm(Rune *s, int n)
+.PP
+.B
+Rune*	runegbreak(Rune *s)
+.PP
+.B
+Rune*	runewbreak(Rune *s)
+.PP
+.B
+int	utfcomp(char *dst, char *src, int max)
+.PP
+.B
+int	utfdecomp(char *dst, char *src, int max)
+.PP
+.B
+char*	fullutfnorm(char *s, int n)
+.PP
+.B
+char*	utfgbreak(char *s)
+.PP
+.B
+char*	utfwbreak(char *s)
+.SH DESCRIPTION
+These routines help in handling
+graphemes that may span multiple runes.
+.PP
+.IR Runecomp ,
+.IR runedecomp ,
+.IR utfcomp ,
+and
+.I utfdecomp
+perform Unicode® normalization on
+.IR src ,
+storing the result in
+.IR dst .
+No more than
+.I max
+elements will be written, and the resulting string
+will always be null terminated. The return value
+is always the total number of elements required to
+store the transformation. If this value is larger
+than the supplied
+.I max
+the caller can assume the result has been truncated.
+.I Runecomp
+and
+.I utfcomp
+perform NFC normalization while
+.I runedecomp
+and
+.I utfdecomp
+perform NFD normalization.
+.PP
+.I Fullrunenorm
+and
+.I fullutfnorm
+determine if enough elements are present in
+.I s
+to perform normalization. If enough are present,
+a pointer is returned to the first element that begins
+the next context. Otherwise
+.I s
+is returned. No more than
+.I n
+elements will be read. In order to find the boundary, the
+first element of the next context must be peeked.
+.PP
+.I Runegbreak
+and
+.I utfgbreak
+search
+.I s
+for the next grapheme break opportunity.
+If none is found before the end of the string,
+.I s
+is returned.
+.PP
+.I Runewbreak
+and
+.I utfwbreak
+search
+.I s
+for the next word break opportunity.
+If none is found before the end of the string,
+.I s
+is returned.
+.SH SOURCE
+.B /sys/src/libc/port/mkrunetype.c
+.br
+.B /sys/src/libc/port/runenorm.c
+.br
+.B /sys/src/libc/port/runebreak.c
+.SH SEE ALSO
+Unicode® Standard Annex #15
+.br
+Unicode® Standard Annex #29
+.br
+.IR rune (2) ,
+.IR utf (6) ,
+.IR tcs (1)
--- a/sys/src/libc/port/mkfile
+++ b/sys/src/libc/port/mkfile
@ -62,6 +62,9 @@ CFILES=\
 	rand.c\
 	readn.c\
 	rune.c\
+	runebreak.c\
+	runeistype.c\
+	runenorm.c\
 	runestrcat.c\
 	runestrchr.c\
 	runestrcmp.c\
@ -74,7 +77,7 @@ CFILES=\
 	runestrrchr.c\
 	runestrlen.c\
 	runestrstr.c\
-	runetype.c\
+	runetotype.c\
 	sin.c\
 	sinh.c\
 	sqrt.c\
@ -127,3 +130,26 @@ UPDATE=mkfile\
 </sys/src/cmd/mksyslib

 profile.$O: /sys/include/tos.h
+
+runenorm.$O:	runenormdata runenorm.c
+runetotype.$O:	runetotypedata runetotype.c
+runeistype.$O:	runeistypedata runeistype.c
+runebreak.$O:	runebreakdata runebreak.c
+
+UCD=\
+	/lib/ucd/WordBreakProperty.txt\
+	/lib/ucd/GraphemeBreakProperty.txt\
+	/lib/ucd/emoji-data.txt\
+	/lib/ucd/CompositionExclusions.txt\
+	/lib/ucd/UnicodeData.txt\
+
+/lib/ucd/%:
+	cd /lib/ucd && mk $stem
+
+runenormdata runetotypedata runeistypedata runebreakdata:	mkrunetype.c $UCD
+	@{
+		eval `{grep '^[A-Z]' /$cputype/mkfile}
+		$CC $CFLAGS -o mkrunetype.$O mkrunetype.c
+		$LD $LDFLAGS -o $O.mkrunetype mkrunetype.$O
+		$O.mkrunetype
+	}
--- a/sys/src/libc/port/mkrunetype.c
+++ b/sys/src/libc/port/mkrunetype.c
@ -0,0 +1,748 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+
+enum{
+	NRUNES = 1<<21
+};
+
+typedef struct Param Param;
+typedef struct Lvl Lvl;
+/* One level of the lookup table: a bit width and the values derive() computes from it. */
+struct Lvl{
+	int bits;	/* width of this level's bit field */
+	int max;	/* 1<<bits */
+	int mask;	/* max-1 */
+};
+/* How param() splits the 21 bits of a code point into idx1, idx2 and data fields. */
+struct Param{
+	Lvl idx1;
+	Lvl idx2;
+	Lvl data;
+
+	int round1max;	/* NRUNES/data.max: number of data chunks in a full map */
+};
+
+/* Fill in the fields of l that follow from l->bits. */
+static void
+derive(Lvl *l)
+{
+	int span;
+
+	span = 1 << l->bits;
+	l->max = span;
+	l->mask = span - 1;
+}
+
+/*
+ * Set up p so that the top idx1 bits and the next idx2 bits of a
+ * 21-bit code point index the two index tables and the remaining
+ * low bits index the data chunk.
+ */
+static void
+param(Param *p, int idx1, int idx2)
+{
+	int rest;
+
+	assert(idx1 + idx2 < 21);
+	rest = 21 - idx1 - idx2;
+
+	p->idx1.bits = idx1;
+	derive(&p->idx1);
+	p->idx2.bits = idx2;
+	derive(&p->idx2);
+	p->data.bits = rest;
+	derive(&p->data);
+
+	p->round1max = NRUNES/p->data.max;
+}
+
+/* Look x up through the three tables, the same way the generated *lkup() macro does. */
+static int
+lkup(Param *p, int *idx1, int *idx2, int *data, int x)
+{
+	int hi, mid, lo;
+
+	hi = (x >> (p->data.bits + p->idx2.bits)) & p->idx1.mask;
+	mid = (x >> p->data.bits) & p->idx2.mask;
+	lo = x & p->data.mask;
+	return data[idx2[idx1[hi] + mid] + lo];
+}
+
+/*
+ * Emit d[0..len-1] to fd as a static C array called name, using the
+ * narrowest element type that holds every value.  Returns the size
+ * in bytes of the array.  With fd < 0 nothing is written and only
+ * the size is computed.
+ */
+static int
+mkarrvar(int fd, char *name, int *d, int len)
+{
+	int i, sz;
+	int max, min;
+	char *t;
+
+	max = min = 0;
+	for(i = 0; i < len; i++){
+		if(d[i] > max)
+			max = d[i];
+		if(d[i] < min)
+			min = d[i];
+	}
+	if(min == 0){
+		/* no negative values: use an unsigned type */
+		if(max < 0xFF)
+			t = "uchar", sz = 1;
+		else if(max < 0xFFFF)
+			t = "ushort", sz = 2;
+		else
+			t = "uint", sz = 4;
+	} else {
+		/* signed: both ends of the range must fit, not just max */
+		if(max < 1<<7 && min >= -(1<<7))
+			t = "char", sz = 1;
+		else if(max < 1<<15 && min >= -(1<<15))
+			t = "short", sz = 2;
+		else
+			t = "int", sz = 4;
+	}
+	if(fd < 0)
+		return sz * len;
+
+	fprint(fd, "static\n%s\t%s[%d] =\n{\n\t", t, name, len);
+	for(i = 0; i < len; i++){
+		fprint(fd, "%d,", d[i]);
+		if((i+1) % 16 == 0)
+			fprint(fd, "\n\t");
+	}
+	fprint(fd, "\n};\n");
+
+	return sz * len;
+}
+
+/*
+ * Emit the n triples in d as a static Rune table called name.
+ * With all set each row has three columns and rows whose first
+ * element is 0 are left out; otherwise only the last two elements
+ * of every triple are written.
+ */
+static int
+mkexceptarr(int fd, char *name, int *d, int n, int all)
+{
+	int k, width;
+	int *row;
+
+	width = 2;
+	if(all)
+		width = 3;
+	fprint(fd, "static\nRune %s[][%d] =\n{\n\t", name, width);
+	for(k = 0; k < n; k++){
+		row = d + k*3;
+		if(!all)
+			fprint(fd, "{0x%X, 0x%X},", row[1], row[2]);
+		else if(row[0] != 0)
+			fprint(fd, "{0x%X, 0x%X, 0x%X},", row[0], row[1], row[2]);
+		/* line break after every eighth triple */
+		if((k+1) % 8 == 0)
+			fprint(fd, "\n\t");
+	}
+	fprint(fd, "\n};\n");
+	return n * sizeof(Rune) * 2;
+}
+
+/*
+ * Pack nidx consecutive chunks of chunksize ints from src into data,
+ * letting the tail of each chunk overlap the head of the next one
+ * when the values are equal.  idx[i] receives the offset in data at
+ * which chunk i starts; idx[nidx] is written as well.  Returns the
+ * number of ints stored in data.
+ */
+static int
+compact(int *data, int *idx, int nidx, int *src, int chunksize)
+{
+	int i, n, ndata, best;
+	int *dot, *lp, *rp;
+
+	dot = src;
+	ndata = 0;
+	idx[0] = 0;
+	for(i = 1; i <= nidx; i++){
+		/* dot is the current chunk, rp the start of the following one */
+		rp = dot + chunksize;
+		lp = rp - 1;
+
+		/*
+		 * find the longest suffix of this chunk equal to a prefix
+		 * of the next; the last chunk (i == nidx) has no successor
+		 */
+		for(best = 0, n = 0; i != nidx && n < chunksize; n++, lp--){
+			if(memcmp(lp, rp, (n+1) * sizeof data[0]) == 0)
+				best = n+1;
+		}
+		/* store only the part that the next chunk will not supply */
+		memmove(data + ndata, dot, (chunksize - best) * sizeof data[0]);
+		ndata += (chunksize - best);
+		idx[i] = idx[i - 1] + (chunksize - best);
+		dot = rp;
+	}
+	return ndata;
+}
+
+
+/*
+ * Build the three-level table for map using the bit split in p and
+ * write it to fd as _<label>data, _<label>idx2 and _<label>idx1,
+ * followed by the <label>lkup() access macros.  Returns the total
+ * size in bytes of the three arrays.  With fd < 0 nothing is
+ * written or verified; only the size is computed.
+ */
+static int
+mklkup(int fd, char *label, int *map, Param *p)
+{
+	static int data[NRUNES];
+	static int idx2[NRUNES];
+	static int idx2dest[NRUNES];
+	static int idx1[NRUNES];
+	char name[64];
+	int i, nidx2, ndata;
+	int size;
+
+	ndata = compact(data, idx2, p->round1max, map, p->data.max);
+	nidx2 = compact(idx2dest, idx1, p->idx1.max, idx2, p->idx2.max);
+
+	if(fd >= 0){
+		/* every code point must read back unchanged through the tables */
+		for(i = 0; i < NRUNES; i++)
+			if(map[i] != lkup(p, idx1, idx2dest, data, i))
+				sysfatal("mismatch in %s at %d %d %d\n", label, i, map[i], lkup(p, idx1, idx2dest, data, i));
+	}
+
+	/* names are formatted into a local buffer so nothing is left allocated */
+	snprint(name, sizeof name, "_%sdata", label);
+	size = mkarrvar(fd, name, data, ndata);
+	snprint(name, sizeof name, "_%sidx2", label);
+	size += mkarrvar(fd, name, idx2dest, nidx2);
+	snprint(name, sizeof name, "_%sidx1", label);
+	size += mkarrvar(fd, name, idx1, p->idx1.max);
+	if(fd >= 0){
+		fprint(fd, "\n");
+		fprint(fd, "#define %sindex1(x) (((x)>>(%d+%d))&0x%X)\n", label, p->data.bits, p->idx2.bits, p->idx1.mask);
+		fprint(fd, "#define %sindex2(x) (((x)>>%d)&0x%X)\n", label, p->data.bits, p->idx2.mask);
+		fprint(fd, "#define %soffset(x) ((x)&0x%X)\n", label, p->data.mask);
+		fprint(fd, "#define %slkup(x) (_%sdata[_%sidx2[_%sidx1[%sindex1(x)] + %sindex2(x)] + %soffset(x)] )\n\n",
+			label, label, label, label, label, label, label);
+	}
+	return size;
+}
+
+/*
+ * Try every idx1/idx2 bit split in the searched range, report the
+ * one giving the smallest tables for map on standard error, and
+ * leave p set up for it.
+ */
+static void
+mklkupmatrix(char *label, int *map, Param *p)
+{
+	int b1, b2, cost;
+	int win1, win2, wincost;
+
+	win1 = -1;
+	win2 = -1;
+	wincost = -1;
+	for(b1 = 4; b1 <= 12; b1++){
+		for(b2 = 4; b2 <= (19 - b1); b2++){
+			param(p, b1, b2);
+			cost = mklkup(-1, label, map, p);
+			if(wincost != -1 && cost >= wincost)
+				continue;
+			win1 = b1;
+			win2 = b2;
+			wincost = cost;
+		}
+	}
+
+	assert(wincost != -1);
+	fprint(2, "label: %s best: %d %d (%d)\n", label, win1, win2, wincost);
+	param(p, win1, win2);
+}
+
+static int myismerged[NRUNES];
+static int mytoupper[NRUNES];
+static int mytolower[NRUNES];
+static int mytotitle[NRUNES];
+static int mybreak[NRUNES];
+
+enum{ DSTART = 0xEEEE };
+static int mydecomp[NRUNES];
+static int mydespecial[256*3];
+static int nspecial;
+static int myccc[NRUNES];
+
+typedef struct KV KV;
+struct KV{
+	uint key;
+	uint val;
+	ushort next;
+};
+
+static KV myrecomp[2000];
+static int nrecomp;
+
+static int recompext[256*3];
+static int nrecompext;
+
+/* Integer mixing function; _runerecomp in runenorm.c repeats the same steps. */
+static uint
+hash(uint x)
+{
+	uint h;
+
+	h = x ^ (x >> 16);
+	h *= 0x21f0aaad;
+	h ^= h >> 15;
+	h *= 0xd35a2d97;
+	h ^= h >> 15;
+	return h;
+}
+
+/*
+ * Write the recomposition hash table to fd as _recompdata.
+ * Each entry is two uints: the key, then val | next<<16.  The first
+ * nelem(vals) entries are the hash buckets; the collision entries
+ * follow, and a non-zero next is the 1-based index of the next
+ * collision entry in the chain.
+ */
+static void
+mkrecomp(int fd)
+{
+	int i, total;
+	KV *p;
+	static KV vals[512];
+	static KV coll[1000];
+	int over;
+
+	for(i = 0; i < nelem(vals); i++)
+		vals[i] = (KV){0, 0, 0};
+	for(i = 0; i < nelem(coll); i++)
+		coll[i] = (KV){0, 0, 0};
+	over = 1;
+	for(i = 0; i < nrecomp; i++){
+		p = vals + (hash(myrecomp[i].key) % nelem(vals));
+		/* walk to the end of the chain, adding a collision entry if needed */
+		while(p->key != 0){
+			if(p->next == 0){
+				if(over > nelem(coll))
+					sysfatal("recomposition collision table overflow");
+				p->next = over;
+				p = coll + over - 1;
+				over++;
+			} else
+				p = coll + p->next - 1;
+		}
+		p->key = myrecomp[i].key;
+		p->val = myrecomp[i].val;
+	}
+	fprint(2, "recomp map [%d][%d]: %d\n", nelem(vals), over-1, (nelem(vals) + over-1) * (4+2+2));
+	fprint(fd, "static\nuint\t_recompdata[] =\n{\n\t");
+	/* all buckets first, then the over-1 collision entries in use */
+	total = nelem(vals) + over-1;
+	for(i = 0; i < total; i++){
+		if(i < nelem(vals))
+			p = vals + i;
+		else
+			p = coll + (i - nelem(vals));
+		assert(p->val < 0xFFFF);
+		assert(p->next < 0xFFFF);
+		fprint(fd, "%udU,%udU,", p->key, p->val | (p->next<<16));
+		if((i+1) % 8 == 0)
+			fprint(fd, "\n\t");
+	}
+	fprint(fd, "\n};\n");
+	fprint(fd, "static uint *_recompcoll = _recompdata+%d*2;\n", nelem(vals));
+}
+
+/*
+ * Write the generated data files into the current directory:
+ * runetotypedata, runeistypedata, runenormdata and runebreakdata.
+ * The size of each table is reported on standard error.
+ */
+static void
+mktables(void)
+{
+	Param p;
+	int tofd, isfd, normfd, breakfd;
+	int size;
+
+	/* case conversion offsets */
+	tofd = create("runetotypedata", OWRITE, 0664);
+	if(tofd < 0)
+		sysfatal("could not create runetotypedata: %r");
+	param(&p, 10, 7);
+	size = mklkup(tofd, "upper", mytoupper, &p);
+	fprint(2, "%s: %d\n", "upper", size);
+
+	size = mklkup(tofd, "lower", mytolower, &p);
+	fprint(2, "%s: %d\n", "lower", size);
+
+	size = mklkup(tofd, "title", mytotitle, &p);
+	fprint(2, "%s: %d\n", "title", size);
+	close(tofd);
+
+	/* character class bits, plus the enum naming each bit */
+	isfd = create("runeistypedata", OWRITE, 0664);
+	if(isfd < 0)
+		sysfatal("could not create runeistypedata: %r");
+	param(&p, 11, 6);
+	size = mklkup(isfd, "merged", myismerged, &p);
+	fprint(2, "%s: %d\n", "merged", size);
+	fprint(isfd, "static\nenum {\n");
+	fprint(isfd, "\tL%s = %s,\n", "space", "1<<0");
+	fprint(isfd, "\tL%s = %s,\n", "alpha", "1<<1");
+	fprint(isfd, "\tL%s = %s,\n", "digit", "1<<2");
+	fprint(isfd, "\tL%s = %s,\n", "upper", "1<<3");
+	fprint(isfd, "\tL%s = %s,\n", "lower", "1<<4");
+	fprint(isfd, "\tL%s = %s,\n", "title", "1<<5");
+	fprint(isfd, "};\n");
+	close(isfd);
+
+	/* decomposition, combining class and recomposition tables */
+	normfd = create("runenormdata", OWRITE, 0664);
+	if(normfd < 0)
+		sysfatal("could not create runenormdata: %r");
+	param(&p, 10, 7);
+	size = mklkup(normfd, "decomp", mydecomp, &p);
+	fprint(2, "%s: %d\n", "decomp", size);
+
+	param(&p, 9, 7);
+	size = mklkup(normfd, "ccc", myccc, &p);
+	fprint(2, "%s: %d\n", "ccc", size);
+
+	mkexceptarr(normfd, "_decompexceptions", mydespecial, nspecial, 0);
+	mkexceptarr(normfd, "_recompexceptions", recompext, nrecompext, 1);
+	mkrecomp(normfd);
+	close(normfd);
+
+	/* word and grapheme break properties */
+	param(&p, 10, 6);
+	breakfd = create("runebreakdata", OWRITE, 0644);
+	if(breakfd < 0)
+		sysfatal("could not create runebreakdata: %r");
+	size = mklkup(breakfd, "break", mybreak, &p);
+	fprint(2, "%s: %d\n", "break", size);
+	close(breakfd);
+}
+
+enum {
+	FIELD_CODE,
+	FIELD_NAME,
+	FIELD_CATEGORY,
+	FIELD_COMBINING,
+	FIELD_BIDIR,
+	FIELD_DECOMP,
+	FIELD_DECIMAL_DIG,
+	FIELD_DIG,
+	FIELD_NUMERIC_VAL,
+	FIELD_MIRRORED,
+	FIELD_UNICODE_1_NAME,
+	FIELD_COMMENT,
+	FIELD_UPPER,
+	FIELD_LOWER,
+	FIELD_TITLE,
+	NFIELDS,
+};
+
+/*
+ * Read one line from in and split it at ';' into fields.
+ * Returns 0 when no further line can be read and 1 otherwise;
+ * a line without exactly NFIELDS fields is fatal.
+ */
+static int
+getunicodeline(Biobuf *in, char **fields)
+{
+	char *line;
+	int nf;
+
+	line = Brdline(in, '\n');
+	if(line == nil)
+		return 0;
+
+	/* replace the trailing newline with a terminator */
+	line[Blinelen(in)-1] = '\0';
+
+	nf = getfields(line, fields, NFIELDS + 1, 0, ";");
+	if(nf != NFIELDS)
+		sysfatal("bad number of fields");
+
+	return 1;
+}
+
+/* strtoul that exits the program when s does not start with a number. */
+static int
+estrtoul(char *s, int base)
+{
+	char *end;
+	Rune v;
+
+	v = strtoul(s, &end, base);
+	if(end == s)
+		sysfatal("bad code point hex string");
+	return v;
+}
+
+enum {
+	OTHER, 
+	Hebrew_Letter, Newline, Extend, Format,
+	Katakana, ALetter, MidLetter, MidNum,
+	MidNumLet, Numeric, ExtendNumLet, WSegSpace,
+	PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40,
+	L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,
+	EMOJIEX = 0xB0,
+};
+
+/* A property name as searched for in a UCD line, and the value stored for it. */
+struct Brkname{
+	char *pat;
+	int val;
+};
+
+/*
+ * Word break names, tried in order with strstr: a name that
+ * contains another one (ExtendNumLet/Extend, MidNumLet/MidNum)
+ * must come before it.
+ */
+static struct Brkname wordnames[] = {
+	{"ExtendNumLet", ExtendNumLet},
+	{"Hebrew_Letter", Hebrew_Letter},
+	{"Newline", Newline},
+	{"Extend", Extend},
+	{"Format", Format},
+	{"Katakana", Katakana},
+	{"ALetter", ALetter},
+	{"MidLetter", MidLetter},
+	{"MidNumLet", MidNumLet},
+	{"MidNum", MidNum},
+	{"Numeric", Numeric},
+	{"WSegSpace", WSegSpace},
+};
+
+static struct Brkname graphnames[] = {
+	{"; Prepend #", PREPEND},
+	{"; Control #", CONTROL},
+	{"; Extend #", EXTEND},
+	{"; Regional_Indicator #", REGION},
+	{"; SpacingMark #", SPACEMK},
+	{"; L #", L},
+	{"; V #", V},
+	{"; T #", T},
+	{"; LV #", LV},
+	{"; LVT #", LVT},
+};
+
+static struct Brkname emojinames[] = {
+	{"; Extended_Pictographic", EMOJIEX},
+};
+
+/*
+ * Read the next data line of b, skipping empty lines and lines
+ * starting with '#'.  The leading "XXXX" or "XXXX..YYYY" field is
+ * stored in *s and *e.  Returns the text the property name is
+ * searched in, or nil when there are no more lines.
+ */
+static char*
+nextrange(Biobuf *b, int *s, int *e)
+{
+	char *p, *dot;
+
+	while((p = Brdline(b, '\n')) != nil){
+		p[Blinelen(b)-1] = 0;
+		if(p[0] == 0 || p[0] == '#')
+			continue;
+		dot = strstr(p, "..");
+		if(dot != nil){
+			*dot = 0;
+			dot += 2;
+			*s = estrtoul(p, 16);
+			*e = estrtoul(dot, 16);
+		} else {
+			*s = *e = estrtoul(p, 16);
+			dot = p;
+		}
+		/* the range is used to index mybreak[NRUNES] */
+		if(*e >= NRUNES)
+			sysfatal("code-point value too big: %x", *e);
+		return dot;
+	}
+	return nil;
+}
+
+/* Value of the first entry of tab whose name occurs in text, 0 if none does. */
+static int
+brkvalue(struct Brkname *tab, int ntab, char *text)
+{
+	int i;
+
+	for(i = 0; i < ntab; i++)
+		if(strstr(text, tab[i].pat) != nil)
+			return tab[i].val;
+	return 0;
+}
+
+/*
+ * Fill mybreak from the UCD files: the word break property is
+ * stored in the low four bits, then the grapheme break property
+ * and the Extended_Pictographic mark are or'ed into the high four.
+ */
+static void
+markbreak(void)
+{
+	Biobuf *b;
+	char *text;
+	int i, s, e, v;
+
+	b = Bopen("/lib/ucd/WordBreakProperty.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not load word breaks: %r");
+	while((text = nextrange(b, &s, &e)) != nil){
+		v = brkvalue(wordnames, nelem(wordnames), text);
+		for(i = s; i <= e; i++)
+			mybreak[i] = v;
+	}
+	Bterm(b);
+
+	b = Bopen("/lib/ucd/GraphemeBreakProperty.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not load Grapheme breaks: %r");
+	while((text = nextrange(b, &s, &e)) != nil){
+		v = brkvalue(graphnames, nelem(graphnames), text);
+		for(i = s; i <= e; i++)
+			mybreak[i] |= v;
+	}
+	Bterm(b);
+
+	b = Bopen("/lib/ucd/emoji-data.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not load emoji-data: %r");
+	while((text = nextrange(b, &s, &e)) != nil){
+		v = brkvalue(emojinames, nelem(emojinames), text);
+		for(i = s; i <= e; i++)
+			mybreak[i] |= v;
+	}
+	Bterm(b);
+}
+
+/*
+ * Read CompositionExclusions.txt and clear the composed rune of
+ * each listed code point: first in myrecomp, and only if it is not
+ * found there, in recompext.
+ */
+static void
+markexclusions(void)
+{
+	Biobuf *b;
+	char *line;
+	KV *kv;
+	int i, hit;
+	uint x;
+
+	b = Bopen("/lib/ucd/CompositionExclusions.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not load composition exclusions: %r");
+
+	for(;;){
+		line = Brdline(b, '\n');
+		if(line == nil)
+			break;
+		line[Blinelen(b)-1] = 0;
+		if(line[0] == 0 || line[0] == '#')
+			continue;
+		x = estrtoul(line, 16);
+
+		hit = 0;
+		for(kv = myrecomp; kv < myrecomp + nrecomp; kv++){
+			if(kv->val == x){
+				kv->val = 0;
+				hit = 1;
+				break;
+			}
+		}
+		if(hit)
+			continue;
+
+		for(i = 0; i < nrecompext; i++){
+			if(recompext[i*3] == x){
+				recompext[i*3] = 0;
+				break;
+			}
+		}
+	}
+	Bterm(b);
+}
+
+/*
+ * Read /lib/ucd/UnicodeData.txt, fill the class, case, decomposition
+ * and combining class tables, apply the composition exclusions and
+ * break properties, then write the generated data files.
+ */
+void
+main(int, char**)
+{
+	static char myisspace[NRUNES];
+	static char myisalpha[NRUNES];
+	static char myisdigit[NRUNES];
+	static char myisupper[NRUNES];
+	static char myislower[NRUNES];
+	static char myistitle[NRUNES];
+	Biobuf *in;
+	char *fields[NFIELDS + 1], *fields2[NFIELDS + 1];
+	char *p, *d;
+	int i, code, last;
+	int decomp[2], *ip;
+	uchar b;
+
+	in = Bopen("/lib/ucd/UnicodeData.txt", OREAD);
+	if(in == nil)
+		sysfatal("can't open UnicodeData.txt: %r");
+
+	/* -1 marks "no case mapping defined" until the fix-up loop below */
+	for(i = 0; i < NRUNES; i++){
+		mytoupper[i] = -1;
+		mytolower[i] = -1;
+		mytotitle[i] = -1;
+		mydecomp[i] = 0;
+		myccc[i] = 0;
+		mybreak[i] = 0;
+	}
+
+	myisspace['\t'] = 1;
+	myisspace['\n'] = 1;
+	myisspace['\r'] = 1;
+	myisspace['\f'] = 1;
+	myisspace['\v'] = 1;
+	myisspace[0x85] = 1;	/* control char, "next line" */
+	myisspace[0xfeff] = 1;	/* zero-width non-break space */
+
+	last = -1;
+	nspecial = nrecomp = nrecompext =  0;
+	while(getunicodeline(in, fields)){
+		code = estrtoul(fields[FIELD_CODE], 16);
+		if (code >= NRUNES)
+			sysfatal("code-point value too big: %x", code);
+		if(code <= last)
+			sysfatal("bad code sequence: %x then %x", last, code);
+		last = code;
+
+		p = fields[FIELD_CATEGORY];
+		/* a "<..., First>" line and the "<..., Last>" line after it describe a range */
+		if(strstr(fields[FIELD_NAME], ", First>") != nil){
+			if(!getunicodeline(in, fields2))
+				sysfatal("range start at eof");
+			if (strstr(fields2[FIELD_NAME], ", Last>") == nil)
+				sysfatal("range start not followed by range end");
+			last = estrtoul(fields2[FIELD_CODE], 16);
+			if(last <= code)
+				sysfatal("range out of sequence: %x then %x", code, last);
+			/* the range end indexes the NRUNES-sized tables below */
+			if(last >= NRUNES)
+				sysfatal("code-point value too big: %x", last);
+			if(strcmp(p, fields2[FIELD_CATEGORY]) != 0)
+				sysfatal("range with mismatched category");
+		}
+
+		/* only decompositions without a "<tag>" are recorded */
+		d = fields[FIELD_DECOMP];
+		if(strlen(d) > 0 && strstr(d, "<") == nil){
+			decomp[0] = estrtoul(d, 16);
+			d = strstr(d, " ");
+			if(d == nil){
+				/* singleton recompositions are verboden */
+				decomp[1] = 0;
+				if(decomp[0] > 0xFFFF){
+					/* does not fit in 16 bits: store an index into mydespecial */
+					if(nspecial >= nelem(mydespecial)/3)
+						sysfatal("too many decomposition exceptions");
+					ip = mydespecial + nspecial*3;
+					ip[0] = code;
+					ip[1] = decomp[0];
+					ip[2] = 0;
+					mydecomp[code] = (DSTART+nspecial)<<16;
+					nspecial++;
+				} else
+					mydecomp[code] = decomp[0]<<16;
+			} else {
+				d++;
+				decomp[1] = estrtoul(d, 16);
+				if(decomp[0] > 0xFFFF || decomp[1] > 0xFFFF){
+					if(nspecial >= nelem(mydespecial)/3)
+						sysfatal("too many decomposition exceptions");
+					if(nrecompext >= nelem(recompext)/3)
+						sysfatal("too many recomposition exceptions");
+					ip = mydespecial + nspecial*3;
+					ip[0] = code;
+					ip[1] = decomp[0];
+					ip[2] = decomp[1];
+					mydecomp[code] = (DSTART+nspecial)<<16;
+					nspecial++;
+					ip = recompext + nrecompext*3;
+					ip[0] = code;
+					ip[1] = decomp[0];
+					ip[2] = decomp[1];
+					nrecompext++;
+				} else {
+					if(nrecomp >= nelem(myrecomp))
+						sysfatal("too many recompositions");
+					mydecomp[code] = decomp[0]<<16 | decomp[1];
+					myrecomp[nrecomp++] = (KV){decomp[0]<<16 | decomp[1], code, 0};
+				}
+			}
+		}
+
+		for (; code <= last; code++){
+			if(p[0] == 'L')
+				myisalpha[code] = 1;
+			if(p[0] == 'Z')
+				myisspace[code] = 1;
+
+			if(strcmp(p, "Lu") == 0)
+				myisupper[code] = 1;
+			if(strcmp(p, "Ll") == 0)
+				myislower[code] = 1;
+
+			if(strcmp(p, "Lt") == 0)
+				myistitle[code] = 1;
+
+			if(strcmp(p, "Nd") == 0)
+				myisdigit[code] = 1;
+
+			if(fields[FIELD_UPPER][0] != '\0')
+				mytoupper[code] = estrtoul(fields[FIELD_UPPER], 16);
+
+			if(fields[FIELD_LOWER][0] != '\0')
+				mytolower[code] = estrtoul(fields[FIELD_LOWER], 16);
+
+			if(fields[FIELD_TITLE][0] != '\0')
+				mytotitle[code] = estrtoul(fields[FIELD_TITLE], 16);
+
+			myccc[code] = estrtoul(fields[FIELD_COMBINING], 10);
+		}
+	}
+
+	Bterm(in);
+	markexclusions();
+
+	/*
+	 * according to standard, if totitle(x) is not defined in ucd
+	 * but toupper(x) is, then totitle is defined to be toupper(x)
+	 */
+	for(i = 0; i < NRUNES; i++){
+		if(mytotitle[i] == -1
+		&& mytoupper[i] != -1
+		&& !myistitle[i])
+			mytotitle[i] = mytoupper[i];
+	}
+
+	/*
+	 * A couple corrections:
+	 * is*(to*(x)) should be true.
+	 * restore undefined transformations.
+	 * store offset instead of value, makes them sparse.
+	 */
+	for(i = 0; i < NRUNES; i++){
+		if(mytoupper[i] != -1)
+			myisupper[mytoupper[i]] = 1;
+		else
+			mytoupper[i] = i;
+
+		if(mytolower[i] != -1)
+			myislower[mytolower[i]] = 1;
+		else
+			mytolower[i] = i;
+
+		if(mytotitle[i] != -1)
+			myistitle[mytotitle[i]] = 1;
+		else
+			mytotitle[i] = i;
+
+		mytoupper[i] = mytoupper[i] - i;
+		mytolower[i] = mytolower[i] - i;
+		mytotitle[i] = mytotitle[i] - i;
+	}
+
+	/* merge the six class arrays into one bit set per rune */
+	for(i = 0; i < NRUNES; i++){
+		b = 0;
+		if(myisspace[i])
+			b |= 1<<0;
+		if(myisalpha[i])
+			b |= 1<<1;
+		if(myisdigit[i])
+			b |= 1<<2;
+		if(myisupper[i])
+			b |= 1<<3;
+		if(myislower[i])
+			b |= 1<<4;
+		if(myistitle[i])
+			b |= 1<<5;
+
+		myismerged[i] = b;
+	}
+
+	markbreak();
+	mktables();
+	exits(nil);
+}
--- a/sys/src/libc/port/runebreak.c
+++ b/sys/src/libc/port/runebreak.c
@ -0,0 +1,293 @@
+#include <u.h>
+#include <libc.h>
+
+#include "runebreakdata"
+
+enum {
+	OTHER, 
+	Hebrew_Letter, Newline, Extend, Format,
+	Katakana, ALetter, MidLetter, MidNum,
+	MidNumLet, Numeric, ExtendNumLet, WSegSpace,
+	PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40,
+	L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,
+	EMOJIEX = 0xB0,
+
+	ZWJ = 0x200DU,
+	LINETAB = 0xB,
+};
+
+#define IS(x, y) ((x&0xf) == y)
+#define ISG(x, y) ((x&0xf0) == y)
+
+/*
+ * Examine the first two runes of s and decide whether a grapheme
+ * break falls between them.  Returns a pointer to the rune after
+ * the break, or s when the string ends before a break is found.
+ * The tests are tried in order; the first one that applies decides.
+ */
+Rune*
+runegbreak(Rune *s)
+{
+	Rune l, r;
+	uchar lt, rt;
+	Rune *p;
+
+	p = s;
+	/* l is the first rune, r the second; p is left pointing at r */
+	if((l = *p++) == 0)
+		return s;
+	if((r = *p) == 0)
+		return s;
+	lt = breaklkup(l);
+	rt = breaklkup(r);
+	/* CR LF stays together */
+	if(l == '\r' && r == '\n')
+		goto Done;
+	/* otherwise break after and before controls, CR and LF */
+	if(ISG(lt, CONTROL) || l == '\r' || l == '\n')
+		return p;
+	if(ISG(rt, CONTROL) || r == '\r' || r == '\n')
+		return p;
+	/* Hangul syllable sequences */
+	if(ISG(lt, L) && (ISG(rt, L) || ISG(rt, V) || ISG(rt, LV) || ISG(rt, LVT)))
+		goto Done;
+	if((ISG(lt, LV) || ISG(lt, V)) && (ISG(rt, V) || ISG(rt, T)))
+		goto Done;
+	/* NOTE(review): the second operand repeats ISG(rt, T) */
+	if((ISG(lt, LVT) || ISG(lt, T)) && (ISG(rt, T) || ISG(rt, T)))
+		goto Done;
+	if(ISG(rt, SPACEMK) || ISG(lt, PREPEND))
+		goto Done;
+	/* EMOJIEX, then any number of EXTEND, then ZWJ, then EMOJIEX */
+	if(ISG(lt, EMOJIEX) && (ISG(rt, EXTEND) || r == ZWJ)){
+		while(ISG(rt, EXTEND)){
+			p++;
+			if((r = *p) == 0)
+				return s;
+			rt = breaklkup(r);
+		}
+		if(r != ZWJ)
+			return p;
+		p++;
+		if((r = *p) == 0)
+			return s;
+		rt = breaklkup(r);
+		if(ISG(rt, EMOJIEX))
+			goto Done;
+		return p;
+	}
+	if(ISG(rt, EXTEND) || r == ZWJ)
+		goto Done;
+	if(ISG(lt, REGION) && ISG(rt, REGION))
+		goto Done;
+
+	return p;
+
+Done:
+	/* no break before *p: the result is the rune after it, if there is one */
+	if(p[1] == 0)
+		return s;
+	return p + 1;
+}
+
+/*
+ * UTF-8 counterpart of runegbreak: the same tests in the same
+ * order, with runes decoded by chartorune.  Returns a pointer to
+ * the character after the break, or s when the string ends before
+ * a break is found.
+ */
+char*
+utfgbreak(char *s)
+{
+	Rune l, r;
+	uchar lt, rt;
+	char *p;
+
+	p = s;
+	/* l is the first rune, r the second; p is left pointing at r */
+	p += chartorune(&l, p);
+	if(l == 0)
+		return s;
+	chartorune(&r, p);
+	if(r == 0)
+		return s;
+	lt = breaklkup(l);
+	rt = breaklkup(r);
+	/* CR LF stays together */
+	if(l == '\r' && r == '\n')
+		goto Done;
+	/* otherwise break after and before controls, CR and LF */
+	if(ISG(lt, CONTROL) || l == '\r' || l == '\n')
+		return p;
+	if(ISG(rt, CONTROL) || r == '\r' || r == '\n')
+		return p;
+	/* Hangul syllable sequences */
+	if(ISG(lt, L) && (ISG(rt, L) || ISG(rt, V) || ISG(rt, LV) || ISG(rt, LVT)))
+		goto Done;
+	if((ISG(lt, LV) || ISG(lt, V)) && (ISG(rt, V) || ISG(rt, T)))
+		goto Done;
+	/* NOTE(review): the second operand repeats ISG(rt, T) */
+	if((ISG(lt, LVT) || ISG(lt, T)) && (ISG(rt, T) || ISG(rt, T)))
+		goto Done;
+	if(ISG(rt, SPACEMK) || ISG(lt, PREPEND))
+		goto Done;
+	/* EMOJIEX, then any number of EXTEND, then ZWJ, then EMOJIEX */
+	if(ISG(lt, EMOJIEX) && (ISG(rt, EXTEND) || r == ZWJ)){
+		while(ISG(rt, EXTEND)){
+			/* step over the current rune, then decode the one after it */
+			p += chartorune(&r, p);
+			chartorune(&r, p);
+			if(r == 0)
+				return s;
+			rt = breaklkup(r);
+		}
+		if(r != ZWJ)
+			return p;
+
+		p += chartorune(&r, p);
+		chartorune(&r, p);
+		if(r == 0)
+			return s;
+		rt = breaklkup(r);
+		if(ISG(rt, EMOJIEX))
+			goto Done;
+		return p;
+	}
+	if(ISG(rt, EXTEND) || r == ZWJ)
+		goto Done;
+	if(ISG(lt, REGION) && ISG(rt, REGION))
+		goto Done;
+
+	return p;
+
+Done:
+	/* no break before *p: the result is the character after it, if there is one */
+	p += chartorune(&r, p);
+	chartorune(&r, p);
+	if(r == 0)
+		return s;
+	return p;
+}
+
+#define AH(x) (IS(x, ALetter) || IS(x, Hebrew_Letter))
+#define MNLQ(x) (IS(x, MidNumLet) || x == '\'')
+
+/*
+ * Examine the first runes of s and decide whether a word break
+ * falls between the first two.  Returns a pointer to the rune
+ * after the break, or s when the string ends before a break is
+ * found.  The tests are tried in order; the first one that applies
+ * decides.
+ */
+Rune*
+runewbreak(Rune *s)
+{
+	Rune l, r;
+	uchar lt, rt;
+	int mnlq;
+	Rune *p;
+
+	p = s;
+	/* l is the first rune, r the second; p is left pointing at r */
+	if((l = *p++) == 0)
+		return s;
+	if((r = *p) == 0)
+		return s;
+	lt = breaklkup(l);
+	rt = breaklkup(r);
+	/* MidNumLet or apostrophe; the apostrophe is a rune, not a property value */
+	mnlq = IS(rt, MidNumLet) || r == '\'';
+	if(l == '\r' && r == '\n')
+		goto Done;
+	/* break after and before newline characters */
+	if(l == '\r' || l == '\n' || l == LINETAB)
+		return p;
+	if(r == '\r' || r == '\n' || r == LINETAB)
+		return p;
+	if(IS(lt, WSegSpace) && IS(rt, WSegSpace))
+		goto Done;
+	if(IS(rt, Format) || IS(rt, Extend))
+		goto Done;
+	if(AH(lt)){
+		if(AH(rt))
+			goto Done;
+		/* letter, mid-word punctuation, letter: needs the rune after r */
+		if((IS(rt, MidLetter) || mnlq) && p[1] != 0 && AH(breaklkup(p[1])))
+			goto Done;
+		if(IS(lt, Hebrew_Letter) && r == '\'')
+			goto Done;
+		if(IS(lt, Hebrew_Letter) && r == '"' && p[1] != 0 && IS(breaklkup(p[1]), Hebrew_Letter))
+			goto Done;
+		if(IS(rt, Numeric))
+			goto Done;
+	}
+	if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric)))
+		goto Done;
+	/* digit, mid-number punctuation, digit */
+	if(IS(lt, Numeric) && (IS(rt, MidNum) || mnlq) && p[1] != 0 && IS(breaklkup(p[1]), Numeric))
+		goto Done;
+	if(IS(lt, Katakana) && IS(rt, Katakana))
+		goto Done;
+	if(AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet))
+		if(IS(rt, ExtendNumLet))
+			goto Done;
+	if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana)))
+		goto Done;
+	if(ISG(lt, REGION)){
+		if(ISG(rt, REGION))
+			goto Done;
+		if(r != ZWJ)
+			return p;
+		p++;
+		if((r = *p) == 0)
+			return s;
+		rt = breaklkup(r);
+		if(ISG(rt, REGION))
+			goto Done;
+	}
+
+	return p;
+
+Done:
+	/* no break before *p: the result is the rune after it, if there is one */
+	if(p[1] == 0)
+		return s;
+	return p + 1;
+}
+
+/*
+ * UTF-8 counterpart of runewbreak: the same tests in the same
+ * order, with runes decoded by chartorune.  Returns a pointer to
+ * the character after the break, or s when the string ends before
+ * a break is found.
+ */
+char*
+utfwbreak(char *s)
+{
+	Rune l, r;
+	Rune peek;
+	uchar lt, rt;
+	int n, mnlq;
+	char *p;
+
+	p = s;
+	/* l is the first rune, r the second; p is left pointing at r */
+	p += chartorune(&l, p);
+	if(l == 0)
+		return s;
+	n = chartorune(&r, p);
+	if(r == 0)
+		return s;
+	/* only look past r once it is known not to be the terminator */
+	chartorune(&peek, p+n);
+	lt = breaklkup(l);
+	rt = breaklkup(r);
+	/* MidNumLet or apostrophe; the apostrophe is a rune, not a property value */
+	mnlq = IS(rt, MidNumLet) || r == '\'';
+	if(l == '\r' && r == '\n')
+		goto Done;
+	/* break after and before newline characters */
+	if(l == '\r' || l == '\n' || l == LINETAB)
+		return p;
+	if(r == '\r' || r == '\n' || r == LINETAB)
+		return p;
+	if(IS(lt, WSegSpace) && IS(rt, WSegSpace))
+		goto Done;
+	if(IS(rt, Format) || IS(rt, Extend))
+		goto Done;
+	if(AH(lt)){
+		if(AH(rt))
+			goto Done;
+		/* letter, mid-word punctuation, letter */
+		if(IS(rt, MidLetter) || mnlq)
+		if(peek != 0 && AH(breaklkup(peek)))
+			goto Done;
+
+		if(IS(lt, Hebrew_Letter) && r == '\'')
+			goto Done;
+
+		if(IS(lt, Hebrew_Letter) && r == '"')
+		if(peek != 0 && IS(breaklkup(peek), Hebrew_Letter))
+			goto Done;
+
+		if(IS(rt, Numeric))
+			goto Done;
+	}
+	if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric)))
+		goto Done;
+	/* digit, mid-number punctuation, digit */
+	if(IS(lt, Numeric) && (IS(rt, MidNum) || mnlq) && peek != 0 && IS(breaklkup(peek), Numeric))
+		goto Done;
+	if(IS(lt, Katakana) && IS(rt, Katakana))
+		goto Done;
+	if(AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet))
+		if(IS(rt, ExtendNumLet))
+			goto Done;
+	if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana)))
+		goto Done;
+	if(ISG(lt, REGION)){
+		if(ISG(rt, REGION))
+			goto Done;
+		if(r != ZWJ)
+			return p;
+		p += chartorune(&r, p);
+		chartorune(&r, p);
+		if(r == 0)
+			return s;
+		rt = breaklkup(r);
+		if(ISG(rt, REGION))
+			goto Done;
+	}
+
+	return p;
+
+Done:
+	/* no break before *p: the result is the character after it, if there is one */
+	p += chartorune(&r, p);
+	chartorune(&r, p);
+	if(r == 0)
+		return s;
+	return p;
+}
--- a/sys/src/libc/port/runeistype.c
+++ b/sys/src/libc/port/runeistype.c
@ -0,0 +1,40 @@
+#include <u.h>
+#include <libc.h>
+
+#include "runeistypedata"
+
+/* Non-zero when the generated class table marks c as space. */
+int
+isspacerune(Rune c)
+{
+	int flags;
+
+	flags = mergedlkup(c);
+	return (flags & Lspace) != 0;
+}
+
+/* Non-zero when the generated class table marks c as alphabetic. */
+int
+isalpharune(Rune c)
+{
+	int flags;
+
+	flags = mergedlkup(c);
+	return (flags & Lalpha) != 0;
+}
+
+/* Non-zero when the generated class table marks c as a digit. */
+int
+isdigitrune(Rune c)
+{
+	int flags;
+
+	flags = mergedlkup(c);
+	return (flags & Ldigit) != 0;
+}
+
+/* Non-zero when the generated class table marks c as upper case. */
+int
+isupperrune(Rune c)
+{
+	int flags;
+
+	flags = mergedlkup(c);
+	return (flags & Lupper) != 0;
+}
+
+/* Non-zero when the generated class table marks c as lower case. */
+int
+islowerrune(Rune c)
+{
+	int flags;
+
+	flags = mergedlkup(c);
+	return (flags & Llower) != 0;
+}
+
+/* Non-zero when the generated class table marks c as title case. */
+int
+istitlerune(Rune c)
+{
+	int flags;
+
+	flags = mergedlkup(c);
+	return (flags & Ltitle) != 0;
+}
--- a/sys/src/libc/port/runenorm.c
+++ b/sys/src/libc/port/runenorm.c
@ -0,0 +1,334 @@
+#include <u.h>
+#include <libc.h>
+
+#include "runenormdata"
+
+/*
+ * Constants for the arithmetic Hangul syllable composition and
+ * decomposition used below.
+ * Unicode Standard: Section 3.12 Conjoining Jamo Behavior
+ */
+enum {
+	/* first code point of each range used by the Hangul arithmetic */
+	SBase = 0xAC00,
+	LBase = 0x1100,
+	VBase = 0x1161,
+	/*
+	 * TBase itself is never treated as a trailing jamo: the code
+	 * below tests r > TBase, and a trailing index of 0 means the
+	 * syllable has no trailing consonant.
+	 */
+	TBase = 0x11A7,
+
+	/* number of values in each range */
+	LCount = 19,
+	VCount = 21,
+	TCount = 28,
+	/* syllables per leading consonant, and syllables in total */
+	NCount = VCount * TCount,
+	SCount = LCount * NCount,
+
+	/* inclusive upper bound of each range */
+	LLast = LBase + LCount - 1,
+	SLast = SBase + SCount - 1,
+	VLast = VBase + VCount - 1,
+	TLast = TBase + TCount - 1,
+};
+
+/*
+ * Split c into at most two runes, stored in dst.
+ * Hangul syllables are split arithmetically (LVT -> LV + T,
+ * LV -> L + V); everything else goes through decomplkup, whose
+ * result packs the two halves in the high and low 16 bits.
+ * When the low half is zero, the high half is either the single
+ * result rune or, in the range [0xEEEE, 0xF8FF), an index into
+ * _decompexceptions.  dst[1] is 0 when only one rune results.
+ */
+static void
+_runedecomp(Rune dst[2], Rune c)
+{
+	uint v, sidx, tidx;
+
+	if(c < SBase || c > SLast){
+		v = decomplkup(c);
+		if((v & 0xFFFF) == 0){
+			v >>= 16;
+			if(v >= 0xEEEE && v < 0xF8FF)
+				memmove(dst, _decompexceptions[v - 0xEEEE], sizeof(Rune)*2);
+			else {
+				dst[0] = v;
+				dst[1] = 0;
+			}
+			return;
+		}
+		dst[0] = v>>16;
+		dst[1] = v & 0xFFFF;
+		return;
+	}
+
+	/* precomposed Hangul syllable */
+	sidx = c - SBase;
+	tidx = sidx % TCount;
+	if(tidx != 0){
+		/* LVT: peel off the trailing consonant */
+		dst[0] = SBase + (sidx - tidx);
+		dst[1] = TBase + tidx;
+	} else {
+		/* LV: split into leading consonant and vowel */
+		dst[0] = LBase + sidx / NCount;
+		dst[1] = VBase + (sidx % NCount) / TCount;
+	}
+}
+
+/*
+ * Try to combine the pair r[0], r[1] into a single rune.
+ * Returns the combined rune, or 0 when the pair does not combine.
+ * Hangul pairs are combined arithmetically; pairs with a member
+ * above 0xFFFF are searched linearly in _recompexceptions; all
+ * other pairs are looked up in the _recompdata hash table, with
+ * collisions chained through _recompcoll.
+ */
+static Rune
+_runerecomp(Rune r[2])
+{
+	uint key, h, i, link, *ent;
+	Rune first, second;
+
+	first = r[0];
+	second = r[1];
+
+	/* L + V -> LV */
+	if(first >= LBase && first <= LLast){
+		if(second >= VBase && second <= VLast)
+			return SBase + ((first - LBase) * NCount + (second - VBase) * TCount);
+		return 0;
+	}
+
+	/* LV + T -> LVT */
+	if(first >= SBase && first <= SLast && (first - SBase) % TCount == 0){
+		if(second <= TBase || second > TLast)
+			return 0;
+		return first + (second - TBase);
+	}
+
+	/* the hash key only has room for 16 bits per rune */
+	if(first > 0xFFFF || second > 0xFFFF){
+		for(i = 0; i < nelem(_recompexceptions); i++){
+			if(_recompexceptions[i][1] != first)
+				continue;
+			if(_recompexceptions[i][2] == second)
+				return _recompexceptions[i][0];
+		}
+		return 0;
+	}
+
+	key = first<<16 | second;
+	h = key;
+	h ^= h >> 16;
+	h *= 0x21f0aaad;
+	h ^= h >> 15;
+	h *= 0xd35a2d97;
+	h ^= h >> 15;
+
+	/* entry layout: [0] is the key, [1] is (next link)<<16 | result */
+	for(ent = _recompdata + (h%512)*2; ent[0] != key; ent = _recompcoll + (link-1)*2){
+		link = ent[1]>>16;
+		if(link == 0)
+			return 0;
+	}
+	return ent[1] & 0xFFFF;
+}
+
+/*
+ * Put the len runes at a into canonical order: repeatedly swap
+ * adjacent runes A, B while ccc(A) > ccc(B) > 0.  Runes with a
+ * combining class of 0 never move and nothing moves across them.
+ * The exchange is a stable bubble sort, so runes of equal class
+ * keep their relative order.
+ */
+static void
+runecccsort(Rune *a, int len)
+{
+	Rune r;
+	int i, fail;
+	int lccc, rccc;
+
+	do {
+		fail = 0;
+		for(i = 0; i < len - 1; i++){
+			lccc = ccclkup(a[i]);
+			rccc = ccclkup(a[i+1]);
+			/*
+			 * Both halves must be tested explicitly: in C,
+			 * x > y > 0 parses as (x > y) > 0, which drops
+			 * the requirement that the right class be non-zero.
+			 */
+			if(rccc > 0 && lccc > rccc){
+				r = a[i];
+				a[i] = a[i+1];
+				a[i + 1] = r;
+				fail = 1;
+			}
+		}
+	} while(fail);
+}
+
+/*
+ * Decide whether the first n bytes of the UTF string s hold a
+ * complete sequence for normalization.
+ * Returns s when more input is needed: an incomplete rune, or the
+ * bytes run out before a terminating rune is seen.  Otherwise
+ * returns a pointer just past the rune that ended the scan.
+ * For a sequence starting with a Hangul L jamo or syllable the
+ * scan ends at the first rune that is not a V or T jamo; for any
+ * other start it ends at the first following rune whose combining
+ * class is 0.
+ * NOTE(review): the returned pointer is past the terminating rune,
+ * whereas fullrunenorm returns a pointer to it; confirm which of
+ * the two is intended.
+ */
+char*
+fullutfnorm(char *s, int n)
+{
+	Rune r, peek;
+	char *p, *p2;
+
+	p = s;
+	if(fullrune(p, n) == 0)
+		return s;
+
+	p += chartorune(&r, p);
+	n -= (p - s);
+
+	if((r >= LBase && r <= LLast) || (r >= SBase && r <= SLast)){
+		do {
+			if(fullrune(p, n) == 0)
+				return s;
+			p2 = p + chartorune(&peek, p);
+			n -= (p2 - p);
+			p = p2;
+			/* && binds tighter than ||, so the jamo tests need their own parentheses */
+		} while(n > 0 && ((peek >= VBase && peek <= VLast) || (peek > TBase && peek <= TLast)));
+		if(n <= 0)
+			return s;
+		return p;
+	}
+
+	do {
+		if(fullrune(p, n) == 0)
+			return s;
+		p2 = p + chartorune(&peek, p);
+		n -= (p2 - p);
+		p = p2;
+		if(ccclkup(peek) == 0)
+			return p;
+	} while(n > 0);
+
+	return s;
+}
+
+/*
+ * Decide whether the n runes at r hold a complete sequence for
+ * normalization.
+ * Returns r when more input is needed (including n <= 0);
+ * otherwise returns a pointer to the rune that ends the sequence.
+ * For a sequence starting with a Hangul L jamo or syllable that is
+ * the first rune that is not a V or T jamo; for any other start it
+ * is the first following rune whose combining class is 0.
+ */
+Rune*
+fullrunenorm(Rune *r, int n)
+{
+	Rune *e, *p;
+
+	p = r;
+	e = p + n;
+
+	/* nothing to inspect; do not read *p */
+	if(p >= e)
+		return r;
+
+	if((*p >= LBase && *p <= LLast) || (*p >= SBase && *p <= SLast)){
+		p++;
+		/*
+		 * The bounds test must guard both jamo tests; without
+		 * the inner parentheses the T-range test reads *p at
+		 * or beyond e and can keep advancing past the buffer.
+		 */
+		while(p < e && ((*p >= VBase && *p <= VLast) || (*p > TBase && *p <= TLast)))
+			p++;
+
+		if(p >= e)
+			return r;
+		return p;
+	}
+
+	for(; p + 1 < e; p++)
+		if(ccclkup(p[1]) == 0)
+			return p + 1;
+
+	return r;
+}
+
+/*
+ * Shared worker behind runecomp, runedecomp, utfcomp and utfdecomp.
+ * When src is non-nil the input is the rune string src and the
+ * output goes to dst; otherwise the input is the UTF string ssrc
+ * and the output goes to sdst.  Output is always terminated with 0.
+ * max limits what is stored: a rune is written only while dst is
+ * below dst+(max-1), and UTF bytes only while sdst+len is below
+ * sdst+(max-1).
+ * When compose is non-zero, decomposed sequences are recombined
+ * through _runerecomp.
+ * Returns the number of runes (rune mode) or bytes (UTF mode)
+ * produced, counted even when the output buffer was too small to
+ * store them.
+ */
+static int
+runenorm(Rune *dst, Rune *src, char *sdst, char *ssrc, int max, int compose)
+{
+	Rune c, r[2], _stack[32];
+	Rune *p, *stack, *sp, *tp;
+	char *strp, *strstop;
+	Rune *rp, *rrp;
+	Rune *stop;
+	Rune peek;
+	int w, w2, size;
+	int mode;
+
+	/* mode 1: rune input/output; mode 0: UTF input/output */
+	if(src){
+		mode = 1;
+		p = src;
+		stop = dst + (max - 1);
+		strp = "";
+		strstop = nil;
+	} else {
+		mode = 0;
+		p = L"";
+		stop = nil;
+		strp = ssrc;
+		strstop = sdst + (max - 1);
+	}
+
+	/*
+	 * The work area is used from the middle outwards: the
+	 * decomposition of the leading rune grows downward from
+	 * stack-1 (sp), and the runes that follow it are appended
+	 * upward from stack (tp).
+	 */
+	stack = _stack + nelem(_stack)/2;
+	size = 0;
+	w = w2 = 0;
+	while(*strp || *p){
+		if(mode)
+			c = *p;
+		else
+			w = chartorune(&c, strp);
+
+		sp = stack - 1;
+		tp = stack;
+		/*
+		 * Decompose c repeatedly: each second half is pushed
+		 * below the previous one and the first half is
+		 * decomposed again until _runedecomp yields nothing.
+		 */
+		_runedecomp(r, c);
+		while(r[0] != 0){
+			c = r[0];
+			if(r[1] != 0){
+				*sp-- = r[1];
+				if(sp == _stack)
+					break;
+			}
+			_runedecomp(r, c);
+		}
+
+		*sp = c;
+		/* look at the next input rune without consuming it */
+		if(mode)
+			peek = p[1];
+		else
+			w2 = chartorune(&peek, strp+w);
+
+		/*
+		 * After a Hangul L jamo or syllable, absorb following V and T jamo.
+		 * NOTE(review): && binds tighter than ||, so peek != 0
+		 * guards only the V-range test; the T-range test already
+		 * implies peek != 0, so the result is the same.
+		 */
+		if((*sp >= LBase && *sp <= LLast) || (*sp >= SBase && *sp <= SLast)){
+			while(peek != 0 && (peek >= VBase && peek <= VLast) || (peek > TBase && peek <= TLast)){
+				*tp++ = peek;
+				if(mode){
+					p++;
+					peek = p[1];
+				} else {
+					strp += w;
+					w = w2;
+					w2 = chartorune(&peek, strp+w);
+				}
+				if(tp == _stack + nelem(_stack))
+					break;
+			}
+		}
+		/*
+		 * Absorb following runes with a non-zero combining class,
+		 * storing one decomposition step of each (or the rune
+		 * itself when it has none).
+		 */
+		while(peek != 0 && ccclkup(peek) != 0){
+			_runedecomp(r, peek);
+			if(r[1] != 0){
+				/* need room for two runes; leave peek for the next round */
+				if(tp+1 >= _stack + nelem(_stack))
+					break;
+				*tp++ = r[0];
+				*tp++ = r[1];
+			} else if(r[0] != 0)
+				*tp++ = r[0];
+			else
+				*tp++ = peek;
+
+			if(mode){
+				p++;
+				peek = p[1];
+			} else {
+				strp += w;
+				w = w2;
+				w2 = chartorune(&peek, strp+w);
+			}
+			if(tp == _stack + nelem(_stack))
+				break;
+		}
+		/* order the collected runes sp..tp by combining class */
+		runecccsort(sp, tp - sp);
+
+		/*
+		 * Composition: try to combine the leading rune *sp with
+		 * each later rune *rp.  On success the combined rune
+		 * replaces *sp, the runes between are shifted up over
+		 * the consumed slot and sp advances.  On failure the
+		 * following runes with the same combining class as *rp
+		 * are skipped.
+		 */
+		if(compose && ccclkup(*sp) == 0){
+			for(rp = sp + 1; rp < tp; rp++){
+				r[0] = *sp;
+				r[1] = *rp;
+				c = _runerecomp(r);
+				if(c != 0){
+					*sp = c;
+					for(rrp = rp; rrp > sp; rrp--)
+						*rrp = rrp[-1];
+					sp++;
+				} else while(rp + 1 < tp && ccclkup(*rp) == ccclkup(*(rp+1)))
+					rp++;
+			}
+		}
+
+		/* emit sp..tp; size advances even when nothing is stored */
+		for(; sp < tp; sp++){
+			if(mode){
+				if(dst < stop)
+					*dst++ = *sp;
+				size++;
+			} else {
+				/* w2 is reused here as the encoded length of *sp */
+				w2 = runelen(*sp);
+				if(sdst+w2 < strstop)
+					sdst += runetochar(sdst, sp);
+				size += w2;
+			}
+		}
+		/* consume the last rune taken in this round */
+		if(mode)
+			p++;
+		else
+			strp += w;
+	}
+	if(mode)
+		*dst = 0;
+	else
+		*sdst = 0;
+	return size;
+}
+
+/*
+ * Normalize the rune string src into dst with composition enabled.
+ * max is passed to runenorm as the output limit; returns runenorm's
+ * result.
+ */
+int
+runecomp(Rune *dst, Rune *src, int max)
+{
+	return runenorm(dst, src, nil, nil, max, 1);
+}
+
+/*
+ * Normalize the rune string src into dst without composition.
+ * max is passed to runenorm as the output limit; returns runenorm's
+ * result.
+ */
+int
+runedecomp(Rune *dst, Rune *src, int max)
+{
+	return runenorm(dst, src, nil, nil, max, 0);
+}
+
+/*
+ * Normalize the UTF string src into dst with composition enabled.
+ * max is passed to runenorm as the output limit; returns runenorm's
+ * result.
+ */
+int
+utfcomp(char *dst, char *src, int max)
+{
+	return runenorm(nil, nil, dst, src, max, 1);
+}
+
+/*
+ * Normalize the UTF string src into dst without composition.
+ * max is passed to runenorm as the output limit; returns runenorm's
+ * result.
+ */
+int
+utfdecomp(char *dst, char *src, int max)
+{
+	return runenorm(nil, nil, dst, src, max, 0);	
+}
--- a/sys/src/libc/port/runetotype.c
+++ b/sys/src/libc/port/runetotype.c
@ -0,0 +1,22 @@
+#include <u.h>
+#include <libc.h>
+
+#include "runetotypedata"
+
+/* Return c adjusted by the offset upperlkup gives for it. */
+Rune
+toupperrune(Rune c)
+{
+	c += upperlkup(c);
+	return c;
+}
+
+/* Return c adjusted by the offset lowerlkup gives for it. */
+Rune
+tolowerrune(Rune c)
+{
+	c += lowerlkup(c);
+	return c;
+}
+
+/* Return c adjusted by the offset titlelkup gives for it. */
+Rune
+totitlerune(Rune c)
+{
+	c += titlelkup(c);
+	return c;
+}
--- a/sys/src/libc/test/mkfile
+++ b/sys/src/libc/test/mkfile
@ -3,6 +3,14 @@
 TEST=\
 	date\
 	pow\
+	runebreak\
+	runenorm\
 	strchr\

 </sys/src/cmd/mktest
+
+# Build a missing Unicode data file by running mk for it in /lib/ucd.
+/lib/ucd/%:
+	cd /lib/ucd && mk $stem
+
+# Data files the test programs open at run time.
+runebreak.test:	/lib/ucd/GraphemeBreakTest.txt /lib/ucd/WordBreakTest.txt
+runenorm.test: /lib/ucd/NormalizationTest.txt
--- a/sys/src/libc/test/runebreak.c
+++ b/sys/src/libc/test/runebreak.c
@ -0,0 +1,112 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+
+/*
+ * Parse the hexadecimal code point at the start of s.
+ * Exits through sysfatal when no digits could be converted.
+ */
+static int
+estrtoul(char *s)
+{
+	char *end;
+	Rune value;
+
+	value = strtoul(s, &end, 16);
+	if(end == s)
+		sysfatal("bad code point hex string");
+	return value;
+}
+
+/*
+ * Run the rune break function fn on r and the UTF break function
+ * fn2 on the UTF encoding of r, and print a diagnostic when the
+ * two disagree about the segment they cut off.
+ * Returns fn's result.
+ */
+static Rune*
+check(Rune *r, Rune* (*fn)(Rune*), char* (*fn2)(char*))
+{
+	Rune *r2, *tmp;
+	char *p, *p2;
+
+	p = smprint("%S", r);
+	if(p == nil)
+		sysfatal("smprint: %r");
+	r2 = fn(r);
+	p2 = fn2(p);
+
+	/* convert the UTF segment back to runes for comparison */
+	tmp = runesmprint("%.*s", (int)(p2-p), p);
+	if(tmp == nil)
+		sysfatal("runesmprint: %r");
+
+	/*
+	 * The segments must have the same length and the same runes.
+	 * memcmp takes a byte count, so scale the rune count; the
+	 * length test also keeps memcmp inside tmp.
+	 */
+	if(runestrlen(tmp) != r2-r || memcmp(r, tmp, (r2-r)*sizeof(Rune)) != 0)
+		print("utf mismatch\n");
+
+	free(p);
+	free(tmp);
+	return r2;
+}
+
+/*
+ * Run every test line of a Unicode break test file.
+ * Each line alternates break markers (÷ break, × no break) with
+ * hexadecimal code points; text after '#' is ignored.  The code
+ * points are collected in stack and the markers in ops, then fn
+ * (checked against fn2 via check) is applied repeatedly and its
+ * result compared with the markers.  Failures are printed, not fatal.
+ */
+static void
+run(char *file, Rune* (*fn)(Rune*), char* (*fn2)(char*))
+{
+	Biobuf *b;
+	char *p, *dot;
+	char *pieces[16];
+	int i, j, n;
+	Rune stack[16], ops[16];
+	int nstack, nops;
+	Rune r, *rp, *rp2;
+	char *line;
+
+	b = Bopen(file, OREAD);
+	if(b == nil)
+		sysfatal("could not open %s: %r", file);
+
+	/* line is a pristine copy for diagnostics; it is freed on every iteration */
+	for(;(p = Brdline(b, '\n')) != nil; free(line)){
+		p[Blinelen(b)-1] = 0;
+		line = strdup(p);
+		if(p[0] == 0 || p[0] == '#')
+			continue;
+		if((dot = strstr(p, "#")) != nil)
+			*dot = 0;
+		n = getfields(p, pieces, nelem(pieces), 0, " ");
+		nstack = nops = 0;
+		for(i = 0; i < n; i++){
+			chartorune(&r, pieces[i]);
+			if(r != L'÷' && r != L'×'){
+				r = estrtoul(pieces[i]);
+				stack[nstack++] = r;
+				stack[nstack] = 0;
+			} else {
+				ops[nops++] = r;
+				ops[nops] = 0;
+			}
+		}
+
+		/* ops[0] and the last marker delimit the text; check those in between */
+		rp = stack;
+		for(i = 1; i < nops-1;){
+			rp2 = check(rp, fn, fn2);
+			switch(ops[i]){
+			case L'÷':
+				if(rp2 != rp+1){
+					print("break fail %X %X || %s\n", rp[0], rp[1], line);
+					goto Break;
+				}
+				rp++;
+				i++;
+				break;
+			case L'×':
+				/* no progress: every remaining marker must be a no-break */
+				if(rp2 - rp == 0){
+					for(j = i; j < nops - 1; j++)
+						if(ops[j] !=  L'×')
+							print("skipped %d %d %s\n", i, nops, line);
+					goto Break;
+				}
+				for(; rp < (rp2-1); rp++, i++){
+					if(ops[i] != L'×')
+						print("skipped %d %d %s\n", i, nops, line);
+				}
+				rp = rp2;
+				i++;
+				break;
+			}
+		}
+Break:
+		;
+	}
+	/* release the buffer and its file descriptor */
+	Bterm(b);
+}
+
+/* Run the grapheme and word break test suites; the arguments are unused. */
+void
+main(int, char**)
+{
+	run("/lib/ucd/GraphemeBreakTest.txt", runegbreak, utfgbreak);
+	run("/lib/ucd/WordBreakTest.txt", runewbreak, utfwbreak);
+	exits(nil);
+}
--- a/sys/src/libc/test/runenorm.c
+++ b/sys/src/libc/test/runenorm.c
@ -0,0 +1,92 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+
+/*
+ * Convert the leading hexadecimal digits of s to a code point.
+ * Calls sysfatal when s does not start with a hexadecimal number.
+ */
+static int
+estrtoul(char *s)
+{
+	char *rest;
+	Rune cp;
+
+	cp = strtoul(s, &rest, 16);
+	if(rest == s)
+		sysfatal("bad code point hex string");
+	return cp;
+}
+
+/*
+ * Parse the space-separated hexadecimal code points in s into the
+ * 0-terminated rune string dst, which has room for ndst runes
+ * including the terminator.  Returns the number of code points.
+ */
+static int
+parserunes(Rune *dst, int ndst, char *s)
+{
+	char *runes[32];
+	int i, n;
+
+	n = getfields(s, runes, nelem(runes), 0, " ");
+	if(n >= ndst)
+		sysfatal("too many code points in test field");
+	for(i = 0; i < n; i++)
+		dst[i] = estrtoul(runes[i]);
+	dst[i] = 0;
+	return n;
+}
+
+/*
+ * Check runecomp/runedecomp and utfcomp/utfdecomp against every
+ * line of NormalizationTest.txt.  The first three ';'-separated
+ * fields of a line are used as the source, the expected composed
+ * form and the expected decomposed form.  Mismatches are printed
+ * with a bit mask in fail (bit 0 runecomp, bit 1 runedecomp,
+ * bit 2 utfcomp, bit 3 utfdecomp); wrong lengths trip an assert.
+ */
+void
+main(int, char**)
+{
+	Rune buffer1[64];
+	Rune buffer2[64];
+	char utfbuff1[128];
+	char utfbuff2[128];
+	char srctmp[128], tmp1[128], tmp2[128];
+	char *fields[10];
+	char *p;
+	int n, n2;
+	int i;
+	uint fail;
+	Biobuf *b;
+
+	b = Bopen("/lib/ucd/NormalizationTest.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not open /lib/ucd/NormalizationTest.txt: %r");
+
+	struct {
+		Rune src[32];
+		Rune nfc[32];
+		Rune nfd[32];
+	} test;
+	while((p = Brdline(b, '\n')) != nil){
+		p[Blinelen(b)-1] = 0;
+		if(p[0] == 0 || p[0] == '#' || p[0] == '@')
+			continue;
+		/* fields[1] and fields[2] are only valid when at least three fields exist */
+		if(getfields(p, fields, 6 + 1, 0, ";") < 3)
+			sysfatal("malformed test line");
+		parserunes(test.src, nelem(test.src), fields[0]);
+		parserunes(test.nfc, nelem(test.nfc), fields[1]);
+		i = parserunes(test.nfd, nelem(test.nfd), fields[2]);
+
+		n = runecomp(buffer1, test.src, nelem(buffer1));
+		n2 = runedecomp(buffer2, test.src, nelem(buffer2));
+		fail = 0;
+
+		if(runestrcmp(buffer1, test.nfc) != 0)
+			fail |= 1<<0;
+		if(runestrcmp(buffer2, test.nfd) != 0)
+			fail |= 1<<1;
+		if(fail)
+			print("%d %d %S %S %S %S %S\n", fail, i, test.src, test.nfd, test.nfc, buffer2, buffer1);
+		assert(n == runestrlen(test.nfc));
+		assert(n2 == runestrlen(test.nfd));
+
+		/* repeat the comparison through the UTF entry points */
+		snprint(srctmp, sizeof srctmp, "%S", test.src);
+		snprint(tmp1, sizeof tmp1, "%S", test.nfc);
+		snprint(tmp2, sizeof tmp2, "%S", test.nfd);
+
+		n = utfcomp(utfbuff1, srctmp, nelem(utfbuff1));
+		n2 = utfdecomp(utfbuff2, srctmp, nelem(utfbuff2));
+
+		if(strcmp(utfbuff1, tmp1) != 0)
+			fail |= 1<<2;
+		if(strcmp(utfbuff2, tmp2) != 0)
+			fail |= 1<<3;
+		if(fail)
+			print("%d %d %s %s %s %s %s\n", fail, i, srctmp, tmp2, tmp1, utfbuff2, utfbuff1);
+		assert(n == strlen(tmp1));
+		assert(n2 == strlen(tmp2));
+	}
+	Bterm(b);
+	exits(nil);
+}