convert to 4-byte UTF-8 and 32-bit Rune

http://codereview.appspot.com/116075
This commit is contained in:
Russ Cox 2009-09-11 17:03:06 -04:00
parent 4dbf255619
commit 0cadb4301d
14 changed files with 86 additions and 44 deletions

View file

@ -30,7 +30,7 @@ struct Fmt{
void *farg; /* to make flush a closure */ void *farg; /* to make flush a closure */
int nfmt; /* num chars formatted so far */ int nfmt; /* num chars formatted so far */
va_list args; /* args passed to dofmt */ va_list args; /* args passed to dofmt */
int r; /* % format Rune */ Rune r; /* % format Rune */
int width; int width;
int prec; int prec;
unsigned long flags; unsigned long flags;

View file

@ -4,14 +4,15 @@
extern "C" { extern "C" {
#endif #endif
typedef unsigned short Rune; /* 16 bits */ typedef unsigned int Rune; /* 32 bits */
enum enum
{ {
UTFmax = 3, /* maximum bytes per rune */ UTFmax = 4, /* maximum bytes per rune */
Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
Runeself = 0x80, /* rune and UTF sequences are the same (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */
Runeerror = 0xFFFD /* decoding error in UTF */ Runeerror = 0xFFFD, /* decoding error in UTF */
Runemax = 0x10FFFF /* maximum rune value */
}; };
/* Edit .+1,/^$/ | cfn $PLAN9/src/lib9/utf/?*.c | grep -v static |grep -v __ */ /* Edit .+1,/^$/ | cfn $PLAN9/src/lib9/utf/?*.c | grep -v static |grep -v __ */

View file

@ -193,7 +193,7 @@ winctl(void *arg)
Rune *rp, *bp, *up, *kbdr; Rune *rp, *bp, *up, *kbdr;
uint qh; uint qh;
int nr, nb, c, wid, i, npart, initial, lastb; int nr, nb, c, wid, i, npart, initial, lastb;
char *s, *t, part[3]; char *s, *t, part[UTFmax];
Window *w; Window *w;
Mousestate *mp, m; Mousestate *mp, m;
enum { WKey, WMouse, WMouseread, WCtl, WCwrite, WCread, WWread, NWALT }; enum { WKey, WMouse, WMouseread, WCtl, WCwrite, WCread, WWread, NWALT };

View file

@ -488,7 +488,7 @@ bldcclass(void)
exprp++; /* eat '-' */ exprp++; /* eat '-' */
if((c2 = nextrec()) == ']') if((c2 = nextrec()) == ']')
goto Error; goto Error;
classp[n+0] = 0xFFFF; classp[n+0] = Runemax;
classp[n+1] = c1; classp[n+1] = c1;
classp[n+2] = c2; classp[n+2] = c2;
n += 3; n += 3;
@ -510,7 +510,7 @@ classmatch(int classno, int c, int negate)
p = class[classno]; p = class[classno];
while(*p){ while(*p){
if(*p == 0xFFFF){ if(*p == Runemax){
if(p[1]<=c && c<=p[2]) if(p[1]<=c && c<=p[2])
return !negate; return !negate;
p += 3; p += 3;

View file

@ -71,7 +71,7 @@ int
inputc(void) inputc(void)
{ {
int n, nbuf; int n, nbuf;
char buf[3]; char buf[UTFmax];
Rune r; Rune r;
Again: Again:

View file

@ -494,7 +494,7 @@ bldcclass(void)
exprp++; /* eat '-' */ exprp++; /* eat '-' */
if((c2 = nextrec()) == ']') if((c2 = nextrec()) == ']')
goto Error; goto Error;
classp[n+0] = 0xFFFF; classp[n+0] = Runemax;
classp[n+1] = c1; classp[n+1] = c1;
classp[n+2] = c2; classp[n+2] = c2;
n += 3; n += 3;
@ -516,7 +516,7 @@ classmatch(int classno, int c, int negate)
p = class[classno]; p = class[classno];
while(*p){ while(*p){
if(*p == 0xFFFF){ if(*p == Runemax){
if(p[1]<=c && c<=p[2]) if(p[1]<=c && c<=p[2])
return !negate; return !negate;
p += 3; p += 3;

View file

@ -615,7 +615,7 @@ compsub(Rune *rhs, Rune *end)
while ((r = *cp++) != '\0') { while ((r = *cp++) != '\0') {
if(r == '\\') { if(r == '\\') {
if (rhs < end) if (rhs < end)
*rhs++ = 0xFFFF; *rhs++ = Runemax;
else else
return 0; return 0;
r = *cp++; r = *cp++;
@ -1050,7 +1050,7 @@ dosub(Rune *rhsbuf)
sp = place(sp, loc1, loc2); sp = place(sp, loc1, loc2);
continue; continue;
} }
if (c == 0xFFFF && (c = *rp++) >= '1' && c < MAXSUB+'0') { if (c == Runemax && (c = *rp++) >= '1' && c < MAXSUB+'0') {
n = c-'0'; n = c-'0';
if (subexp[n].s.rsp && subexp[n].e.rep) { if (subexp[n].s.rsp && subexp[n].e.rep) {
sp = place(sp, subexp[n].s.rsp, subexp[n].e.rep); sp = place(sp, subexp[n].s.rsp, subexp[n].e.rep);

View file

@ -15,7 +15,7 @@ uchar bits[] = { 1, 2, 4, 8, 16, 32, 64, 128 };
#define CLEARBIT(a,c) ((a)[(c)/8] &= ~bits[(c)&07]) #define CLEARBIT(a,c) ((a)[(c)/8] &= ~bits[(c)&07])
#define BITSET(a,c) ((a)[(c)/8] & bits[(c)&07]) #define BITSET(a,c) ((a)[(c)/8] & bits[(c)&07])
#define MAXRUNE 0xFFFF #define MAXRUNE Runemax
uchar f[(MAXRUNE+1)/8]; uchar f[(MAXRUNE+1)/8];
uchar t[(MAXRUNE+1)/8]; uchar t[(MAXRUNE+1)/8];

View file

@ -152,9 +152,9 @@ wcstombs(char *s, const wchar_t *pwcs, size_t n)
if(p+d <= pe+3) { if(p+d <= pe+3) {
*p++ = buf[0]; *p++ = buf[0];
if(d > 1) { if(d > 1) {
*p++ = buf[2]; *p++ = buf[1];
if(d > 2) if(d > 2)
*p++ = buf[3]; *p++ = buf[2];
} }
} }
if(c == 0) if(c == 0)

View file

@ -51,13 +51,13 @@ range(char *argv[])
return "bad range"; return "bad range";
} }
min = strtoul(q, &q, 16); min = strtoul(q, &q, 16);
if(min<0 || min>0xFFFF || *q!='-') if(min<0 || min>Runemax || *q!='-')
goto err; goto err;
q++; q++;
if(strchr(hex, *q) == 0) if(strchr(hex, *q) == 0)
goto err; goto err;
max = strtoul(q, &q, 16); max = strtoul(q, &q, 16);
if(max<0 || max>0xFFFF || max<min || *q!=0) if(max<0 || max>Runemax || max<min || *q!=0)
goto err; goto err;
i = 0; i = 0;
do{ do{
@ -111,7 +111,7 @@ chars(char *argv[])
return "bad char"; return "bad char";
} }
m = strtoul(q, &q, 16); m = strtoul(q, &q, 16);
if(m<0 || m>0xFFFF || *q!=0) if(m<0 || m>Runemax || *q!=0)
goto err; goto err;
Bprint(&bout, "%C", m); Bprint(&bout, "%C", m);
if(!text) if(!text)

View file

@ -605,12 +605,13 @@ __flagfmt(Fmt *f)
int int
__badfmt(Fmt *f) __badfmt(Fmt *f)
{ {
char x[3]; char x[2+UTFmax];
int n;
x[0] = '%'; x[0] = '%';
x[1] = f->r; n = 1 + runetochar(x+1, &f->r);
x[2] = '%'; x[n++] = '%';
f->prec = 3; f->prec = n;
__fmtcpy(f, (const void*)x, 3, 3); __fmtcpy(f, (const void*)x, n, n);
return 0; return 0;
} }

View file

@ -23,16 +23,19 @@ enum
Bit2 = 5, Bit2 = 5,
Bit3 = 4, Bit3 = 4,
Bit4 = 3, Bit4 = 3,
Bit5 = 2,
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */
Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */
Maskx = (1<<Bitx)-1, /* 0011 1111 */ Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */ Testx = Maskx ^ 0xFF, /* 1100 0000 */
@ -43,7 +46,7 @@ enum
int int
chartorune(Rune *rune, char *str) chartorune(Rune *rune, char *str)
{ {
int c, c1, c2; int c, c1, c2, c3;
long l; long l;
/* /*
@ -88,6 +91,25 @@ chartorune(Rune *rune, char *str)
return 3; return 3;
} }
/*
* four character sequence
* 10000-10FFFF => T4 Tx Tx Tx
*/
if(UTFmax >= 4) {
c3 = *(uchar*)(str+3) ^ Tx;
if(c3 & Testx)
goto bad;
if(c < T5) {
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
if(l <= Rune3)
goto bad;
if(l > Runemax)
goto bad;
*rune = l;
return 4;
}
}
/* /*
* bad decoding * bad decoding
*/ */
@ -113,7 +135,7 @@ runetochar(char *str, Rune *rune)
/* /*
* two character sequence * two character sequence
* 0080-07FF => T2 Tx * 00080-007FF => T2 Tx
*/ */
if(c <= Rune2) { if(c <= Rune2) {
str[0] = T2 | (c >> 1*Bitx); str[0] = T2 | (c >> 1*Bitx);
@ -123,14 +145,28 @@ runetochar(char *str, Rune *rune)
/* /*
* three character sequence * three character sequence
* 0800-FFFF => T3 Tx Tx * 00800-0FFFF => T3 Tx Tx
*/ */
if(c > Runemax)
c = Runeerror;
if(c <= Rune3) {
str[0] = T3 | (c >> 2*Bitx); str[0] = T3 | (c >> 2*Bitx);
str[1] = Tx | ((c >> 1*Bitx) & Maskx); str[1] = Tx | ((c >> 1*Bitx) & Maskx);
str[2] = Tx | (c & Maskx); str[2] = Tx | (c & Maskx);
return 3; return 3;
} }
/*
* four character sequence
* 010000-10FFFF => T4 Tx Tx Tx
*/
str[0] = T4 | (c >> 3*Bitx);
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
str[3] = Tx | (c & Maskx);
return 4;
}
int int
runelen(long c) runelen(long c)
{ {
@ -155,7 +191,10 @@ runenlen(Rune *r, int nrune)
if(c <= Rune2) if(c <= Rune2)
nb += 2; nb += 2;
else else
if(c <= Rune3 || c > Runemax)
nb += 3; nb += 3;
else
nb += 4;
} }
return nb; return nb;
} }
@ -165,13 +204,14 @@ fullrune(char *str, int n)
{ {
int c; int c;
if(n > 0) { if(n <= 0)
return 0;
c = *(uchar*)str; c = *(uchar*)str;
if(c < Tx) if(c < Tx)
return 1; return 1;
if(n > 1) if(c < T3)
if(c < T3 || n > 2) return n >= 2;
return 1; if(UTFmax == 3 || c < T4)
} return n >= 3;
return 0; return n >= 4;
} }

View file

@ -7,7 +7,7 @@ Bgetrune(Biobuf *bp)
{ {
int c, i; int c, i;
Rune rune; Rune rune;
char str[4]; char str[UTFmax];
c = Bgetc(bp); c = Bgetc(bp);
if(c < Runeself) { /* one char */ if(c < Runeself) { /* one char */

View file

@ -6,7 +6,7 @@ int
Bputrune(Biobuf *bp, long c) Bputrune(Biobuf *bp, long c)
{ {
Rune rune; Rune rune;
char str[4]; char str[UTFmax];
int n; int n;
rune = c; rune = c;