mirror of
https://github.com/9fans/plan9port.git
synced 2025-01-12 11:10:07 +00:00
convert to 4-byte UTF-8 and 32-bit Rune
http://codereview.appspot.com/116075
This commit is contained in:
parent
4dbf255619
commit
0cadb4301d
14 changed files with 86 additions and 44 deletions
|
@ -30,7 +30,7 @@ struct Fmt{
|
||||||
void *farg; /* to make flush a closure */
|
void *farg; /* to make flush a closure */
|
||||||
int nfmt; /* num chars formatted so far */
|
int nfmt; /* num chars formatted so far */
|
||||||
va_list args; /* args passed to dofmt */
|
va_list args; /* args passed to dofmt */
|
||||||
int r; /* % format Rune */
|
Rune r; /* % format Rune */
|
||||||
int width;
|
int width;
|
||||||
int prec;
|
int prec;
|
||||||
unsigned long flags;
|
unsigned long flags;
|
||||||
|
|
|
@ -4,14 +4,15 @@
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef unsigned short Rune; /* 16 bits */
|
typedef unsigned int Rune; /* 32 bits */
|
||||||
|
|
||||||
enum
|
enum
|
||||||
{
|
{
|
||||||
UTFmax = 3, /* maximum bytes per rune */
|
UTFmax = 4, /* maximum bytes per rune */
|
||||||
Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
|
Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
|
||||||
Runeself = 0x80, /* rune and UTF sequences are the same (<) */
|
Runeself = 0x80, /* rune and UTF sequences are the same (<) */
|
||||||
Runeerror = 0xFFFD /* decoding error in UTF */
|
Runeerror = 0xFFFD, /* decoding error in UTF */
|
||||||
|
Runemax = 0x10FFFF /* maximum rune value */
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Edit .+1,/^$/ | cfn $PLAN9/src/lib9/utf/?*.c | grep -v static |grep -v __ */
|
/* Edit .+1,/^$/ | cfn $PLAN9/src/lib9/utf/?*.c | grep -v static |grep -v __ */
|
||||||
|
|
|
@ -193,7 +193,7 @@ winctl(void *arg)
|
||||||
Rune *rp, *bp, *up, *kbdr;
|
Rune *rp, *bp, *up, *kbdr;
|
||||||
uint qh;
|
uint qh;
|
||||||
int nr, nb, c, wid, i, npart, initial, lastb;
|
int nr, nb, c, wid, i, npart, initial, lastb;
|
||||||
char *s, *t, part[3];
|
char *s, *t, part[UTFmax];
|
||||||
Window *w;
|
Window *w;
|
||||||
Mousestate *mp, m;
|
Mousestate *mp, m;
|
||||||
enum { WKey, WMouse, WMouseread, WCtl, WCwrite, WCread, WWread, NWALT };
|
enum { WKey, WMouse, WMouseread, WCtl, WCwrite, WCread, WWread, NWALT };
|
||||||
|
|
|
@ -488,7 +488,7 @@ bldcclass(void)
|
||||||
exprp++; /* eat '-' */
|
exprp++; /* eat '-' */
|
||||||
if((c2 = nextrec()) == ']')
|
if((c2 = nextrec()) == ']')
|
||||||
goto Error;
|
goto Error;
|
||||||
classp[n+0] = 0xFFFF;
|
classp[n+0] = Runemax;
|
||||||
classp[n+1] = c1;
|
classp[n+1] = c1;
|
||||||
classp[n+2] = c2;
|
classp[n+2] = c2;
|
||||||
n += 3;
|
n += 3;
|
||||||
|
@ -510,7 +510,7 @@ classmatch(int classno, int c, int negate)
|
||||||
|
|
||||||
p = class[classno];
|
p = class[classno];
|
||||||
while(*p){
|
while(*p){
|
||||||
if(*p == 0xFFFF){
|
if(*p == Runemax){
|
||||||
if(p[1]<=c && c<=p[2])
|
if(p[1]<=c && c<=p[2])
|
||||||
return !negate;
|
return !negate;
|
||||||
p += 3;
|
p += 3;
|
||||||
|
|
|
@ -71,7 +71,7 @@ int
|
||||||
inputc(void)
|
inputc(void)
|
||||||
{
|
{
|
||||||
int n, nbuf;
|
int n, nbuf;
|
||||||
char buf[3];
|
char buf[UTFmax];
|
||||||
Rune r;
|
Rune r;
|
||||||
|
|
||||||
Again:
|
Again:
|
||||||
|
|
|
@ -494,7 +494,7 @@ bldcclass(void)
|
||||||
exprp++; /* eat '-' */
|
exprp++; /* eat '-' */
|
||||||
if((c2 = nextrec()) == ']')
|
if((c2 = nextrec()) == ']')
|
||||||
goto Error;
|
goto Error;
|
||||||
classp[n+0] = 0xFFFF;
|
classp[n+0] = Runemax;
|
||||||
classp[n+1] = c1;
|
classp[n+1] = c1;
|
||||||
classp[n+2] = c2;
|
classp[n+2] = c2;
|
||||||
n += 3;
|
n += 3;
|
||||||
|
@ -516,7 +516,7 @@ classmatch(int classno, int c, int negate)
|
||||||
|
|
||||||
p = class[classno];
|
p = class[classno];
|
||||||
while(*p){
|
while(*p){
|
||||||
if(*p == 0xFFFF){
|
if(*p == Runemax){
|
||||||
if(p[1]<=c && c<=p[2])
|
if(p[1]<=c && c<=p[2])
|
||||||
return !negate;
|
return !negate;
|
||||||
p += 3;
|
p += 3;
|
||||||
|
|
|
@ -615,7 +615,7 @@ compsub(Rune *rhs, Rune *end)
|
||||||
while ((r = *cp++) != '\0') {
|
while ((r = *cp++) != '\0') {
|
||||||
if(r == '\\') {
|
if(r == '\\') {
|
||||||
if (rhs < end)
|
if (rhs < end)
|
||||||
*rhs++ = 0xFFFF;
|
*rhs++ = Runemax;
|
||||||
else
|
else
|
||||||
return 0;
|
return 0;
|
||||||
r = *cp++;
|
r = *cp++;
|
||||||
|
@ -1050,7 +1050,7 @@ dosub(Rune *rhsbuf)
|
||||||
sp = place(sp, loc1, loc2);
|
sp = place(sp, loc1, loc2);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (c == 0xFFFF && (c = *rp++) >= '1' && c < MAXSUB+'0') {
|
if (c == Runemax && (c = *rp++) >= '1' && c < MAXSUB+'0') {
|
||||||
n = c-'0';
|
n = c-'0';
|
||||||
if (subexp[n].s.rsp && subexp[n].e.rep) {
|
if (subexp[n].s.rsp && subexp[n].e.rep) {
|
||||||
sp = place(sp, subexp[n].s.rsp, subexp[n].e.rep);
|
sp = place(sp, subexp[n].s.rsp, subexp[n].e.rep);
|
||||||
|
|
|
@ -15,7 +15,7 @@ uchar bits[] = { 1, 2, 4, 8, 16, 32, 64, 128 };
|
||||||
#define CLEARBIT(a,c) ((a)[(c)/8] &= ~bits[(c)&07])
|
#define CLEARBIT(a,c) ((a)[(c)/8] &= ~bits[(c)&07])
|
||||||
#define BITSET(a,c) ((a)[(c)/8] & bits[(c)&07])
|
#define BITSET(a,c) ((a)[(c)/8] & bits[(c)&07])
|
||||||
|
|
||||||
#define MAXRUNE 0xFFFF
|
#define MAXRUNE Runemax
|
||||||
|
|
||||||
uchar f[(MAXRUNE+1)/8];
|
uchar f[(MAXRUNE+1)/8];
|
||||||
uchar t[(MAXRUNE+1)/8];
|
uchar t[(MAXRUNE+1)/8];
|
||||||
|
|
|
@ -152,9 +152,9 @@ wcstombs(char *s, const wchar_t *pwcs, size_t n)
|
||||||
if(p+d <= pe+3) {
|
if(p+d <= pe+3) {
|
||||||
*p++ = buf[0];
|
*p++ = buf[0];
|
||||||
if(d > 1) {
|
if(d > 1) {
|
||||||
*p++ = buf[2];
|
*p++ = buf[1];
|
||||||
if(d > 2)
|
if(d > 2)
|
||||||
*p++ = buf[3];
|
*p++ = buf[2];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if(c == 0)
|
if(c == 0)
|
||||||
|
|
|
@ -51,13 +51,13 @@ range(char *argv[])
|
||||||
return "bad range";
|
return "bad range";
|
||||||
}
|
}
|
||||||
min = strtoul(q, &q, 16);
|
min = strtoul(q, &q, 16);
|
||||||
if(min<0 || min>0xFFFF || *q!='-')
|
if(min<0 || min>Runemax || *q!='-')
|
||||||
goto err;
|
goto err;
|
||||||
q++;
|
q++;
|
||||||
if(strchr(hex, *q) == 0)
|
if(strchr(hex, *q) == 0)
|
||||||
goto err;
|
goto err;
|
||||||
max = strtoul(q, &q, 16);
|
max = strtoul(q, &q, 16);
|
||||||
if(max<0 || max>0xFFFF || max<min || *q!=0)
|
if(max<0 || max>Runemax || max<min || *q!=0)
|
||||||
goto err;
|
goto err;
|
||||||
i = 0;
|
i = 0;
|
||||||
do{
|
do{
|
||||||
|
@ -111,7 +111,7 @@ chars(char *argv[])
|
||||||
return "bad char";
|
return "bad char";
|
||||||
}
|
}
|
||||||
m = strtoul(q, &q, 16);
|
m = strtoul(q, &q, 16);
|
||||||
if(m<0 || m>0xFFFF || *q!=0)
|
if(m<0 || m>Runemax || *q!=0)
|
||||||
goto err;
|
goto err;
|
||||||
Bprint(&bout, "%C", m);
|
Bprint(&bout, "%C", m);
|
||||||
if(!text)
|
if(!text)
|
||||||
|
|
|
@ -605,12 +605,13 @@ __flagfmt(Fmt *f)
|
||||||
int
|
int
|
||||||
__badfmt(Fmt *f)
|
__badfmt(Fmt *f)
|
||||||
{
|
{
|
||||||
char x[3];
|
char x[2+UTFmax];
|
||||||
|
int n;
|
||||||
|
|
||||||
x[0] = '%';
|
x[0] = '%';
|
||||||
x[1] = f->r;
|
n = 1 + runetochar(x+1, &f->r);
|
||||||
x[2] = '%';
|
x[n++] = '%';
|
||||||
f->prec = 3;
|
f->prec = n;
|
||||||
__fmtcpy(f, (const void*)x, 3, 3);
|
__fmtcpy(f, (const void*)x, n, n);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,16 +23,19 @@ enum
|
||||||
Bit2 = 5,
|
Bit2 = 5,
|
||||||
Bit3 = 4,
|
Bit3 = 4,
|
||||||
Bit4 = 3,
|
Bit4 = 3,
|
||||||
|
Bit5 = 2,
|
||||||
|
|
||||||
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
|
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
|
||||||
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
|
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
|
||||||
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
|
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
|
||||||
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
|
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
|
||||||
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
|
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
|
||||||
|
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
|
||||||
|
|
||||||
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
|
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */
|
||||||
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
|
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */
|
||||||
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
|
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */
|
||||||
|
Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */
|
||||||
|
|
||||||
Maskx = (1<<Bitx)-1, /* 0011 1111 */
|
Maskx = (1<<Bitx)-1, /* 0011 1111 */
|
||||||
Testx = Maskx ^ 0xFF, /* 1100 0000 */
|
Testx = Maskx ^ 0xFF, /* 1100 0000 */
|
||||||
|
@ -43,7 +46,7 @@ enum
|
||||||
int
|
int
|
||||||
chartorune(Rune *rune, char *str)
|
chartorune(Rune *rune, char *str)
|
||||||
{
|
{
|
||||||
int c, c1, c2;
|
int c, c1, c2, c3;
|
||||||
long l;
|
long l;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -88,6 +91,25 @@ chartorune(Rune *rune, char *str)
|
||||||
return 3;
|
return 3;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* four character sequence
|
||||||
|
* 10000-10FFFF => T4 Tx Tx Tx
|
||||||
|
*/
|
||||||
|
if(UTFmax >= 4) {
|
||||||
|
c3 = *(uchar*)(str+3) ^ Tx;
|
||||||
|
if(c3 & Testx)
|
||||||
|
goto bad;
|
||||||
|
if(c < T5) {
|
||||||
|
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
|
||||||
|
if(l <= Rune3)
|
||||||
|
goto bad;
|
||||||
|
if(l > Runemax)
|
||||||
|
goto bad;
|
||||||
|
*rune = l;
|
||||||
|
return 4;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* bad decoding
|
* bad decoding
|
||||||
*/
|
*/
|
||||||
|
@ -113,7 +135,7 @@ runetochar(char *str, Rune *rune)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* two character sequence
|
* two character sequence
|
||||||
* 0080-07FF => T2 Tx
|
* 00080-007FF => T2 Tx
|
||||||
*/
|
*/
|
||||||
if(c <= Rune2) {
|
if(c <= Rune2) {
|
||||||
str[0] = T2 | (c >> 1*Bitx);
|
str[0] = T2 | (c >> 1*Bitx);
|
||||||
|
@ -123,12 +145,26 @@ runetochar(char *str, Rune *rune)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* three character sequence
|
* three character sequence
|
||||||
* 0800-FFFF => T3 Tx Tx
|
* 00800-0FFFF => T3 Tx Tx
|
||||||
*/
|
*/
|
||||||
str[0] = T3 | (c >> 2*Bitx);
|
if(c > Runemax)
|
||||||
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
|
c = Runeerror;
|
||||||
str[2] = Tx | (c & Maskx);
|
if(c <= Rune3) {
|
||||||
return 3;
|
str[0] = T3 | (c >> 2*Bitx);
|
||||||
|
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
|
||||||
|
str[2] = Tx | (c & Maskx);
|
||||||
|
return 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* four character sequence
|
||||||
|
* 010000-10FFFF => T4 Tx Tx Tx
|
||||||
|
*/
|
||||||
|
str[0] = T4 | (c >> 3*Bitx);
|
||||||
|
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
|
||||||
|
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
|
||||||
|
str[3] = Tx | (c & Maskx);
|
||||||
|
return 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
|
@ -155,7 +191,10 @@ runenlen(Rune *r, int nrune)
|
||||||
if(c <= Rune2)
|
if(c <= Rune2)
|
||||||
nb += 2;
|
nb += 2;
|
||||||
else
|
else
|
||||||
|
if(c <= Rune3 || c > Runemax)
|
||||||
nb += 3;
|
nb += 3;
|
||||||
|
else
|
||||||
|
nb += 4;
|
||||||
}
|
}
|
||||||
return nb;
|
return nb;
|
||||||
}
|
}
|
||||||
|
@ -165,13 +204,14 @@ fullrune(char *str, int n)
|
||||||
{
|
{
|
||||||
int c;
|
int c;
|
||||||
|
|
||||||
if(n > 0) {
|
if(n <= 0)
|
||||||
c = *(uchar*)str;
|
return 0;
|
||||||
if(c < Tx)
|
c = *(uchar*)str;
|
||||||
return 1;
|
if(c < Tx)
|
||||||
if(n > 1)
|
return 1;
|
||||||
if(c < T3 || n > 2)
|
if(c < T3)
|
||||||
return 1;
|
return n >= 2;
|
||||||
}
|
if(UTFmax == 3 || c < T4)
|
||||||
return 0;
|
return n >= 3;
|
||||||
|
return n >= 4;
|
||||||
}
|
}
|
||||||
|
|
|
@ -7,7 +7,7 @@ Bgetrune(Biobuf *bp)
|
||||||
{
|
{
|
||||||
int c, i;
|
int c, i;
|
||||||
Rune rune;
|
Rune rune;
|
||||||
char str[4];
|
char str[UTFmax];
|
||||||
|
|
||||||
c = Bgetc(bp);
|
c = Bgetc(bp);
|
||||||
if(c < Runeself) { /* one char */
|
if(c < Runeself) { /* one char */
|
||||||
|
|
|
@ -6,7 +6,7 @@ int
|
||||||
Bputrune(Biobuf *bp, long c)
|
Bputrune(Biobuf *bp, long c)
|
||||||
{
|
{
|
||||||
Rune rune;
|
Rune rune;
|
||||||
char str[4];
|
char str[UTFmax];
|
||||||
int n;
|
int n;
|
||||||
|
|
||||||
rune = c;
|
rune = c;
|
||||||
|
|
Loading…
Reference in a new issue