mirror of
https://github.com/9fans/plan9port.git
synced 2025-01-15 11:20:03 +00:00
535 lines
11 KiB
C
535 lines
11 KiB
C
#ifdef PLAN9
|
|
#include <u.h>
|
|
#include <libc.h>
|
|
#include <bio.h>
|
|
#else
|
|
#include <stdio.h>
|
|
#include <unistd.h>
|
|
#include "plan9.h"
|
|
#endif
|
|
#include "hdr.h"
|
|
#include "conv.h"
|
|
#include "kuten208.h"
|
|
#include "jis.h"
|
|
|
|
/*
|
|
a state machine for interpreting all sorts of encodings
|
|
*/
|
|
static void
|
|
alljis(int c, Rune **r, long input_loc)
|
|
{
|
|
static enum { state0, state1, state2, state3, state4 } state = state0;
|
|
static int set8 = 0;
|
|
static int japan646 = 0;
|
|
static int lastc;
|
|
int n;
|
|
long l;
|
|
|
|
again:
|
|
switch(state)
|
|
{
|
|
case state0: /* idle state */
|
|
if(c == ESC){ state = state1; return; }
|
|
if(c < 0) return;
|
|
if(!set8 && (c < 128)){
|
|
if(japan646){
|
|
switch(c)
|
|
{
|
|
case '\\': emit(0xA5); return; /* yen */
|
|
case '~': emit(0xAF); return; /* spacing macron */
|
|
default: emit(c); return;
|
|
}
|
|
} else {
|
|
emit(c);
|
|
return;
|
|
}
|
|
}
|
|
if(c < 0x21){ /* guard against bogus characters in JIS mode */
|
|
if(squawk)
|
|
EPR "%s: non-JIS character %02x in %s near byte %ld\n", argv0, c, file, input_loc);
|
|
emit(c);
|
|
return;
|
|
}
|
|
lastc = c; state = state4; return;
|
|
|
|
case state1: /* seen an escape */
|
|
if(c == '$'){ state = state2; return; }
|
|
if(c == '('){ state = state3; return; }
|
|
emit(ESC); state = state0; goto again;
|
|
|
|
case state2: /* may be shifting into JIS */
|
|
if((c == '@') || (c == 'B')){
|
|
set8 = 1; state = state0; return;
|
|
}
|
|
emit(ESC); emit('$'); state = state0; goto again;
|
|
|
|
case state3: /* may be shifting out of JIS */
|
|
if((c == 'J') || (c == 'H') || (c == 'B')){
|
|
japan646 = (c == 'J');
|
|
set8 = 0; state = state0; return;
|
|
}
|
|
emit(ESC); emit('('); state = state0; goto again;
|
|
|
|
case state4: /* two part char */
|
|
if(c < 0){
|
|
if(squawk)
|
|
EPR "%s: unexpected EOF in %s\n", argv0, file);
|
|
c = 0x21 | (lastc&0x80);
|
|
}
|
|
if(CANS2J(lastc, c)){ /* ms dos sjis */
|
|
int hi = lastc, lo = c;
|
|
S2J(hi, lo); /* convert to 208 */
|
|
n = hi*100 + lo - 3232; /* convert to kuten208 */
|
|
} else
|
|
n = (lastc&0x7F)*100 + (c&0x7f) - 3232; /* kuten208 */
|
|
if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
|
|
nerrors++;
|
|
if(squawk)
|
|
EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
|
|
if(!clean)
|
|
emit(BADMAP);
|
|
} else {
|
|
if(l < 0){
|
|
l = -l;
|
|
if(squawk)
|
|
EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
|
|
}
|
|
emit(l);
|
|
}
|
|
state = state0;
|
|
}
|
|
}
|
|
|
|
/*
|
|
a state machine for interpreting ms-kanji == shift-jis.
|
|
*/
|
|
static void
|
|
ms(int c, Rune **r, long input_loc)
|
|
{
|
|
static enum { state0, state1, state2, state3, state4 } state = state0;
|
|
static int set8 = 0;
|
|
static int japan646 = 0;
|
|
static int lastc;
|
|
int n;
|
|
long l;
|
|
|
|
again:
|
|
switch(state)
|
|
{
|
|
case state0: /* idle state */
|
|
if(c == ESC){ state = state1; return; }
|
|
if(c < 0) return;
|
|
if(!set8 && (c < 128)){
|
|
if(japan646){
|
|
switch(c)
|
|
{
|
|
case '\\': emit(0xA5); return; /* yen */
|
|
case '~': emit(0xAF); return; /* spacing macron */
|
|
default: emit(c); return;
|
|
}
|
|
} else {
|
|
emit(c);
|
|
return;
|
|
}
|
|
}
|
|
lastc = c; state = state4; return;
|
|
|
|
case state1: /* seen an escape */
|
|
if(c == '$'){ state = state2; return; }
|
|
if(c == '('){ state = state3; return; }
|
|
emit(ESC); state = state0; goto again;
|
|
|
|
case state2: /* may be shifting into JIS */
|
|
if((c == '@') || (c == 'B')){
|
|
set8 = 1; state = state0; return;
|
|
}
|
|
emit(ESC); emit('$'); state = state0; goto again;
|
|
|
|
case state3: /* may be shifting out of JIS */
|
|
if((c == 'J') || (c == 'H') || (c == 'B')){
|
|
japan646 = (c == 'J');
|
|
set8 = 0; state = state0; return;
|
|
}
|
|
emit(ESC); emit('('); state = state0; goto again;
|
|
|
|
case state4: /* two part char */
|
|
if(c < 0){
|
|
if(squawk)
|
|
EPR "%s: unexpected EOF in %s\n", argv0, file);
|
|
c = 0x21 | (lastc&0x80);
|
|
}
|
|
if(CANS2J(lastc, c)){ /* ms dos sjis */
|
|
int hi = lastc, lo = c;
|
|
S2J(hi, lo); /* convert to 208 */
|
|
n = hi*100 + lo - 3232; /* convert to kuten208 */
|
|
} else {
|
|
nerrors++;
|
|
if(squawk)
|
|
EPR "%s: illegal byte pair (0x%x,0x%x) near byte %ld in %s\n", argv0, lastc, c, input_loc, file);
|
|
if(!clean)
|
|
emit(BADMAP);
|
|
state = state0;
|
|
goto again;
|
|
}
|
|
if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
|
|
nerrors++;
|
|
if(squawk)
|
|
EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
|
|
if(!clean)
|
|
emit(BADMAP);
|
|
} else {
|
|
if(l < 0){
|
|
l = -l;
|
|
if(squawk)
|
|
EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
|
|
}
|
|
emit(l);
|
|
}
|
|
state = state0;
|
|
}
|
|
}
|
|
|
|
/*
|
|
a state machine for interpreting ujis == EUC
|
|
*/
|
|
static void
|
|
ujis(int c, Rune **r, long input_loc)
|
|
{
|
|
static enum { state0, state1 } state = state0;
|
|
static int lastc;
|
|
int n;
|
|
long l;
|
|
|
|
switch(state)
|
|
{
|
|
case state0: /* idle state */
|
|
if(c < 0) return;
|
|
if(c < 128){
|
|
emit(c);
|
|
return;
|
|
}
|
|
if(c == 0x8e){ /* codeset 2 */
|
|
nerrors++;
|
|
if(squawk)
|
|
EPR "%s: unknown codeset 2 near byte %ld in %s\n", argv0, input_loc, file);
|
|
if(!clean)
|
|
emit(BADMAP);
|
|
return;
|
|
}
|
|
if(c == 0x8f){ /* codeset 3 */
|
|
nerrors++;
|
|
if(squawk)
|
|
EPR "%s: unknown codeset 3 near byte %ld in %s\n", argv0, input_loc, file);
|
|
if(!clean)
|
|
emit(BADMAP);
|
|
return;
|
|
}
|
|
lastc = c;
|
|
state = state1;
|
|
return;
|
|
|
|
case state1: /* two part char */
|
|
if(c < 0){
|
|
if(squawk)
|
|
EPR "%s: unexpected EOF in %s\n", argv0, file);
|
|
c = 0xA1;
|
|
}
|
|
n = (lastc&0x7F)*100 + (c&0x7F) - 3232; /* kuten208 */
|
|
if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
|
|
nerrors++;
|
|
if(squawk)
|
|
EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
|
|
if(!clean)
|
|
emit(BADMAP);
|
|
} else {
|
|
if(l < 0){
|
|
l = -l;
|
|
if(squawk)
|
|
EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
|
|
}
|
|
emit(l);
|
|
}
|
|
state = state0;
|
|
}
|
|
}
|
|
|
|
/*
|
|
a state machine for interpreting jis-kanji == 2022-JP
|
|
*/
|
|
static void
|
|
jis(int c, Rune **r, long input_loc)
|
|
{
|
|
static enum { state0, state1, state2, state3, state4 } state = state0;
|
|
static int set8 = 0;
|
|
static int japan646 = 0;
|
|
static int lastc;
|
|
int n;
|
|
long l;
|
|
|
|
again:
|
|
switch(state)
|
|
{
|
|
case state0: /* idle state */
|
|
if(c == ESC){ state = state1; return; }
|
|
if(c < 0) return;
|
|
if(!set8 && (c < 128)){
|
|
if(japan646){
|
|
switch(c)
|
|
{
|
|
case '\\': emit(0xA5); return; /* yen */
|
|
case '~': emit(0xAF); return; /* spacing macron */
|
|
default: emit(c); return;
|
|
}
|
|
} else {
|
|
emit(c);
|
|
return;
|
|
}
|
|
}
|
|
lastc = c; state = state4; return;
|
|
|
|
case state1: /* seen an escape */
|
|
if(c == '$'){ state = state2; return; }
|
|
if(c == '('){ state = state3; return; }
|
|
emit(ESC); state = state0; goto again;
|
|
|
|
case state2: /* may be shifting into JIS */
|
|
if((c == '@') || (c == 'B')){
|
|
set8 = 1; state = state0; return;
|
|
}
|
|
emit(ESC); emit('$'); state = state0; goto again;
|
|
|
|
case state3: /* may be shifting out of JIS */
|
|
if((c == 'J') || (c == 'H') || (c == 'B')){
|
|
japan646 = (c == 'J');
|
|
set8 = 0; state = state0; return;
|
|
}
|
|
emit(ESC); emit('('); state = state0; goto again;
|
|
|
|
case state4: /* two part char */
|
|
if(c < 0){
|
|
if(squawk)
|
|
EPR "%s: unexpected EOF in %s\n", argv0, file);
|
|
c = 0x21 | (lastc&0x80);
|
|
}
|
|
if((lastc&0x80) != (c&0x80)){ /* guard against latin1 in jis */
|
|
emit(lastc);
|
|
state = state0;
|
|
goto again;
|
|
}
|
|
n = (lastc&0x7F)*100 + (c&0x7f) - 3232; /* kuten208 */
|
|
if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
|
|
nerrors++;
|
|
if(squawk)
|
|
EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
|
|
if(!clean)
|
|
emit(BADMAP);
|
|
} else {
|
|
if(l < 0){
|
|
l = -l;
|
|
if(squawk)
|
|
EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
|
|
}
|
|
emit(l);
|
|
}
|
|
state = state0;
|
|
}
|
|
}
|
|
|
|
static void
|
|
do_in(int fd, void (*procfn)(int, Rune **, long), struct convert *out)
|
|
{
|
|
Rune ob[N];
|
|
Rune *r, *re;
|
|
uchar ibuf[N];
|
|
int n, i;
|
|
long nin;
|
|
|
|
r = ob;
|
|
re = ob+N-3;
|
|
nin = 0;
|
|
while((n = read(fd, ibuf, sizeof ibuf)) > 0){
|
|
for(i = 0; i < n; i++){
|
|
(*procfn)(ibuf[i], &r, nin++);
|
|
if(r >= re){
|
|
OUT(out, ob, r-ob);
|
|
r = ob;
|
|
}
|
|
}
|
|
if(r > ob){
|
|
OUT(out, ob, r-ob);
|
|
r = ob;
|
|
}
|
|
}
|
|
(*procfn)(-1, &r, nin);
|
|
if(r > ob)
|
|
OUT(out, ob, r-ob);
|
|
}
|
|
|
|
void
|
|
jis_in(int fd, long *notused, struct convert *out)
|
|
{
|
|
USED(notused);
|
|
do_in(fd, alljis, out);
|
|
}
|
|
|
|
void
|
|
ujis_in(int fd, long *notused, struct convert *out)
|
|
{
|
|
USED(notused);
|
|
do_in(fd, ujis, out);
|
|
}
|
|
|
|
void
|
|
msjis_in(int fd, long *notused, struct convert *out)
|
|
{
|
|
USED(notused);
|
|
do_in(fd, ms, out);
|
|
}
|
|
|
|
void
|
|
jisjis_in(int fd, long *notused, struct convert *out)
|
|
{
|
|
USED(notused);
|
|
do_in(fd, jis, out);
|
|
}
|
|
|
|
static int first = 1;
|
|
|
|
static void
|
|
tab_init(void)
|
|
{
|
|
int i;
|
|
long l;
|
|
|
|
first = 0;
|
|
for(i = 0; i < NRUNE; i++)
|
|
tab[i] = -1;
|
|
for(i = 0; i < KUTEN208MAX; i++)
|
|
if((l = tabkuten208[i]) != -1){
|
|
if(l < 0)
|
|
tab[-l] = i;
|
|
else
|
|
tab[l] = i;
|
|
}
|
|
}
|
|
|
|
|
|
/* jis-kanji, or ISO 2022-JP */
|
|
void
|
|
jisjis_out(Rune *base, int n, long *notused)
|
|
{
|
|
char *p;
|
|
int i;
|
|
Rune r;
|
|
static enum { ascii, japan646, jp2022 } state = ascii;
|
|
|
|
USED(notused);
|
|
if(first)
|
|
tab_init();
|
|
nrunes += n;
|
|
p = obuf;
|
|
for(i = 0; i < n; i++){
|
|
r = base[i];
|
|
if(r < 128){
|
|
if(state == jp2022){
|
|
*p++ = ESC; *p++ = '('; *p++ = 'B';
|
|
state = ascii;
|
|
}
|
|
*p++ = r;
|
|
} else {
|
|
if(tab[r] != -1){
|
|
if(state != jp2022){
|
|
*p++ = ESC; *p++ = '$'; *p++ = 'B';
|
|
state = jp2022;
|
|
}
|
|
*p++ = tab[r]/100 + ' ';
|
|
*p++ = tab[r]%100 + ' ';
|
|
continue;
|
|
}
|
|
if(squawk)
|
|
EPR "%s: rune 0x%x not in output cs\n", argv0, r);
|
|
nerrors++;
|
|
if(clean)
|
|
continue;
|
|
*p++ = BYTEBADMAP;
|
|
}
|
|
}
|
|
noutput += p-obuf;
|
|
if(p > obuf)
|
|
write(1, obuf, p-obuf);
|
|
}
|
|
|
|
/* ms-kanji, or Shift-JIS */
|
|
void
|
|
msjis_out(Rune *base, int n, long *notused)
|
|
{
|
|
char *p;
|
|
int i, hi, lo;
|
|
Rune r;
|
|
|
|
USED(notused);
|
|
if(first)
|
|
tab_init();
|
|
nrunes += n;
|
|
p = obuf;
|
|
for(i = 0; i < n; i++){
|
|
r = base[i];
|
|
if(r < 128)
|
|
*p++ = r;
|
|
else {
|
|
if(tab[r] != -1){
|
|
hi = tab[r]/100 + ' ';
|
|
lo = tab[r]%100 + ' ';
|
|
J2S(hi, lo);
|
|
*p++ = hi;
|
|
*p++ = lo;
|
|
continue;
|
|
}
|
|
if(squawk)
|
|
EPR "%s: rune 0x%x not in output cs\n", argv0, r);
|
|
nerrors++;
|
|
if(clean)
|
|
continue;
|
|
*p++ = BYTEBADMAP;
|
|
}
|
|
}
|
|
noutput += p-obuf;
|
|
if(p > obuf)
|
|
write(1, obuf, p-obuf);
|
|
}
|
|
|
|
/* ujis, or EUC */
|
|
void
|
|
ujis_out(Rune *base, int n, long *notused)
|
|
{
|
|
char *p;
|
|
int i;
|
|
Rune r;
|
|
|
|
USED(notused);
|
|
if(first)
|
|
tab_init();
|
|
nrunes += n;
|
|
p = obuf;
|
|
for(i = 0; i < n; i++){
|
|
r = base[i];
|
|
if(r < 128)
|
|
*p++ = r;
|
|
else {
|
|
if(tab[r] != -1){
|
|
*p++ = 0x80 | (tab[r]/100 + ' ');
|
|
*p++ = 0x80 | (tab[r]%100 + ' ');
|
|
continue;
|
|
}
|
|
if(squawk)
|
|
EPR "%s: rune 0x%x not in output cs\n", argv0, r);
|
|
nerrors++;
|
|
if(clean)
|
|
continue;
|
|
*p++ = BYTEBADMAP;
|
|
}
|
|
}
|
|
noutput += p-obuf;
|
|
if(p > obuf)
|
|
write(1, obuf, p-obuf);
|
|
}
|