This commit is contained in:
rsc 2003-11-25 03:37:45 +00:00
parent 091f74d0a0
commit 0870887793
27 changed files with 8343 additions and 0 deletions

139
src/cmd/dict/ahd.c Normal file
View file

@ -0,0 +1,139 @@
#include <u.h>
#include <libc.h>
#include <bio.h>
#include "dict.h"
/*
* American Heritage Dictionary (encrypted)
*/
/*
 * Maps a decrypted dictionary byte to the Rune to print.
 * Only bytes whose Rune differs from the byte value are listed here;
 * ahdprintentry fills every remaining (zero) slot with the identity
 * mapping the first time it runs.
 * NOTE(review): the listed source bytes look like IBM code page 437
 * accented letters -- confirm against the dictionary data.
 */
static Rune intab[256] = {
[0x82] 0xe9,
[0x85] 0xe0,
[0x89] 0xeb,
[0x8a] 0xe8,
[0xa4] 0xf1,
[0xf8] 0xb0,
[0xf9] 0xb7,
};
/* name of the %@...@% tag being collected by ahdprintentry (NUL-terminated when complete) */
static char tag[64];
/*
 * Parser states of ahdprintentry:
 * Run     - ordinary text
 * Openper - just saw '%', which may start a "%@" tag opener
 * Openat  - inside a tag name, after "%@"
 * Closeat - saw '@' inside a tag, which may start the "@%" closer
 */
enum{
Run, Openper, Openat, Closeat
};
/*
 * Decrypt and print entry e.
 * Each byte is xored with the low 8 bits of (its dictionary offset >> 1)
 * and then translated through intab.  Text of the form %@name@% is a tag:
 * cmd 'r' re-emits tags verbatim, cmd 'h' stops printing at the tag "EH",
 * and any other cmd silently drops tags.
 * breaklen is forced to 80 while printing and restored on return.
 */
void
ahdprintentry(Entry e, int cmd)
{
static int inited;
long addr;
char *p, *t = tag;
int obreaklen;
int c, state = Run;
/* first call only: unlisted intab slots map to themselves */
if(!inited){
for(c=0; c<256; c++)
if(intab[c] == 0)
intab[c] = c;
inited = 1;
}
obreaklen = breaklen;
breaklen = 80;
addr = e.doff;
for(p=e.start; p<e.end; p++){
/* the key byte depends on the absolute offset, so addr advances with p */
c = intab[(*p ^ (addr++>>1))&0xff];
switch(state){
case Run:
if(c == '%'){
t = tag;
state = Openper;
break;
}
/* also entered by goto from Openper when '%' turned out to be plain text */
Putchar:
if(c == '\n')
outnl(0);
else if(c < Runeself)
outchar(c);
else
outrune(c);
break;
case Openper:
if(c == '@')
state = Openat;
else{
/* not a tag: emit the held-back '%' and then the current character */
outchar('%');
state = Run;
goto Putchar;
}
break;
case Openat:
if(c == '@')
state = Closeat;
else if(t < &tag[sizeof tag-1])
*t++ = c;
/* characters beyond the tag buffer are dropped */
break;
case Closeat:
if(c == '%'){
/* "@%" seen: the tag is complete */
*t = 0;
switch(cmd){
case 'h':
if(strcmp("EH", tag) == 0)
goto out;
break;
case 'r':
outprint("%%@%s@%%", tag);
break;
}
state = Run;
}else{
/* the '@' was part of the tag name: store it and the current character */
if(t < &tag[sizeof tag-1])
*t++ = '@';
if(t < &tag[sizeof tag-1])
*t++ = c;
state = Openat;
}
break;
}
}
out:
outnl(0);
breaklen = obreaklen;
}
/*
 * Return the dictionary offset of the next entry at or after fromoff,
 * or -1 if the seek fails or no entry marker is found.
 *
 * The decrypted stream is scanned first for "%@NL@%" and then for
 * "%@2@%".  The offset of the "%@2@%" marker is returned; if only the
 * "%@NL@%" marker is seen before end of file, its offset is returned.
 */
long
ahdnextoff(long fromoff)
{
	static char *patterns[] = { "%@NL@%", "%@2@%", 0 };
	int c, k = 0, state = 0;
	char *pat = patterns[0];
	long defoff = -1;
	long start;

	if(Bseek(bdict, fromoff, 0) < 0)
		return -1;
	while((c = Bgetc(bdict)) >= 0){
		/* same per-offset key as ahdprintentry; markers are plain ASCII */
		c ^= (fromoff++>>1)&0xff;
		if(c != pat[state]){
			/*
			 * The mismatching byte may itself begin a marker
			 * (e.g. "%%@NL@%"), so restart at state 1 rather than 0
			 * when it equals the first pattern character.  Both
			 * patterns contain '%' only at their ends, so no longer
			 * fallback is ever needed.
			 */
			state = (c == pat[0]);
			continue;
		}
		if(pat[++state])
			continue;
		/* whole pattern matched; state is now its length */
		start = fromoff - state;
		if((pat = patterns[++k]) != 0){
			state = 0;
			defoff = start;
			continue;
		}
		return start;
	}
	return defoff;
}
/* Pronunciation-key hook for this dictionary: there is none, so just say so. */
void
ahdprintkey(void)
{
Bprint(bout, "No pronunciations.\n");
}

29
src/cmd/dict/canonind.awk Normal file
View file

@ -0,0 +1,29 @@
# turn output of mkindex into form needed by dict
# Reads the raw index named on the command line (field 1 = offset,
# remaining fields = headwords).  A headword containing "(...)" is
# emitted twice: once without the parenthesized part and once with it
# (parentheses removed).  The key/offset pairs are collected in the
# scratch file "junk", sorted to standard output, and the scratch file
# is removed.
BEGIN {
	if(ARGC != 2) {
		print "Usage: awk -F' ' -f canonind.awk rawindex > index"
		exit 1
	}
	file = ARGV[1]
	ARGV[1] = ""
	while ((getline < file) > 0) {
		for(i = 2; i <= NF; i++) {
			w = $i
			if(length(w) == 0)
				continue
			b = index(w, "(")
			e = index(w, ")")
			if(b && e && b < e) {
				w1 = substr(w, 1, b-1)
				w2 = substr(w, b+1, e-b-1)
				w3 = substr(w, e+1)
				printf "%s%s\t%d\n", w1, w3, $1 > "junk"
				printf "%s%s%s\t%d\n", w1, w2, w3, $1 > "junk"
			} else
				printf "%s\t%d\n", w, $1 > "junk"
		}
	}
	close(file)
	# flush and close the scratch file, otherwise sort may read it
	# before awk's buffered output has reached the disk
	close("junk")
	system("sort -u -t' ' +0f -1 +0 -1 +1n -2 < junk")
	system("rm junk")
	exit 0
}

56
src/cmd/dict/comfix.awk Normal file
View file

@ -0,0 +1,56 @@
# when raw index has a lot of entries like
# 1578324 problematico, a, ci, che
# apply this algorithm:
# treat things after comma as suffixes
# for each suffix:
# if single letter, replace last letter
# else search backwards for beginning of suffix
# and if it leads to an old suffix of approximately
# the same length, replace that old suffix
# This will still leave some commas to fix by hand
# Usage: awk -F' ' -f comfix.awk rawindex > newrawindex
# Two-field records: field 2 may be "word, suf1, suf2, ...".
# Emit one "offset<TAB>form" line per form.
NF == 2 {
i = index($2, ",")
if(i == 0 || length($2) == 0)
print $0
else {
n = split($2, a, /,[ ]*/)
# a[1] is the base word; a[2..n] are the suffixes to graft onto it
w = a[1]
printf "%s\t%s\n", $1, w
for(i = 2; i <= n; i++) {
suf = a[i]
# m = number of trailing characters of w that suf replaces (0 = no fit)
m = matchsuflen(w, suf)
if(m) {
nw = substr(w, 1, length(w)-m) suf
printf "%s\t%s\n", $1, nw
} else
# could not place the suffix: keep "word, suffix" for fixing by hand
printf "%s\t%s\n", $1, w ", " suf
}
}
}
# Records with any other field count pass through unchanged.
NF != 2 {
print $0
}
# Return how many trailing characters of w the suffix suf should replace,
# or 0 if no plausible place is found.
# A one-character suffix always replaces the last character.  Otherwise
# the last occurrence in w of suf's first character marks the old suffix,
# which is accepted only if its length is within 3 of suf's length.
function matchsuflen(w, suf, wlen,suflen,c,pat,k,d)
{
	suflen = length(suf)
	if(suflen == 1)
		return 1
	wlen = length(w)
	c = substr(suf, 1, 1)
	# walk backwards from the end of w looking for c
	k = 1
	while(k <= wlen && substr(w, wlen-k+1, 1) != c)
		k++
	if(k > wlen)
		return 0
	d = (k > suflen) ? k-suflen : suflen-k
	return (d > 3) ? 0 : k
}

681
src/cmd/dict/dict.c Normal file
View file

@ -0,0 +1,681 @@
#include <u.h>
#include <libc.h>
#include <bio.h>
#include <regexp.h>
#include <ctype.h>
#include "dict.h"
/*
* Assumed index file structure: lines of form
* [^\t]+\t[0-9]+
* First field is key, second is byte offset into dictionary.
* Should be sorted with args -u -t' ' +0f -1 +0 -1 +1n -2
*/
/*
 * A set of dictionary offsets (the result of an address or search).
 * doff is declared with one element but the structure is allocated with
 * room for maxn of them (see the malloc in main and the realloc in
 * search), so sizeof(Addr) already accounts for the first slot.
 */
typedef struct Addr Addr;
struct Addr {
int n; /* number of offsets */
int cur; /* current position within doff array */
int maxn; /* actual current size of doff array */
ulong doff[1]; /* doff[maxn], with 0..n-1 significant */
};
Biobuf binbuf;
Biobuf boutbuf;
Biobuf *bin = &binbuf; /* user cmd input */
Biobuf *bout = &boutbuf; /* output */
Biobuf *bdict; /* dictionary */
Biobuf *bindex; /* index file */
long indextop; /* index offset at end of file */
int lastcmd; /* last executed command */
Addr *dot; /* "current" address */
Dict *dict; /* current dictionary */
int linelen;
int breaklen = 60;
int outinhibit;
int debug;
void execcmd(int);
int getpref(char*, Rune*);
Entry getentry(int);
int getfield(Rune*);
long locate(Rune*);
int parseaddr(char*, char**);
int parsecmd(char*);
int search(char*, int);
long seeknextline(Biobuf*, long);
void setdotnext(void);
void setdotprev(void);
void sortaddr(Addr*);
void usage(void);
enum {
Plen=300, /* max length of a search pattern */
Fieldlen=200, /* max length of an index field */
Aslots=10, /* initial number of slots in an address */
};
/*
 * dict [-d dict] [-k] [-c cmd] [word]
 *
 * Changes directory to $PLAN9 (default /usr/local/plan9), picks the first
 * dictionary whose data and index files exist unless -d names one, then
 * either runs a single command (-c cmd, or "/word/P" for a word argument)
 * or reads commands from standard input until end of file.
 */
void
main(int argc, char **argv)
{
	int i, cmd, kflag;
	char *line, *p, *root;

	Binit(&binbuf, 0, OREAD);
	Binit(&boutbuf, 1, OWRITE);
	kflag = 0;
	line = 0;
	dict = 0;
	/* root is kept separately because p is reused for option arguments below */
	root = getenv("PLAN9");
	if(root == nil)
		root = "/usr/local/plan9";
	if(chdir(root) < 0)
		sysfatal("chdir %s: %r", root);

	/* default dictionary: the first one present on this system */
	for(i=0; dicts[i].name; i++){
		if(access(dicts[i].path, 0)>=0 && access(dicts[i].indexpath, 0)>=0){
			dict = &dicts[i];
			break;
		}
	}
	ARGBEGIN {
	case 'd':
		p = ARGF();
		dict = 0;
		if(p) {
			for(i=0; dicts[i].name; i++)
				if(strcmp(p, dicts[i].name)==0) {
					dict = &dicts[i];
					break;
				}
		}
		if(!dict)
			usage();
		break;
	case 'c':
		line = ARGF();
		if(!line)
			usage();
		break;
	case 'k':
		kflag++;
		break;
	case 'D':
		debug++;
		break;
	default:
		usage();
	ARGEND }
	if(dict == 0){
		err("no dictionaries present on this system");
		exits("nodict");
	}
	if(kflag) {
		(*dict->printkey)();
		exits(0);
	}
	if(argc > 1)
		usage();
	else if(argc == 1) {
		if(line)
			usage();
		p = argv[0];
		/* "/" + word + "/P\n" + NUL = strlen(word)+5 bytes */
		line = malloc(strlen(p)+5);
		if(line == nil) {
			err("out of memory");
			exits("nomem");
		}
		sprint(line, "/%s/P\n", p);
	}
	bdict = Bopen(dict->path, OREAD);
	if(!bdict) {
		err("can't open dictionary %s/%s", root, dict->path);
		exits("nodict");
	}
	bindex = Bopen(dict->indexpath, OREAD);
	if(!bindex) {
		err("can't open index %s/%s", root, dict->indexpath);
		exits("noindex");
	}
	indextop = Bseek(bindex, 0L, 2);

	/* Addr already holds one offset slot, hence Aslots-1 extra */
	dot = malloc(sizeof(Addr)+(Aslots-1)*sizeof(ulong));
	if(dot == nil) {
		err("out of memory");
		exits("nomem");
	}
	dot->n = 0;
	dot->cur = 0;
	dot->maxn = Aslots;
	lastcmd = 0;
	if(line) {
		cmd = parsecmd(line);
		if(cmd)
			execcmd(cmd);
	} else {
		for(;;) {
			Bprint(bout, "*");
			Bflush(bout);
			line = Brdline(bin, '\n');
			linelen = 0;
			if(!line)
				break;
			cmd = parsecmd(line);
			if(cmd) {
				execcmd(cmd);
				lastcmd = cmd;
			}
		}
	}
	exits(0);
}
/*
 * Print the command synopsis and the table of known dictionaries on bout,
 * bracketing each dictionary whose data or index file is not accessible,
 * then exit with status "usage".
 */
void
usage(void)
{
	int i, missing;
	char *lbr, *rbr;

	Bprint(bout, "Usage: %s [-d dict] [-k] [-c cmd] [word]\n", argv0);
	Bprint(bout, "dictionaries (brackets mark dictionaries not present on this system):\n");
	i = 0;
	while(dicts[i].name){
		missing = access(dicts[i].path, 0)<0 || access(dicts[i].indexpath, 0)<0;
		lbr = missing ? "[" : "";
		rbr = missing ? "]" : "";
		Bprint(bout, " %s%s\t%s%s\n", lbr, dicts[i].name, dicts[i].desc, rbr);
		i++;
	}
	exits("usage");
}
/*
 * Parse one command line: an optional address (see parseaddr) followed
 * by a command letter a, h, p or r (upper case = apply to every entry
 * in dot) or a bare newline.
 * A bare newline repeats the last command, except that it means 'H' the
 * first time and 'p' right after an 'H'.
 * Returns the command character to execute, or 0 on error.
 */
int
parsecmd(char *line)
{
	char *e;
	int cmd, ans;

	if(parseaddr(line, &e) >= 0)
		line = e;
	else
		return 0;
	/* the ctype functions require a value representable as unsigned char */
	cmd = (unsigned char)*line;
	ans = cmd;
	if(isupper(cmd))
		cmd = tolower(cmd);
	if(!(cmd == 'a' || cmd == 'h' || cmd == 'p' || cmd == 'r' ||
	     cmd == '\n')) {
		err("unknown command %c", cmd);
		return 0;
	}
	if(cmd == '\n')
		switch(lastcmd) {
		case 0:
			ans = 'H';
			break;
		case 'H':
			ans = 'p';
			break;
		default:
			ans = lastcmd;
			break;
		}
	else if(line[1] != '\n' && line[1] != 0)
		err("extra stuff after command %c ignored", cmd);
	return ans;
}
/*
 * Run command cmd on dot.
 * A lower-case command acts on the current entry only and then advances
 * dot->cur (wrapping to 0 past the end).  An upper-case command acts on
 * every entry of dot, numbering each one.
 *   a     print the entry's byte offset
 *   h p r hand the entry to the dictionary's printentry routine
 */
void
execcmd(int cmd)
{
	Entry ent;
	int idx, all;

	all = 0;
	idx = dot->cur;
	if(isupper(cmd)) {
		all = 1;
		cmd = tolower(cmd);
		idx = 0;
	}
	if(debug && all && cmd == 'a')
		Bprint(bout, "%d entries, cur=%d\n", dot->n, idx+1);

	while(idx < dot->n) {
		if(all) {
			/* entry number and tab; linelen keeps the rough column */
			Bprint(bout, "%d\t", idx+1);
			linelen += 4 + (idx >= 10);
		}
		if(cmd == 'a')
			Bprint(bout, "#%lud\n", dot->doff[idx]);
		else if(cmd == 'h' || cmd == 'p' || cmd == 'r') {
			ent = getentry(idx);
			(*dict->printentry)(ent, cmd);
		}
		idx++;
		if(!all)
			break;
		if(cmd == 'p' || cmd == 'r') {
			Bputc(bout, '\n');
			linelen = 0;
		}
	}
	dot->cur = (idx >= dot->n) ? 0 : idx;
}
/*
* Address syntax: ('.' | '/' re '/' | '!' re '!' | number | '#' number) ('+' | '-')*
* Answer goes in dot.
* Return -1 if address starts, but get error.
* Return 0 if no address.
*/
int
parseaddr(char *line, char **eptr)
{
int delim, plen;
ulong v;
char *e;
char pat[Plen];
if(*line == '/' || *line == '!') {
/* anchored regular expression match; '!' means no folding */
if(*line == '/') {
delim = '/';
e = strpbrk(line+1, "/\n");
} else {
delim = '!';
e = strpbrk(line+1, "!\n");
}
plen = e-line-1;
if(plen >= Plen-3) {
err("pattern too big");
return -1;
}
pat[0] = '^';
memcpy(pat+1, line+1, plen);
pat[plen+1] = '$';
pat[plen+2] = 0;
if(*e == '\n')
line = e;
else
line = e+1;
if(!search(pat, delim == '/')) {
err("pattern not found");
return -1;
}
} else if(*line == '#') {
/* absolute byte offset into dictionary */
line++;
if(!isdigit(*line))
return -1;
v = strtoul(line, &e, 10);
line = e;
dot->doff[0] = v;
dot->n = 1;
dot->cur = 0;
} else if(isdigit(*line)) {
v = strtoul(line, &e, 10);
line = e;
if(v < 1 || v > dot->n)
err(".%d not in range [1,%d], ignored",
v, dot->n);
else
dot->cur = v-1;
} else if(*line == '.') {
line++;
} else {
*eptr = line;
return 0;
}
while(*line == '+' || *line == '-') {
if(*line == '+')
setdotnext();
else
setdotprev();
line++;
}
*eptr = line;
return 1;
}
/*
 * Index file is sorted by folded field1.
 * Method: find pre, a folded prefix of r.e. pat,
 * and then low = offset to beginning of
 * line in index file where first match of prefix occurs.
 * Then go through index until prefix no longer matches,
 * adding each line that matches real pattern to dot.
 * Finally, sort dot offsets (uniquing).
 * We know pat len < Plen, and that it is surrounded by ^..$
 *
 * Returns the number of entries found (0 if none; dot is left
 * untouched when the prefix cannot be located or the pattern does
 * not compile).
 */
int
search(char *pat, int dofold)
{
	int needre, prelen, match, n;
	Reprog *re;
	long ioff, v;
	Rune pre[Plen];
	Rune lit[Plen];
	Rune entry[Fieldlen];
	char fpat[Plen];

	re = 0;
	prelen = getpref(pat+1, pre);
	if(pat[prelen+1] == 0 || pat[prelen+1] == '$') {
		/* no metacharacters: compare keys against the literal */
		runescpy(lit, pre);
		if(dofold)
			fold(lit);
		needre = 0;
	} else {
		needre = 1;
		if(dofold) {
			foldre(fpat, pat);
			re = regcomp(fpat);
		} else
			re = regcomp(pat);
		/* NOTE(review): nil is only possible if regerror returns -- confirm */
		if(re == 0)
			return 0;
	}
	fold(pre);
	ioff = locate(pre);
	if(ioff < 0) {
		free(re);
		return 0;
	}
	dot->n = 0;
	Bseek(bindex, ioff, 0);
	for(;;) {
		if(!getfield(entry))
			break;
		if(dofold)
			fold(entry);
		if(needre)
			match = rregexec(re, entry, 0, 0);
		else
			match = (acomp(lit, entry) == 0);
		if(match) {
			/* second field of the line is the dictionary offset */
			if(!getfield(entry))
				break;
			v = runetol(entry);
			if(dot->n >= dot->maxn) {
				n = 2*dot->maxn;
				dot = realloc(dot,
					sizeof(Addr)+(n-1)*sizeof dot->doff[0]);
				if(!dot) {
					err("out of memory");
					exits("nomem");
				}
				dot->maxn = n;
			}
			dot->doff[dot->n++] = v;
		} else {
			if(!dofold)
				fold(entry);
			/* stop once the keys have sorted past the prefix */
			if(*pre) {
				n = acomp(pre, entry);
				if(n < -1 || (!needre && n < 0))
					break;
			}
			/* get to next index entry */
			if(!getfield(entry))
				break;
		}
	}
	/* the compiled program is malloc'd by regcomp; release it */
	free(re);
	sortaddr(dot);
	dot->cur = 0;
	return dot->n;
}
/*
* Return offset in index file of first line whose folded
* first field has pre as a prefix. -1 if none found.
*/
/*
 * Binary search of the index file (bindex, indextop bytes long) for pre,
 * which must already be folded.  An empty prefix matches from offset 0.
 * The meaning of acomp's result is taken from its use here: <= 0 means
 * pre sorts at or before the key, -2 is treated as "no match possible".
 */
long
locate(Rune *pre)
{
long top, bot, mid;
Rune entry[Fieldlen];
if(*pre == 0)
return 0;
bot = 0;
top = indextop;
if(debug>1)
fprint(2, "locate looking for prefix %S\n", pre);
for(;;) {
/*
* Loop invariant: foldkey(bot) < pre <= foldkey(top)
* and bot < top, and bot,top point at beginning of lines
*/
mid = (top+bot) / 2;
/* the midpoint usually falls inside a line; move to the next line start */
mid = seeknextline(bindex, mid);
if(debug > 1)
fprint(2, "bot=%ld, mid=%ld->%ld, top=%ld\n",
bot, (top+bot) / 2, mid, top);
/* no line start strictly between bot and top, or read failure: stop halving */
if(mid == top || !getfield(entry))
break;
if(debug > 1)
fprint(2, "key=%S\n", entry);
/*
* here mid is strictly between bot and top
*/
fold(entry);
if(acomp(pre, entry) <= 0)
top = mid;
else
bot = mid;
}
/*
* bot < top, but they don't necessarily point at successive lines
* Use linear search from bot to find first line that pre is a
* prefix of
*/
/* note: seeknextline first skips the line that bot itself points into */
while((bot = seeknextline(bindex, bot)) <= top) {
if(!getfield(entry))
return -1;
if(debug > 1)
fprint(2, "key=%S\n", entry);
fold(entry);
switch(acomp(pre, entry)) {
case -2:
return -1;
case -1:
case 0:
return bot;
case 1:
case 2:
continue;
}
}
return -1;
}
/*
 * Get prefix of non re-metacharacters, runified, into pre,
 * and return the number of bytes of pat it covers.
 * A backslash escapes the following character: the pair counts toward
 * the returned length but only the escaped character is stored in pre,
 * so pre holds the literal text the pattern must start with.
 * pre is always NUL-terminated on return.
 */
int
getpref(char *pat, Rune *pre)
{
	int n, r;
	char *p;

	p = pat;
	while(*p) {
		n = chartorune(pre, p);
		r = *pre;
		switch(r) {
		case '.': case '*': case '+': case '?':
		case '[': case ']': case '(': case ')':
		case '|': case '^': case '$':
			*pre = 0;
			return p-pat;
		case L'\\':
			p += n;
			/* a trailing backslash escapes nothing; don't step past the NUL */
			if(*p == 0)
				break;
			/* overwrite the stored backslash with the escaped character */
			p += chartorune(pre, p);
			pre++;
			break;
		default:
			p += n;
			pre++;
		}
	}
	/* ran off the end of pat without meeting a metacharacter */
	*pre = 0;
	return p-pat;
}
/*
 * Seek b to off, consume runes up to and including the next newline
 * (or until a read fails), and return the resulting offset.
 */
long
seeknextline(Biobuf *b, long off)
{
	long r;

	Bseek(b, off, 0);
	for(;;) {
		r = Bgetrune(b);
		if(r < 0 || r == '\n')
			break;
	}
	return Boffset(b);
}
/*
 * Read the next tab- or newline-terminated field of the index file
 * into rp, which must have room for Fieldlen runes, and terminate it.
 * Returns 1 on success; 0 if a read fails before the terminator or if
 * the field does not fit (the latter also reports "word too long").
 */
int
getfield(Rune *rp)
{
	Rune *end;
	long r;

	for(end = rp + Fieldlen; rp < end; rp++) {
		r = Bgetrune(bindex);
		if(r < 0)
			return 0;
		if(r == '\t' || r == '\n') {
			*rp = L'\0';
			return 1;
		}
		*rp = r;
	}
	err("word too long");
	return 0;
}
/*
 * A compare longs function suitable for qsort.
 * Compares directly rather than subtracting, so that operands of
 * opposite sign and large magnitude cannot overflow.
 */
static int
longcmp(const void *av, const void *bv)
{
	long a, b;

	a = *(const long*)av;
	b = *(const long*)bv;
	if(a < b)
		return -1;
	if(a > b)
		return 1;
	return 0;
}
/*
 * Sort the offsets of a into ascending order and squeeze out
 * duplicates, updating a->n to the number of distinct offsets.
 */
void
sortaddr(Addr *a)
{
	int in, out;
	long v;

	if(a->n <= 1)
		return;
	qsort(a->doff, a->n, sizeof(long), longcmp);

	/* equal values are adjacent after sorting: keep the first of each run */
	out = 0;
	for(in = 0; in < a->n; in++) {
		v = a->doff[in];
		if(out == 0 || v != a->doff[out-1])
			a->doff[out++] = v;
	}
	a->n = out;
}
/*
 * Read entry i of dot from the dictionary.
 * The text is returned in a buffer that is reused by the next call;
 * it is NUL-terminated at ans.end.  If the end of the entry cannot be
 * found, an error is reported and start and end are both 0.
 */
Entry
getentry(int i)
{
	long b, e, n;
	static Entry ans;
	static char *buf;	/* owned here so a failed lookup cannot lose it */
	static long buflen = 0;

	b = dot->doff[i];
	e = (*dict->nextoff)(b+1);
	ans.doff = b;
	if(e < 0) {
		err("couldn't seek to entry");
		ans.start = 0;
		ans.end = 0;
		return ans;
	}
	n = e-b;
	if(n < 0)
		n = 0;
	if(n+1 > buflen) {
		buf = realloc(buf, n+1);
		if(!buf) {
			err("out of memory");
			exits("nomem");
		}
		buflen = n+1;
	}
	Bseek(bdict, b, 0);
	n = Bread(bdict, buf, n);
	/* a failed read yields an empty entry rather than a negative length */
	if(n < 0)
		n = 0;
	ans.start = buf;
	ans.end = buf + n;
	*ans.end = 0;
	return ans;
}
/*
 * Make dot the single entry that follows the current one in the
 * dictionary.  Does nothing if dot has no current entry (as
 * setdotprev does); reports an error if no following entry is found.
 */
void
setdotnext(void)
{
	long b;

	/* doff[cur] is meaningless when dot is empty */
	if(dot->cur < 0 || dot->cur >= dot->n)
		return;
	b = (*dict->nextoff)(dot->doff[dot->cur]+1);
	if(b < 0) {
		err("couldn't find a next entry");
		return;
	}
	dot->doff[0] = b;
	dot->n = 1;
	dot->cur = 0;
}
/*
 * Make dot the single entry that precedes the current one.
 * The dictionary can only be scanned forwards (via nextoff), so back up
 * tryback bytes, walk forward remembering the last entry start seen
 * before the current offset, and double tryback if none was found.
 */
void
setdotprev(void)
{
int tryback;
long here, last, p;
if(dot->cur < 0 || dot->cur >= dot->n)
return;
tryback = 2000;
here = dot->doff[dot->cur];
last = 0;
/* last == 0 doubles as "nothing found yet" */
while(last == 0) {
p = here - tryback;
if(p < 0)
p = 0;
for(;;) {
p = (*dict->nextoff)(p+1);
if(p < 0)
return; /* shouldn't happen */
if(p >= here)
break;
last = p;
}
if(!last) {
/* already scanned from the start of the dictionary: give up */
if(here - tryback < 0) {
err("can't find a previous entry");
return;
}
tryback = 2*tryback;
}
}
dot->doff[0] = last;
dot->n = 1;
dot->cur = 0;
}

160
src/cmd/dict/dict.h Normal file
View file

@ -0,0 +1,160 @@
/* Runes for special purposes (0xe800-0xfdff is Private Use Area) */
enum { NONE=0xe800, /* Emit nothing */
TAGS, /* Start of tag */
TAGE, /* End of tag */
SPCS, /* Start of special character name */
PAR, /* Newline, indent */
LIGS, /* Start of ligature codes */
LACU=LIGS, /* Acute (´) ligatures */
LGRV, /* Grave (ˋ) ligatures */
LUML, /* Umlaut (¨) ligatures */
LCED, /* Cedilla (¸) ligatures */
LTIL, /* Tilde (˜) ligatures */
LBRV, /* Breve (˘) ligatures */
LRNG, /* Ring (˚) ligatures */
LDOT, /* Dot (˙) ligatures */
LDTB, /* Dot below (.) ligatures */
LFRN, /* Frown (⌢) ligatures */
LFRB, /* Frown below (̯) ligatures */
LOGO, /* Ogonek (˛) ligatures */
LMAC, /* Macron (¯) ligatures */
LHCK, /* Hacek (ˇ) ligatures */
LASP, /* Asper (ʽ) ligatures */
LLEN, /* Lenis (ʼ) ligatures */
LBRB, /* Breve below (̮) ligatures */
LIGE, /* End of ligature codes */
MULTI, /* Start of multi-rune codes */
MAAS=MULTI, /* ʽα */
MALN, /* ʼα */
MAND, /* and */
MAOQ, /* a/q */
MBRA, /* <| */
MDD, /* .. */
MDDD, /* ... */
MEAS, /* ʽε */
MELN, /* ʼε */
MEMM, /* —— */
MHAS, /* ʽη */
MHLN, /* ʼη */
MIAS, /* ʽι */
MILN, /* ʼι */
MLCT, /* ct */
MLFF, /* ff */
MLFFI, /* ffi */
MLFFL, /* ffl */
MLFL, /* fl */
MLFI, /* fi */
MLLS, /* ll with swing */
MLST, /* st */
MOAS, /* ʽο */
MOLN, /* ʼο */
MOR, /* or */
MRAS, /* ʽρ */
MRLN, /* ʼρ */
MTT, /* ~~ */
MUAS, /* ʽυ */
MULN, /* ʼυ */
MWAS, /* ʽω */
MWLN, /* ʼω */
MOE, /* oe */
MES, /* em space */
MULTIE, /* End of multi-rune codes */
};
#define Nligs (LIGE-LIGS)
#define Nmulti (MULTIE-MULTI)
typedef struct Entry Entry;
typedef struct Assoc Assoc;
typedef struct Nassoc Nassoc;
typedef struct Dict Dict;
/* The raw text of one dictionary entry, as handed to the printentry routines. */
struct Entry {
char *start; /* entry starts at start */
char *end; /* and finishes just before end */
long doff; /* dictionary offset (for debugging) */
};
/* String-keyed table element, searched with lookassoc */
struct Assoc {
char *key;
long val;
};
/* Number-keyed table element, searched with looknassoc */
struct Nassoc {
long key;
long val;
};
/* One supported dictionary: where its files live and how to read it. */
struct Dict {
char *name; /* dictionary name */
char *desc; /* description */
char *path; /* path to dictionary data */
char *indexpath; /* path to index data */
long (*nextoff)(long); /* function to find next entry offset from arg */
void (*printentry)(Entry, int); /* function to print entry */
void (*printkey)(void); /* function to print pronunciation key */
};
int acomp(Rune*, Rune*);
Rune *changett(Rune *, Rune *, int);
void err(char*, ...);
void fold(Rune *);
void foldre(char*, char*);
Rune liglookup(Rune, Rune);
long lookassoc(Assoc*, int, char*);
long looknassoc(Nassoc*, int, long);
void outprint(char*, ...);
void outrune(long);
void outrunes(Rune *);
void outchar(int);
void outchars(char *);
void outnl(int);
void outpiece(char *, char *);
void runescpy(Rune*, Rune*);
long runetol(Rune*);
long oednextoff(long);
void oedprintentry(Entry, int);
void oedprintkey(void);
long ahdnextoff(long);
void ahdprintentry(Entry, int);
void ahdprintkey(void);
long pcollnextoff(long);
void pcollprintentry(Entry, int);
void pcollprintkey(void);
long pcollgnextoff(long);
void pcollgprintentry(Entry, int);
void pcollgprintkey(void);
long movienextoff(long);
void movieprintentry(Entry, int);
void movieprintkey(void);
long pgwnextoff(long);
void pgwprintentry(Entry,int);
void pgwprintkey(void);
long slangnextoff(long);
void slangprintentry(Entry, int);
void slangprintkey(void);
long robertnextoff(long);
void robertindexentry(Entry, int);
void robertprintkey(void);
long robertnextflex(long);
void robertflexentry(Entry, int);
long simplenextoff(long);
void simpleprintentry(Entry, int);
void simpleprintkey(void);
long thesnextoff(long);
void thesprintentry(Entry, int);
void thesprintkey(void);
long worldnextoff(long);
void worldprintentry(Entry, int);
void worldprintkey(void);
extern Biobuf *bdict;
extern Biobuf *bout;
extern int linelen;
extern int breaklen;
extern int outinhibit;
extern int debug;
extern Rune multitab[][5];
extern Dict dicts[];
#define asize(a) (sizeof (a)/sizeof(a[0]))

15
src/cmd/dict/egfix Executable file
View file

@ -0,0 +1,15 @@
#!/bin/rc
# Clean up a raw index ($1) in three sed passes.
# NOTE(review): some blanks inside the patterns below may originally
# have been tabs -- confirm against a pristine copy before relying on
# the exact separators.
# Pass 1: strip trailing blanks, drop lines with no separator, and for
# a line containing ", " also print a copy cut at the first comma.
sed '
s/[ ]+$//
/ /!d
/, /{; h; s/,.*//; p; g; s/ .*, / /; }
' $1 |
# Pass 2: for a line containing "(", print one copy with the
# parenthesized part removed and one with only the parentheses removed.
sed '
/\(/{; h; s/\([^)]+\)//; p; g; s/[()]//g; }
' |
# Pass 3: normalize runs of blanks and strip trailing blanks.
sed '
s/ +/ /
s/[ ]+$//
s/ +/ /g
'

8
src/cmd/dict/egfix2 Executable file
View file

@ -0,0 +1,8 @@
#!/bin/rc
# Invert a raw index ($1): for every field after the first, print that
# field followed by the first field, fold upper case to lower case, and
# sort uniquely with the key order dict expects of its index.
# NOTE(review): the separators shown as blanks may originally have been
# tabs -- confirm.
awk '
BEGIN { FS = " |, " }
{ for(i=2; i<=NF; i++)print $i " " $1 }
' $1 |
tr A-Z a-z |
sort -u -t' ' +0f -1 +0 -1 +1n -2

1108
src/cmd/dict/gb2312.c Normal file

File diff suppressed because it is too large Load diff

23
src/cmd/dict/gefix Executable file
View file

@ -0,0 +1,23 @@
#!/bin/rc
# Turn a raw index ($1) into the sorted key/offset form.
# NOTE(review): the separators shown as blanks may originally have been
# tabs -- confirm.
# Pass 1: strip trailing blanks, drop lines without a separator, remove
# the \N'349' escape and doubled quotes, drop a leading or trailing
# hyphen, and expand optional "(...)" parts and a trailing "(r, s)"
# into separate lines.
sed '
s/[ ]+$//
/ /!d
s/\\N''349''//g
s/''//g
s/ -/ /
s/-$//
/\([^,) ]+(\)|$)/{; h; s///; p; g; s/\(//; s/\)//; }
/\(r, s\)$/{; s///; p; s/$/r/; p; s/r$/s/; }
' $1 |
# Pass 2: the same two expansions again, for lines with a second group.
sed '
/\([^,) ]+(\)|$)/{; h; s///; p; g; s/\(//; s/\)//; }
/\(r, s\)$/{; s///; p; s/$/r/; p; s/r$/s/; }
' |
# Also emit an "ss" spelling for every line containing sharp s.
sed '/ß/{; p; s/ß/ss/g; }' |
# Invert to "word offset" pairs, fold case, sort uniquely.
awk '
BEGIN { FS = " |, " }
{ for(i=2; i<=NF; i++)print $i " " $1 }
' |
tr A-Z a-z |
sort -u -t' ' +0f -1 +0 -1 +1n -2

8
src/cmd/dict/getneeds Executable file
View file

@ -0,0 +1,8 @@
#!/bin/rc
# For each category (spec, tag, aux, status) select the lines of $1 that
# mention it, keep one line per distinct fifth field, and write the
# result, ordered numerically on the third field, to need<category>.
# Uses scratch files junk1..junk3 in the current directory; note that
# the final rm removes every file there whose name starts with "junk".
for (x in spec tag aux status) {
grep ' '^$x^' ' $1 > junk1
sort +4 -5 +3n -4 junk1 > junk2
awk '{if ($5 != prev) print $0; prev = $5}' junk2 > junk3
sort -n +2 -3 junk3 > need$x
rm junk*
}

1059
src/cmd/dict/jis208.c Normal file

File diff suppressed because it is too large Load diff

114
src/cmd/dict/kuten.h Normal file
View file

@ -0,0 +1,114 @@
/*
following astonishing goo courtesy of kogure.
*/
/*
* MicroSoft Kanji Encoding (SJIS) Transformation
*/
/*
* void
* J2S(unsigned char *_h, unsigned char *_l)
* JIS X 208 to MS kanji transformation.
*
* Calling/Exit State:
* _h and _l should be in their valid range.
* No return value.
*/
/*
 * Note: _h and _l are modified in place and each is evaluated more than
 * once, so they must be side-effect-free lvalues.  The macro expands to
 * a bare { } block, not an expression.
 */
#define J2S(_h, _l) { \
/* lower: 21-7e >> 40-9d,9e-fb >> 40-7e,(skip 7f),80-fc */ \
if (((_l) += (((_h)-- % 2) ? 0x1f : 0x7d)) > 0x7e) (_l)++; \
/* upper: 21-7e >> 81-af >> 81-9f,(skip a0-df),e0-ef */ \
if (((_h) = ((_h) / 2 + 0x71)) > 0x9f) (_h) += 0x40; \
}
/*
* void
* S2J(unsigned char *_h, unsigned char *_l)
* MS kanji to JIS X 208 transformation.
*
* Calling/Exit State:
* _h and _l should be in valid range.
* No return value.
*/
/*
 * Note: _h and _l are modified in place and each is evaluated more than
 * once, so they must be side-effect-free lvalues.  The macro expands to
 * a bare { } block, not an expression.
 */
#define S2J(_h, _l) { \
/* lower: 40-7e,80-fc >> 21-5f,61-dd >> 21-7e,7f-dc */ \
if (((_l) -= 0x1f) > 0x60) (_l)--; \
/* upper: 81-9f,e0-ef >> 00-1e,5f-6e >> 00-2e >> 21-7d */ \
if (((_h) -= 0x81) > 0x5e) (_h) -= 0x40; (_h) *= 2, (_h) += 0x21; \
/* upper: ,21-7d >> ,22-7e ; lower: ,7f-dc >> ,21-7e */ \
if ((_l) > 0x7e) (_h)++, (_l) -= 0x5e; \
}
/*
* int
* ISJKANA(const unsigned char *_b)
* Tests given byte is in the range of JIS X 0201 katakana.
*
* Calling/Exit State:
* Returns 1 if it is, or 0 otherwise.
*
* Note: for this and the CAN* macros below, the argument is the byte
* value itself (it is compared directly, not dereferenced) and may be
* evaluated more than once.
*/
#define ISJKANA(_b) (0xa0 <= (_b) && (_b) < 0xe0)
/*
* int
* CANS2JH(const unsigned char *_h)
* Tests given byte is in the range of valid first byte of MS
* kanji code; either acts as a subroutine of CANS2J() macro
* or can be used to parse MS kanji encoded strings.
*
* Calling/Exit State:
* Returns 1 if it is, or 0 otherwise.
*/
#define CANS2JH(_h) ((0x81 <= (_h) && (_h) < 0xf0) && !ISJKANA(_h))
/*
* int
* CANS2JL(const unsigned char *_l)
* Tests given byte is in the range of valid second byte of MS
* kanji code; acts as a subroutine of CANS2J() macro.
*
* Calling/Exit State:
* Returns 1 if it is, or 0 otherwise.
*/
#define CANS2JL(_l) (0x40 <= (_l) && (_l) < 0xfd && (_l) != 0x7f)
/*
* int
* CANS2J(const unsigned char *_h, const unsigned char *_l)
* Tests given bytes form a MS kanji code point which can be
* transformed to a valid JIS X 208 code point.
*
* Calling/Exit State:
* Returns 1 if they are, or 0 otherwise.
*/
#define CANS2J(_h, _l) (CANS2JH(_h) && CANS2JL(_l))
/*
* int
* CANJ2SB(const unsigned char *_b)
* Tests given bytes is in the range of valid 94 graphic
* character set; acts as a subroutine of CANJ2S() macro.
*
* Calling/Exit State:
* Returns 1 if it is, or 0 otherwise.
*/
#define CANJ2SB(_b) (0x21 <= (_b) && (_b) < 0x7f)
/*
* int
* CANJ2S(const unsigned char *_h, const unsigned char *_l)
* Tests given bytes form valid JIS X 208 code points
* (which can be transformed to MS kanji).
*
* Calling/Exit State:
* Returns 1 if they are, or 0 otherwise.
*/
#define CANJ2S(_h, _l) (CANJ2SB(_h) && CANJ2SB(_l))
#define JIS208MAX 8407
#define GB2312MAX 8795
#define BIG5MAX 13973
extern Rune tabjis208[JIS208MAX]; /* runes indexed by kuten */
extern Rune tabgb2312[GB2312MAX];
extern Rune tabbig5[BIG5MAX];

18
src/cmd/dict/mkfile Normal file
View file

@ -0,0 +1,18 @@
# Builds the dict command; "mk mkindex" builds the index generator.
PLAN9=../../..
<$PLAN9/src/mkhdr
TARG=dict
# per-dictionary back ends and character tables, shared with mkindex
LFILES=oed.$O ahd.$O pcollins.$O pcollinsg.$O movie.$O slang.$O robert.$O\
world.$O jis208.$O gb2312.$O thesaurus.$O simple.$O pgw.$O
OFILES=dict.$O\
$LFILES\
utils.$O
HFILES=dict.h kuten.h
LDFLAGS=$LDFLAGS -lbio -l9 -lregexp9 -lfmt -lutf
<$PLAN9/src/mkone
# mkindex replaces dict.$O with mkindex.$O and reuses everything else
mkindex: mkindex.$O $LFILES utils.$O
$LD $LDFLAGS -o $target $prereq

106
src/cmd/dict/mkindex.c Normal file
View file

@ -0,0 +1,106 @@
#include <u.h>
#include <libc.h>
#include <bio.h>
#include "dict.h"
/*
* Use this to start making an index for a new dictionary.
* Get the dictionary-specific nextoff and printentry(_,'h')
* commands working, add a record to the dicts[] array below,
* and run this program to get a list of offset,headword
* pairs
*/
Biobuf boutbuf;
Biobuf *bdict;
Biobuf *bout = &boutbuf;
int linelen;
int breaklen = 2000;
int outinhibit;
int debug;
Dict *dict; /* current dictionary */
Entry getentry(long);
/*
 * mkindex [-d dict] [-D]
 * Walk the chosen dictionary (default: the first in dicts[]) entry by
 * entry and print "offset<TAB>headword" lines for each, using the
 * dictionary's nextoff and printentry(_, 'h') routines.
 */
void
main(int argc, char **argv)
{
	int i;
	long a, ae;
	char *p;
	Entry e;

	Binit(&boutbuf, 1, OWRITE);
	dict = &dicts[0];
	ARGBEGIN {
	case 'd':
		p = ARGF();
		dict = 0;
		if(p) {
			for(i=0; dicts[i].name; i++)
				if(strcmp(p, dicts[i].name)==0) {
					dict = &dicts[i];
					break;
				}
		}
		if(!dict) {
			err("unknown dictionary: %s", p);
			exits("nodict");
		}
		break;
	case 'D':
		debug++;
		break;
	ARGEND }
	USED(argc,argv);
	bdict = Bopen(dict->path, OREAD);
	/* check before the first use of bdict */
	if(!bdict) {
		err("can't open dictionary %s", dict->path);
		exits("nodict");
	}
	ae = Bseek(bdict, 0, 2);
	/*
	 * nextoff returns -1 when there is no further entry; without
	 * the a >= 0 test the walk would start over from the top.
	 */
	for(a = 0; a >= 0 && a < ae; a = (*dict->nextoff)(a+1)) {
		linelen = 0;
		e = getentry(a);
		Bprint(bout, "%ld\t", a);
		linelen = 4;	/* only has to be approx right */
		(*dict->printentry)(e, 'h');
	}
	exits(0);
}
/*
 * Read the entry starting at dictionary offset b.
 * The entry runs to the next entry's offset, or to the end of the
 * dictionary if there is no next entry.  The text is returned in a
 * buffer reused by the next call and is NUL-terminated at ans.end,
 * which the printentry routines may rely on.  If b is at or beyond the
 * end of the dictionary an error is reported and start and end are 0.
 */
Entry
getentry(long b)
{
	long e, n, dtop;
	static Entry ans;
	static char *buf;	/* owned here so an error cannot lose it */
	static long buflen = 0;

	e = (*dict->nextoff)(b+1);
	ans.doff = b;
	ans.start = 0;
	ans.end = 0;
	if(e < 0) {
		dtop = Bseek(bdict, 0L, 2);
		if(b < dtop) {
			e = dtop;
		} else {
			err("couldn't seek to entry");
			return ans;
		}
	}
	n = e-b;
	if(n < 0)
		n = 0;
	if(n+1 > buflen) {
		buf = realloc(buf, n+1);
		if(!buf) {
			err("out of memory");
			exits("nomem");
		}
		buflen = n+1;
	}
	Bseek(bdict, b, 0);
	n = Bread(bdict, buf, n);
	/* a failed read yields an empty entry rather than a negative length */
	if(n < 0)
		n = 0;
	ans.start = buf;
	ans.end = buf + n;
	*ans.end = 0;
	return ans;
}

328
src/cmd/dict/movie.c Normal file
View file

@ -0,0 +1,328 @@
#include <u.h>
#include <libc.h>
#include <bio.h>
#include "dict.h"
/* Possible tags */
enum {
BEG, /* beginning of entry */
AB, /* abstract */
AN, /* database serial number */
AS, /* author (one at a time) */
AU, /* all authors */
AW, /* award_awardee */
BW, /* bw or c */
CA, /* cast: character_actor */
CN, /* cinematography */
CO, /* country */
CR, /* miscellaneous job_name */
DE, /* topic keyword */
DR, /* director */
ED, /* editor */
MP, /* MPAA rating (R, PG, etc.) */
NT, /* note */
PR, /* producer and for ...*/
PS, /* producer (repeats info in PR) */
RA, /* rating (letter) */
RD, /* release date */
RT, /* running time */
RV, /* review citation */
ST, /* production or release company (repeats info in PR) */
TI, /* title[; original foreign title] */
TX, /* paragraph of descriptive text */
VD, /* video information (format_time_company; or "Not Avail.") */
NTAG /* number of tags */
};
/*
 * Two-character tag that starts a line of each kind, indexed by the
 * tag enum above (mget matches these against the first two characters
 * of a line).  There is no entry for NTAG.
 */
static char *tagtab[] = {
[BEG] "$$",
[AB] "AB",
[AN] "AN",
[AS] "AS",
[AU] "AU",
[AW] "AW",
[BW] "BW",
[CA] "CA",
[CN] "CN",
[CO] "CO",
[CR] "CR",
[DE] "DE",
[DR] "DR",
[ED] "ED",
[MP] "MP",
[NT] "NT",
[PR] "PR",
[PS] "PS",
[RA] "RA",
[RD] "RD",
[RT] "RT",
[RV] "RV",
[ST] "ST",
[TI] "TI",
[TX] "TX",
[VD] "VD",
};
static char *mget(int, char *, char *, char **);
#if 0
static void moutall(int, char *, char *);
#endif
static void moutall2(int, char *, char *);
/*
 * If the range [ps, pe) has a line with the given tag, print label,
 * the tag's value and a newline; otherwise print nothing.
 */
static void
mlabeled(char *label, int tag, char *ps, char *pe)
{
	char *p, *e;

	p = mget(tag, ps, pe, &e);
	if(p) {
		outchars(label);
		outpiece(p, e);
		outnl(1);
	}
}

/*
 * Print a movie database entry.
 *   cmd 'r'  write the raw entry text
 *   cmd 'h'  print only the title line
 *   other    print the title followed by the formatted fields
 */
void
movieprintentry(Entry ent, int cmd)
{
	char *p, *e, *ps, *pe, *pn;
	int n;

	ps = ent.start;
	pe = ent.end;
	if(cmd == 'r') {
		Bwrite(bout, ps, pe-ps);
		return;
	}
	p = mget(TI, ps, pe, &e);
	if(p) {
		outpiece(p, e);
		outnl(0);
	}
	if(cmd == 'h')
		return;
	outnl(2);

	/* one comma-separated summary line; n counts the items printed so far */
	n = 0;
	p = mget(RD, ps, pe, &e);
	if(p) {
		outchars("Released: ");
		outpiece(p, e);
		n++;
	}
	p = mget(CO, ps, pe, &e);
	if(p) {
		if(n)
			outchars(", ");
		outpiece(p, e);
		n++;
	}
	p = mget(RT, ps, pe, &e);
	if(p) {
		if(n)
			outchars(", ");
		outchars("Running time: ");
		outpiece(p, e);
		n++;
	}
	p = mget(MP, ps, pe, &e);
	if(p) {
		if(n)
			outchars(", ");
		outpiece(p, e);
		n++;
	}
	p = mget(BW, ps, pe, &e);
	if(p) {
		if(n)
			outchars(", ");
		if(*p == 'c' || *p == 'C')
			outchars("Color");
		else
			outchars("B&W");
		n++;
	}
	if(n) {
		outchar('.');
		outnl(1);
	}

	mlabeled("Video: ", VD, ps, pe);
	p = mget(AU, ps, pe, &e);
	if(p) {
		outchars("By: ");
		moutall2(AU, ps, pe);
		outnl(1);
	}
	mlabeled("Director: ", DR, ps, pe);
	mlabeled("Producer: ", PR, ps, pe);
	/* label spelling corrected from "Cinematograpy" */
	mlabeled("Cinematography: ", CN, ps, pe);
	p = mget(CR, ps, pe, &e);
	if(p) {
		outchars("Other Credits: ");
		moutall2(CR, ps, pe);
	}
	outnl(2);
	p = mget(CA, ps, pe, &e);
	if(p) {
		outchars("Cast: ");
		moutall2(CA, ps, pe);
	}
	outnl(2);
	p = mget(AW, ps, pe, &e);
	if(p) {
		outchars("Awards: ");
		moutall2(AW, ps, pe);
		outnl(2);
	}
	p = mget(NT, ps, pe, &e);
	if(p) {
		outpiece(p, e);
		outnl(2);
	}
	p = mget(AB, ps, pe, &e);
	if(p) {
		outpiece(p, e);
		outnl(2);
	}

	/* every TX paragraph, one per line; mget resumes from the end of the last */
	pn = ps;
	n = 0;
	while((p = mget(TX, pn, pe, &pn)) != 0) {
		if(n++)
			outnl(1);
		outpiece(p, pn);
	}
	outnl(0);
}
/*
 * Return the offset of the first line at or after fromoff that begins
 * with "$$", or -1 if the seek fails or no such line is found.
 */
long
movienextoff(long fromoff)
{
	char *ln;

	if(Bseek(bdict, fromoff, 0) < 0)
		return -1;
	while((ln = Brdline(bdict, '\n')) != 0)
		if(ln[0] == '$' && ln[1] == '$')
			return (Boffset(bdict)-Blinelen(bdict));
	return -1;
}
/* Pronunciation-key hook for this dictionary: there is no key to print. */
void
movieprintkey(void)
{
Bprint(bout, "No key\n");
}
/*
* write a comma-separated list of all tag values between b and e
*/
#if 0
static void
moutall(int tag, char *b, char *e)
{
char *p, *pn;
int n;
n = 0;
pn = b;
while((p = mget(tag, pn, e, &pn)) != 0) {
if(n++)
outchars(", ");
outpiece(p, pn);
}
}
#endif
/*
 * Write a comma-separated list of every value of tag between b and e.
 * Values normally have the form
 *	field1_field2
 * and are written as 'field2 (field1)'.  The split is at the last
 * underscore, since field1 may itself contain underscores.  A value
 * without an underscore is written unchanged.
 */
static void
moutall2(int tag, char *b, char *e)
{
	char *val, *next, *us;
	int first;

	first = 1;
	next = b;
	while((val = mget(tag, next, e, &next)) != 0) {
		if(!first)
			outchars(", ");
		first = 0;

		/* scan back from the end of the value for the last '_' */
		for(us = next-1; us >= val && *us != '_'; us--)
			;
		if(us < val) {
			outpiece(val, next);
			continue;
		}
		/*
		 * Hack to fix cast list Himself/Herself:
		 * those keep field1 first.
		 */
		if(strncmp(us+1, "Himself", 7) == 0 ||
		   strncmp(us+1, "Herself", 7) == 0) {
			outpiece(val, us);
			outchars(" (");
			outpiece(us+1, next);
		} else {
			outpiece(us+1, next);
			outchars(" (");
			outpiece(val, us);
		}
		outchar(')');
	}
}
/*
 * Search the lines that follow the first newline at or after b for one
 * beginning with tagtab[tag]; never look at or beyond e (*e is assumed
 * to be 0).
 * Returns a pointer just past the tag and its separator (3 characters
 * in), and sets *eptr to the newline ending the value, where lines
 * starting with a blank count as continuations.  Returns 0, leaving
 * *eptr alone, if tag is out of range or no such line exists.
 */
static char *
mget(int tag, char *b, char *e, char **eptr)
{
	char *p, *t, *ans;

	if(tag < 0 || tag >= NTAG)
		return 0;
	t = tagtab[tag];
	ans = 0;
	p = b;
	/* each iteration leaves p at the start of the next line */
	while((p = strchr(p, '\n')) != 0 && ++p < e) {
		if(ans) {
			/* first non-continuation line ends the value */
			if(p[0] != ' ') {
				*eptr = p-1;
				return ans;
			}
		} else if(p[0] == t[0] && p[1] == t[1])
			ans = p+3;
	}
	/* value (if any) runs to the end of the entry */
	if(ans)
		*eptr = e-1;
	return ans;
}

1425
src/cmd/dict/oed.c Normal file

File diff suppressed because it is too large Load diff

226
src/cmd/dict/pcollins.c Normal file
View file

@ -0,0 +1,226 @@
#include <u.h>
#include <libc.h>
#include <bio.h>
#include "dict.h"
/*
* Routines for handling dictionaries in the "Paperback Collins"
* format (with tags surrounded by >....<)
*/
enum {
Buflen=1000,
};
/* More special runes */
enum {
B = MULTIE+1, /* bold */
H, /* headword start */
I, /* italics */
Ps, /* pronunciation start */
Pe, /* pronunciation end */
R, /* roman */
X, /* headword end */
};
/* Assoc tables must be sorted on first field */
static Assoc tagtab[] = {
{"AA", 0xc5},
{"AC", LACU},
{"B", B},
{"CE", LCED},
{"CI", LFRN},
{"Di", 0x131},
{"EL", 0x2d},
{"GR", LGRV},
{"H", H},
{"I", I},
{"OE", 0x152},
{"R", R},
{"TI", LTIL},
{"UM", LUML},
{"X", X},
{"[", Ps},
{"]", Pe},
{"ac", LACU},
{"ce", LCED},
{"ci", LFRN},
{"gr", LGRV},
{"oe", 0x153},
{"supe", 0x65}, /* should be raised */
{"supo", 0x6f}, /* should be raised */
{"ti", LTIL},
{"um", LUML},
{"{", Ps},
{"~", 0x7e},
{"~~", MTT},
};
/*
 * Translation of input bytes (masked to 7 bits by pcollprintentry)
 * to runes.  Newline (0x0a) becomes a blank, '>' (0x3e) is TAGS and
 * '<' (0x3c) is TAGE; bytes mapped to NONE produce no output.
 */
static Rune normtab[128] = {
/*0*/ /*1*/ /*2*/ /*3*/ /*4*/ /*5*/ /*6*/ /*7*/
/*00*/ NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
NONE, NONE, 0x20, NONE, NONE, NONE, NONE, NONE,
/*10*/ NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
/*20*/ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, '\'',
0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
/*30*/ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
0x38, 0x39, 0x3a, 0x3b, TAGE, 0x3d, TAGS, 0x3f,
/*40*/ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
0x48, 0x49, 0x4a, 0x4b, L'L', 0x4d, 0x4e, 0x4f,
/*50*/ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
0x58, 0x59, 0x5a, 0x5b, L'\\', 0x5d, 0x5e, 0x5f,
/*60*/ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
/*70*/ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, NONE,
};
static char *gettag(char *, char *);
/* entry currently being printed; set by pcollprintentry() */
static Entry curentry;
/* text of the most recent tag, filled in by gettag() */
static char tag[Buflen];
/* length in bytes of the current entry (a pointer difference) */
#define cursize (curentry.end-curentry.start)
/*
 * Print entry e of a "Paperback Collins" dictionary.
 *   cmd == 'r': copy the raw bytes of the entry to the output.
 *   cmd == 'h': print only the headword, i.e. the text between the
 *               >H< and >X< tags (output is inhibited elsewhere).
 *   otherwise:  print the whole entry with tags translated.
 * Plain runes are held back one position in rprev so that a following
 * accent tag can be combined with them by liglookup().
 * NOTE(review): whatever is still buffered in rprev when the entry
 * ends is never written; confirm that entries always end in a
 * character (such as the newline) that may be dropped.
 */
void
pcollprintentry(Entry e, int cmd)
{
	char *p, *pe;
	long r, rprev, t, rlig;
	int saveoi;
	Rune *transtab;

	p = e.start;
	pe = e.end;
	transtab = normtab;
	rprev = NONE;
	changett(0, 0, 0);
	curentry = e;
	saveoi = 0;
	if(cmd == 'h')
		outinhibit = 1;
	while(p < pe) {
		if(cmd == 'r') {
			outchar(*p++);
			continue;
		}
		r = transtab[(*p++)&0x7F];
		if(r < NONE) {
			/* Emit the rune, but buffer in case of ligature */
			if(rprev != NONE)
				outrune(rprev);
			rprev = r;
		} else if(r == TAGS) {
			p = gettag(p, pe);
			t = lookassoc(tagtab, asize(tagtab), tag);
			if(t == -1) {
				/*
				 * cursize is a pointer difference, not an int:
				 * convert it and use the long format.
				 */
				if(debug && !outinhibit)
					err("tag %ld %ld %s",
						e.doff, (long)cursize, tag);
				continue;
			}
			if(t < NONE) {
				/* tag stands for an ordinary rune */
				if(rprev != NONE)
					outrune(rprev);
				rprev = t;
			} else if(t >= LIGS && t < LIGE) {
				/* handle possible ligature */
				rlig = liglookup(t, rprev);
				if(rlig != NONE)
					rprev = rlig;	/* overwrite rprev */
				else {
					/* could print accent, but let's not */
					if(rprev != NONE)
						outrune(rprev);
					rprev = NONE;
				}
			} else if(t >= MULTI && t < MULTIE) {
				/* tag expands to a fixed sequence of runes */
				if(rprev != NONE) {
					outrune(rprev);
					rprev = NONE;
				}
				outrunes(multitab[t-MULTI]);
			} else {
				/* formatting tag: flush before changing state */
				if(rprev != NONE) {
					outrune(rprev);
					rprev = NONE;
				}
				switch(t){
				case H:
					if(cmd == 'h')
						outinhibit = 0;
					else
						outnl(0);
					break;
				case X:
					if(cmd == 'h')
						outinhibit = 1;
					else
						outchars(". ");
					break;
				case Ps:
					/* don't know enough of pron. key yet */
					saveoi = outinhibit;
					outinhibit = 1;
					break;
				case Pe:
					outinhibit = saveoi;
					break;
				}
			}
		}
	}
	if(cmd == 'h')
		outinhibit = 0;
	outnl(0);
}
/*
 * Return the offset of the first line at or after fromoff that
 * starts with the headword tag ">H<", or -1 if the seek fails or
 * no such line exists.
 */
long
pcollnextoff(long fromoff)
{
	long pos;
	char *line;

	pos = Bseek(bdict, fromoff, 0);
	if(pos < 0)
		return -1;
	while((line = Brdline(bdict, '\n')) != 0) {
		if(line[0] != '>')
			continue;
		if(line[1] != 'H')
			continue;
		if(line[2] != '<')
			continue;
		/* Boffset is just past the line; back up to its start */
		return (Boffset(bdict)-Blinelen(bdict));
	}
	return -1;
}
/*
 * Print the pronunciation key; this format has none yet, so just say so.
 */
void
pcollprintkey(void)
{
Bprint(bout, "No pronunciation key yet\n");
}
/*
 * f points just after '>'; fe points at end of entry.
 * Expect next characters from bin to match:
 *  [^ <]+<
 *     tag
 * Accumulate the tag in tag[].
 * Return pointer to after final '<'.
 *
 * At most Buflen-1 characters are examined, and nothing at or beyond
 * fe is read: if the '>' was the last byte of the entry the tag is
 * empty and f is returned unchanged.  The byte just before fe is
 * consumed but never stored.
 */
static char *
gettag(char *f, char *fe)
{
	char *t;
	int c, i;

	t = tag;
	i = Buflen;
	while(--i > 0 && f < fe) {
		c = *f++;
		if(c == '<' || f == fe)
			break;
		*t++ = c;
	}
	*t = 0;
	return f;
}

248
src/cmd/dict/pcollinsg.c Normal file
View file

@ -0,0 +1,248 @@
#include <u.h>
#include <libc.h>
#include <bio.h>
#include "dict.h"
/*
* Routines for handling dictionaries in the "Paperback Collins"
* `German' format (with tags surrounded by \5\6 and \xba\xba)
*/
/*
* \5...\6 escapes (fonts, mostly)
*
* h headword (helvetica 7 pt)
* c clause (helvetica 7 pt)
* 3 helvetica 7 pt
* 4 helvetica 6.5 pt
* s helvetica 8 pt
* x helvetica 8 pt
* y helvetica 5 pt
* m helvetica 30 pt
* 1 roman 6 pt
* 9 roman 4.5 pt
* p roman 7 pt
* q roman 4.5 pt
* 2 italic 6 pt
* 7 italic 4.5 pt
* b bold 6 pt
* a `indent 0:4 left'
* k `keep 9'
* l `size 12'
*/
enum {
/* NOTE(review): 0x69 is plain 'i'; the dotless i used elsewhere in this program is 0x131 -- confirm */
IBASE=0x69, /* dotless i */
/* size of the tag[] buffer filled by reach() */
Taglen=32,
};
/*
 * Translation of input bytes to runes.  0x05 is TAGS (start of a
 * \5...\6 escape), 0x06 is TAGE, and 0xba is SPCS (start of a
 * numbered special character ended by another 0xba).  Bytes mapped
 * to NONE print nothing themselves.
 */
static Rune intab[256] = {
/*0*/ /*1*/ /*2*/ /*3*/ /*4*/ /*5*/ /*6*/ /*7*/
/*00*/ NONE, NONE, NONE, NONE, NONE, TAGS, TAGE, NONE,
NONE, NONE, NONE, NONE, NONE, 0x20, NONE, NONE,
/*10*/ NONE, 0x2d, 0x20, 0x20, NONE, NONE, NONE, NONE,
0x20, NONE, NONE, NONE, 0x20, NONE, NONE, 0x2d,
/*20*/ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, '\'',
0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
/*30*/ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
/*40*/ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
0x48, 0x49, 0x4a, 0x4b, L'L', 0x4d, 0x4e, 0x4f,
/*50*/ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
0x58, 0x59, 0x5a, 0x5b, L'\\', 0x5d, 0x5e, 0x5f,
/*60*/ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
/*70*/ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, NONE,
/*80*/ NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
NONE, NONE, 0x20, NONE, NONE, NONE, NONE, NONE,
/*90*/ 0xdf, 0xe6, NONE, MOE, NONE, NONE, NONE, 0xf8,
NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
/*A0*/ NONE, NONE, 0x22, 0xa3, NONE, NONE, NONE, NONE,
NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
/*B0*/ NONE, NONE, NONE, NONE, NONE, NONE, NONE, 0x7e,
NONE, IBASE, SPCS, NONE, NONE, NONE, NONE, NONE,
/*C0*/ NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
/*D0*/ NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
/*E0*/ NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
/*F0*/ 0x20, 0x20, NONE, NONE, NONE, NONE, NONE, NONE,
NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
};
/*
 * Numbered special characters: the decimal number found between a
 * pair of 0xba bytes, and the rune to print for it.
 * Sorted by number, since looknassoc() does a binary search.
 */
static Nassoc numtab[] = {
{1, 0x2b},
{4, 0x3d},
{7, 0xb0},
{11, 0x2248},
{69, 0x2666},
{114, 0xae},
{340, 0x25b},
{341, 0x254},
{342, 0x28c},
{343, 0x259},
{345, 0x292},
{346, 0x283},
{347, 0x275},
{348, 0x28a},
{349, 0x2c8},
{351, 0x26a},
{352, 0x25c},
{354, 0x251},
{355, 0x7e},
{356, 0x252},
{384, 0x273},
{445, 0xf0}, /* BUG -- should be script eth */
};
/*
 * Overstrike characters: the character that follows a '^' in the
 * text, and the accent code handed to liglookup() to combine it with
 * the preceding rune.  Sorted by character for looknassoc().
 */
static Nassoc overtab[] = {
{0x2c, LCED},
{0x2f, LACU},
{0x3a, LUML},
{L'\\', LGRV},
{0x5e, LFRN},
{0x7e, LTIL},
};
static uchar *reach(uchar*, int);
/* entry currently being printed; reach() stops at its end */
static Entry curentry;
/* text collected by the most recent reach() call */
static char tag[Taglen];
/*
 * Print entry e of a "Paperback Collins" dictionary in the `German'
 * format.
 *   cmd == 'r': copy the raw bytes of the entry to the output.
 *   cmd == 'h': print only text set in font 'h' (the headword).
 *   otherwise:  print the whole entry.
 * The previous rune is held in rprev so that a '^' followed by an
 * accent character can be merged with it by liglookup().
 */
void
pcollgprintentry(Entry e, int cmd)
{
	uchar *p, *pe;
	int r, rprev = NONE, rx, over = 0, font;
	/* room for \N'...' around a tag of any length that fits in tag[] */
	char buf[sizeof tag + 8];

	p = (uchar *)e.start;
	pe = (uchar *)e.end;
	curentry = e;
	if(cmd == 'h')
		outinhibit = 1;
	while(p < pe){
		if(cmd == 'r'){
			outchar(*p++);
			continue;
		}
		switch(r = intab[*p++]){	/* assign = */
		case TAGS:
			/* \5...\6 escape: first character names the font */
			if(rprev != NONE){
				outrune(rprev);
				rprev = NONE;
			}
			p = reach(p, 0x06);
			font = tag[0];
			if(cmd == 'h')
				outinhibit = (font != 'h');
			break;
		case TAGE:	/* an extra one */
			break;
		case SPCS:
			/* numbered special character between 0xba bytes */
			p = reach(p, 0xba);
			r = looknassoc(numtab, asize(numtab), strtol(tag,0,0));
			if(r < 0){
				/* unknown number: show it symbolically */
				if(rprev != NONE){
					outrune(rprev);
					rprev = NONE;
				}
				sprint(buf, "\\N'%s'", tag);
				outchars(buf);
				break;
			}
			/* else fall through */
		default:
			if(over){
				/* r is the character that followed a '^' */
				rx = looknassoc(overtab, asize(overtab), r);
				if(rx > 0)
					rx = liglookup(rx, rprev);
				if(rx > 0 && rx != NONE)
					outrune(rx);
				else{
					/* no combined form: print the pieces */
					if(rprev != NONE)
						outrune(rprev);
					if(r == ':')
						outrune(0xa8);
					else{
						outrune(0x5e);
						outrune(r);
					}
				}
				over = 0;
				rprev = NONE;
			}else if(r == '^'){
				over = 1;
			}else{
				if(rprev != NONE)
					outrune(rprev);
				rprev = r;
			}
		}
	}
	if(rprev != NONE)
		outrune(rprev);
	if(cmd == 'h')
		outinhibit = 0;
	outnl(0);
}
/*
 * Return the offset of the next headword escape (the bytes 0x05 'h'
 * 0x06) found at or after fromoff.  If none is found before end of
 * file, return the offset just past the last '\r' seen, or -1 if
 * there was none or the seek failed.
 */
long
pcollgnextoff(long fromoff)
{
	int c, state = 0;
	long defoff = -1;	/* holds a file offset, so long like the result */

	if(Bseek(bdict, fromoff, 0) < 0)
		return -1;
	while((c = Bgetc(bdict)) >= 0){
		if(c == '\r')
			defoff = Boffset(bdict);
		/* state counts how much of \5 h \6 has been matched */
		switch(state){
		case 0:
			if(c == 0x05)
				state = 1;
			break;
		case 1:
			if(c == 'h')
				state = 2;
			else
				state = 0;
			break;
		case 2:
			if(c == 0x06)
				return (Boffset(bdict)-3);
			else
				state = 0;
			break;
		}
	}
	return defoff;
}
/*
 * Print the pronunciation key; this format has none yet, so just say so.
 */
void
pcollgprintkey(void)
{
Bprint(bout, "No pronunciation key yet\n");
}
/*
 * Copy bytes starting at p into tag[] until tagchar is seen, the
 * current entry ends, or tag[] is full; tag[] is always terminated.
 * Returns the position after the last byte consumed (the tagchar
 * itself is consumed but not stored).
 */
static uchar *
reach(uchar *p, int tagchar)
{
	uchar *end;
	char *dst, *last;
	int ch;

	end = (uchar *)curentry.end;
	last = &tag[sizeof tag-1];
	for(dst = tag; p < end; ){
		ch = *p++;
		if(ch == tagchar)
			break;
		*dst++ = ch;
		if(dst >= last)
			break;
	}
	*dst = 0;
	return p;
}

1165
src/cmd/dict/pgw.c Normal file

File diff suppressed because it is too large Load diff

6
src/cmd/dict/rev.awk Normal file
View file

@ -0,0 +1,6 @@
# Swap the two fields of every two-field line (tab separated on
# output); report any other line with an ERROR prefix.
{
	if (NF == 2)
		printf "%s\t%s\n", $2, $1
	else
		print "ERROR " $0
}

312
src/cmd/dict/robert.c Normal file
View file

@ -0,0 +1,312 @@
#include <u.h>
#include <libc.h>
#include <bio.h>
#include "dict.h"
/*
* Robert Électronique.
*/
/*
 * Control codes stored in intab[] for bytes 0xc8-0xd4; they lie above
 * MULTIE so they cannot collide with ordinary runes.
 */
enum
{
CIT = MULTIE+1, /* citation ptr followed by long int and ascii label */
BROM, /* bold roman */
ITON, /* start italic */
ROM, /* roman */
SYM, /* symbol font? */
HEL, /* helvetica */
BHEL, /* helvetica bold */
SMALL, /* smaller? */
ITOFF, /* end italic */
SUP, /* following character is superscript */
SUB /* following character is subscript */
};
/*
 * Translation of input bytes to runes.  Bytes 0xc8-0xd4 carry the
 * control codes above (CIT, fonts, SUP, SUB); bytes mapped to NONE
 * are shown only when debugging.
 * NOTE(review): the 0x01-0x1f and 0x80-0xff rows look like an IBM PC
 * (code page 437) character set -- confirm.
 */
static Rune intab[256] = {
/*0*/ /*1*/ /*2*/ /*3*/ /*4*/ /*5*/ /*6*/ /*7*/
/*00*/ NONE, 0x263a, 0x263b, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022,
0x25d8, 0x298, L'\n', 0x2642, 0x2640, 0x266a, 0x266b, 0x203b,
/*10*/ 0x21e8, 0x21e6, 0x2195, 0x203c, 0xb6, 0xa7, 0x2043, 0x21a8,
0x2191, 0x2193, 0x2192, 0x2190, 0x2319, 0x2194, 0x25b4, 0x25be,
/*20*/ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, L'\'',
0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
/*30*/ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
/*40*/ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
0x48, 0x49, 0x4a, 0x4b, L'L', 0x4d, 0x4e, 0x4f,
/*50*/ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
0x58, 0x59, 0x5a, 0x5b, L'\\', 0x5d, 0x5e, 0x5f,
/*60*/ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
/*70*/ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
/*80*/ 0xc7, 0xfc, 0xe9, 0xe2, 0xe4, 0xe0, 0xe5, 0xe7,
0xea, 0xeb, 0xe8, 0xef, 0xee, 0xec, 0xc4, 0xc5,
/*90*/ 0xc9, 0xe6, 0xc6, 0xf4, 0xf6, 0xf2, 0xfb, 0xf9,
0xff, 0xd6, 0xdc, 0xa2, 0xa3, 0xa5, 0x20a7, 0x283,
/*a0*/ 0xe1, 0xed, 0xf3, 0xfa, 0xf1, 0xd1, 0xaa, 0xba,
0xbf, 0x2310, 0xac, 0xbd, 0xbc, 0xa1, 0xab, 0xbb,
/*b0*/ NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
/*c0*/ NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
CIT, BROM, NONE, ITON, ROM, SYM, HEL, BHEL,
/*d0*/ NONE, SMALL, ITOFF, SUP, SUB, NONE, NONE, NONE,
NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
/*e0*/ 0x3b1, 0xdf, 0x3b3, 0x3c0, 0x3a3, 0x3c3, 0xb5, 0x3c4,
0x3a6, 0x398, 0x3a9, 0x3b4, 0x221e, 0xd8, 0x3b5, 0x2229,
/*f0*/ 0x2261, 0xb1, 0x2265, 0x2264, 0x2320, 0x2321, 0xf7, 0x2248,
0xb0, 0x2219, 0xb7, 0x221a, 0x207f, 0xb2, 0x220e, 0xa0,
};
/*
 * Superscript forms, indexed by the plain character.  Characters with
 * no entry are left as 0.
 * Superscript one, two and three live in Latin-1 (U+00B9, U+00B2,
 * U+00B3); U+2071 is superscript i and U+2072/U+2073 are unassigned.
 */
static Rune suptab[] = {
	['0'] 0x2070,	['1'] 0xb9,	['2'] 0xb2,	['3'] 0xb3,
	['4'] 0x2074,	['5'] 0x2075,	['6'] 0x2076,	['7'] 0x2077,
	['8'] 0x2078,	['9'] 0x2079,	['+'] 0x207a,	['-'] 0x207b,
	['='] 0x207c,	['('] 0x207d,	[')'] 0x207e,	['a'] 0xaa,
	['n'] 0x207f,	['o'] 0xba
};
/*
 * Subscript forms, indexed by the plain character.  Characters with
 * no entry are left as 0.
 */
static Rune subtab[] = {
['0'] 0x2080, ['1'] 0x2081, ['2'] 0x2082, ['3'] 0x2083,
['4'] 0x2084, ['5'] 0x2085, ['6'] 0x2086, ['7'] 0x2087,
['8'] 0x2088, ['9'] 0x2089, ['+'] 0x208a, ['-'] 0x208b,
['='] 0x208c, ['('] 0x208d, [')'] 0x208e
};
/* fetch a 2- or 4-byte integer stored most significant byte first */
#define GSHORT(p) (((p)[0]<<8) | (p)[1])
#define GLONG(p) (((p)[0]<<24) | ((p)[1]<<16) | ((p)[2]<<8) | (p)[3])
/* data files: citations, definitions, etymologies, pronunciation key */
static char cfile[] = "/lib/dict/robert/cits.rob";
static char dfile[] = "/lib/dict/robert/defs.rob";
static char efile[] = "/lib/dict/robert/etym.rob";
static char kfile[] = "/lib/dict/robert/_phon";
/* buffers on cfile, dfile and efile, opened on first use */
static Biobuf * cb;
static Biobuf * db;
static Biobuf * eb;
static Biobuf * Bouvrir(char*);
static void citation(int, int);
static void robertprintentry(Entry*, Entry*, int);
/*
 * Print the entry described by the pointer record e.
 * The record holds, most significant byte first: etymology address
 * (4 bytes) and length (2), definition address (4) and length (2),
 * and a further 4-byte value that is not used here.
 * The definition is read from dfile and, unless only the headword is
 * wanted (cmd == 'h'), the etymology from efile.
 * If memory cannot be allocated the affected text is skipped; if a
 * read comes up short only the bytes actually read are printed.
 */
void
robertindexentry(Entry e, int cmd)
{
	uchar *p = (uchar *)e.start;
	long ea, el, da, dl, fa, n;
	Entry def, etym;

	ea = GLONG(&p[0]);
	el = GSHORT(&p[4]);
	da = GLONG(&p[6]);
	dl = GSHORT(&p[10]);
	fa = GLONG(&p[12]);
	USED(fa);

	if(db == 0)
		db = Bouvrir(dfile);
	def.start = malloc(dl+1);
	if(def.start == 0)
		return;		/* no memory: nothing can be printed */
	def.doff = da;
	Bseek(db, da, 0);
	n = Bread(db, def.start, dl);
	if(n < 0)
		n = 0;
	def.end = def.start + n;	/* never expose unread bytes */
	*def.end = 0;

	if(cmd == 'h'){
		robertprintentry(&def, 0, cmd);
		free(def.start);
		return;
	}

	if(eb == 0)
		eb = Bouvrir(efile);
	etym.start = malloc(el+1);
	if(etym.start == 0){
		/* no memory for the etymology: print the definition alone */
		robertprintentry(&def, 0, cmd);
		free(def.start);
		return;
	}
	etym.doff = ea;
	Bseek(eb, ea, 0);
	n = Bread(eb, etym.start, el);
	if(n < 0)
		n = 0;
	etym.end = etym.start + n;
	*etym.end = 0;
	robertprintentry(&def, &etym, cmd);
	free(etym.start);
	free(def.start);
}
/*
 * Print the text of def, translating bytes through intab.
 *   cmd == 'r': copy the raw bytes to the output.
 *   cmd == 'h': stop after the first line (the headword).
 * If etym is non-nil it is printed right after the first line of def.
 * A SUP or SUB code makes the next character come out in its
 * superscript or subscript form when suptab/subtab has one;
 * characters without such a form are printed unchanged.
 * A CIT code is followed by a 4-byte citation address (least
 * significant byte first) and an ascii label; the citation itself is
 * printed by citation().
 */
static void
robertprintentry(Entry *def, Entry *etym, int cmd)
{
	uchar *p, *pe;
	Rune r; int c, n;
	int baseline = 0;
	int lineno = 0;
	int cit = 0;

	p = (uchar *)def->start;
	pe = (uchar *)def->end;
	while(p < pe){
		if(cmd == 'r'){
			outchar(*p++);
			continue;
		}
		c = *p++;
		switch(r = intab[c]){	/* assign = */
		case BROM:
		case ITON:
		case ROM:
		case SYM:
		case HEL:
		case BHEL:
		case SMALL:
		case ITOFF:
		case NONE:
			/* font changes and unknown bytes: visible only in debug */
			if(debug)
				outprint("\\%.2ux", c);
			baseline = 0;
			break;
		case SUP:
			baseline = 1;
			break;
		case SUB:
			baseline = -1;
			break;
		case CIT:
			n = p[0] | (p[1]<<8) | (p[2]<<16) | (p[3]<<24);
			p += 4;
			if(debug)
				outprint("[%d]", n);
			/* skip the ascii label that follows the address */
			while(*p == ' ' || ('0'<=*p && *p<='9') || *p == '.'){
				if(debug)
					outchar(*p);
				++p;
			}
			++cit;
			outnl(2);
			citation(n, cmd);
			baseline = 0;
			break;
		case '\n':
			outnl(0);
			baseline = 0;
			++lineno;
			break;
		default:
			/*
			 * The tables are sparse: a zero entry means there is
			 * no raised/lowered form, so keep the plain rune
			 * rather than writing a NUL.
			 */
			if(baseline > 0 && r < nelem(suptab) && suptab[r] != 0)
				r = suptab[r];
			else if(baseline < 0 && r < nelem(subtab) && subtab[r] != 0)
				r = subtab[r];
			if(cit){
				outchar('\n');
				cit = 0;
			}
			outrune(r);
			baseline = 0;
			break;
		}
		if(r == '\n'){
			if(cmd == 'h')
				break;
			if(lineno == 1 && etym)
				robertprintentry(etym, 0, cmd);
		}
	}
	outnl(0);
}
/*
 * Print the citation stored at offset addr of cfile: the text from
 * there up to the next 0xc8 byte.  Nothing is printed if no such
 * text can be read.
 */
static void
citation(int addr, int cmd)
{
	Entry cit;

	if(cb == 0)
		cb = Bouvrir(cfile);
	Bseek(cb, addr, 0);
	cit.start = Brdline(cb, 0xc8);
	if(cit.start == 0)
		return;		/* end of file or no delimiter found */
	/* drop the delimiter and terminate the text in place */
	cit.end = cit.start + Blinelen(cb) - 1;
	cit.doff = addr;
	*cit.end = 0;
	robertprintentry(&cit, 0, cmd);
}
/*
 * Pointer records are 16 bytes long: return the offset of the first
 * 16-byte boundary strictly after fromoff.
 */
long
robertnextoff(long fromoff)
{
	return (fromoff | 15) + 1;
}
/*
 * Copy the pronunciation key file (kfile) to the output, line by line.
 */
void
robertprintkey(void)
{
	Biobuf *kb;
	char *line;

	kb = Bouvrir(kfile);
	for(;;){
		line = Brdline(kb, '\n');
		if(line == 0)
			break;
		Bwrite(bout, line, Blinelen(kb));
	}
	Bterm(kb);
}
/*
 * Print an entry of the verb-forms dictionary.  '$' in the data
 * counts as a newline.
 *   cmd == 'r': copy the raw bytes to the output.
 *   cmd == 'h': print only the second line of the entry.
 * Codes above MULTIE are shown only when debugging.
 */
void
robertflexentry(Entry e, int cmd)
{
	uchar *s, *end;
	Rune r;
	int byte, line, headonly;

	headonly = (cmd == 'h');
	line = 1;
	s = (uchar *)e.start;
	end = (uchar *)e.end;
	while(s < end){
		if(cmd == 'r'){
			Bputc(bout, *s++);
			continue;
		}
		byte = *s++;
		r = intab[byte];
		if(r == '$')
			r = '\n';
		if(r == '\n'){
			line++;
			if(headonly && line > 2)
				break;
		}
		if(headonly && line < 2)
			continue;
		if(r > MULTIE){
			if(debug)
				Bprint(bout, "\\%.2ux", byte);
			continue;
		}
		if(r >= Runeself)
			Bputrune(bout, r);
		else
			Bputc(bout, r);
	}
	outnl(0);
}
/*
 * Return the offset just past the next '$' at or after fromoff,
 * or -1 if the seek fails or there is none.
 */
long
robertnextflex(long fromoff)
{
	int ch;

	if(Bseek(bdict, fromoff, 0) < 0)
		return -1;
	for(;;){
		ch = Bgetc(bdict);
		if(ch < 0)
			return -1;
		if(ch == '$')
			return Boffset(bdict);
	}
}
/*
 * Open fichier for reading and return its buffer; if it cannot be
 * opened, complain and exit.
 */
static Biobuf *
Bouvrir(char *fichier)
{
	Biobuf *b;

	b = Bopen(fichier, OREAD);
	if(b != 0)
		return b;
	fprint(2, "%s: impossible d'ouvrir %s: %r\n", argv0, fichier);
	exits("ouvrir");
	return b;
}

46
src/cmd/dict/simple.c Normal file
View file

@ -0,0 +1,46 @@
#include <u.h>
#include <libc.h>
#include <bio.h>
#include "dict.h"
/*
* Routines for handling dictionaries in UTF, headword
* separated from entry by tab, entries separated by newline.
*/
/*
 * Print one entry: the headword alone when cmd is 'h', otherwise the
 * whole line with the tab after the headword turned into a blank.
 * Printing stops at the first newline.
 */
void
simpleprintentry(Entry e, int cmd)
{
	uchar *s, *end;
	int ch;

	end = (uchar *)e.end;
	for(s = (uchar *)e.start; s < end; s++){
		ch = *s;
		if(ch == '\n')
			break;
		if(ch == '\t'){
			if(cmd == 'h')
				break;
			ch = ' ';
		}
		outchar(ch);
	}
	outnl(0);
}
/*
 * Return the offset of the line following the one that contains
 * fromoff, or -1 if the seek fails or no complete line can be read.
 */
long
simplenextoff(long fromoff)
{
	if(Bseek(bdict, fromoff, 0) >= 0 && Brdline(bdict, '\n') != 0)
		return Boffset(bdict);
	return -1;
}
/*
 * Print the pronunciation key; this format has none, so just say so.
 */
void
simpleprintkey(void)
{
Bprint(bout, "No pronunciation key.\n");
}

203
src/cmd/dict/slang.c Normal file
View file

@ -0,0 +1,203 @@
#include <u.h>
#include <libc.h>
#include <bio.h>
#include "dict.h"
/* Possible tags; these are the values stored in tagtab and returned by sget() */
enum {
DF, /* definition */
DX, /* definition/example */
ET, /* etymology */
EX, /* example */
LA, /* label */
ME, /* main entry */
NU, /* sense number */
PR, /* pronunciation */
PS, /* grammar part */
XR, /* cross reference */
XX, /* cross reference (whole entry) */
};
/*
 * Assoc tables must be sorted on first field
 * (lookassoc() does a binary search).
 * Maps the two-letter tag that starts a line to its tag value.
 */
static Assoc tagtab[] = {
{"df", DF},
{"dx", DX},
{"et", ET},
{"ex", EX},
{"la", LA},
{"me", ME},
{"nu", NU},
{"pr", PR},
{"ps", PS},
{"xr", XR},
{"xx", XX},
};
/* local helpers, defined after their first use */
static long sget(char *, char *, char **, char **);
static void soutpiece(char *, char *);
/*
 * Print a slang dictionary entry.  With cmd == 'h' only the main
 * entry (headword) is printed, and only if it is the first tagged
 * value.  Otherwise every tagged value is printed in turn, each
 * wrapped in the punctuation that suits its tag; printing stops at
 * the first position where sget() finds no further tag.
 */
void
slangprintentry(Entry e, int cmd)
{
	char *cur, *end, *vb, *ve;
	char *pre, *post;
	long tagval;

	cur = e.start;
	end = e.end;
	if(cmd == 'h') {
		tagval = sget(cur, end, &vb, &ve);
		if(tagval == ME)
			soutpiece(vb, ve);
		outnl(0);
		return;
	}
	while(cur < end) {
		tagval = sget(cur, end, &vb, &ve);
		/* text before and after the value; 0 means none */
		pre = 0;
		post = ". ";
		switch(tagval) {
		case DF:
		case DX:
			break;
		case ET:
		case PR:
			pre = "[";
			post = "] ";
			break;
		case EX:
			pre = "E.g., ";
			break;
		case LA:
			pre = "(";
			post = ") ";
			break;
		case ME:
			/* headword stands on a line of its own */
			outnl(0);
			post = 0;
			break;
		case NU:
			outnl(2);
			break;
		case PS:
			outnl(1);
			break;
		case XR:
		case XX:
			pre = "See ";
			break;
		default:
			/* no more tags */
			goto done;
		}
		if(pre != 0)
			outchars(pre);
		soutpiece(vb, ve);
		if(post != 0)
			outchars(post);
		else
			outnl(0);
		cur = ve;
	}
done:
	outnl(0);
}
/*
 * Return the offset of the first line at or after fromoff that
 * starts a main entry ("me "), or -1 if the seek fails or no such
 * line exists.
 */
long
slangnextoff(long fromoff)
{
	long pos;
	char *line;

	pos = Bseek(bdict, fromoff, 0);
	if(pos < 0)
		return -1;
	while((line = Brdline(bdict, '\n')) != 0) {
		if(line[0] != 'm')
			continue;
		if(line[1] != 'e')
			continue;
		if(line[2] != ' ')
			continue;
		/* Boffset is just past the line; back up to its start */
		return (Boffset(bdict)-Blinelen(bdict));
	}
	return -1;
}
/*
 * Print the pronunciation key; this dictionary has none, so just say so.
 */
void
slangprintkey(void)
{
Bprint(bout, "No key\n");
}
/*
 * Starting from b, find next line beginning with a tag.
 * Don't go past e, but assume *e==0.
 * Return tag value, or -1 if no more tags before e.
 * Set pvb to beginning of value (after tag).
 * Set pve to point at newline that ends the value.
 *
 * A line is a tag candidate when its third character is a blank; its
 * first two characters are then looked up in tagtab.  The value of
 * the first known tag runs until the next line with a known tag, or
 * to e-1 if there is none.
 * NOTE(review): when a following tag line ends the value, *pve is set
 * to the start of that line (one past the newline), not to the
 * newline itself as stated above -- confirm which is intended.
 * When -1 is returned neither *pvb nor *pve is set.
 */
static long
sget(char *b, char *e, char **pvb, char **pve)
{
char *p;
char buf[3];
long t, tans;
buf[2] = 0;
tans = -1;
for(p = b;;) {
/* p is at the start of a line here */
if(p[2] == ' ') {
buf[0] = p[0];
buf[1] = p[1];
t = lookassoc(tagtab, asize(tagtab), buf);
if(t < 0) {
/* unknown tag: report it when debugging and skip over it */
if(debug)
err("tag %s\n", buf);
p += 3;
} else {
if(tans < 0) {
/* first known tag: its value starts after "xx " */
p += 3;
tans = t;
*pvb = p;
} else {
/* next known tag: the value ends here */
*pve = p;
break;
}
}
}
/* move on to the start of the next line */
p = strchr(p, '\n');
if(!p || ++p >= e) {
if(tans >= 0)
*pve = e-1;
break;
}
}
return tans;
}
/*
 * Print the text from b up to (not including) e, turning newlines
 * into blanks, squeezing a blank that follows a blank, and dropping
 * every '@'.
 */
static void
soutpiece(char *b, char *e)
{
	int ch, prev;

	for(prev = 0; b < e; prev = ch) {
		ch = *b++;
		if(ch == '\n')
			ch = ' ';
		if(ch == '@')
			continue;
		if(ch == ' ' && prev == ' ')
			continue;
		outchar(ch);
	}
}

13
src/cmd/dict/t.awk Normal file
View file

@ -0,0 +1,13 @@
# Lines without exactly two fields pass through untouched.
NF != 2 {
	print $0
	next
}
# So do lines whose second field has no " or " alternatives,
# or has a parenthesized "(or".
$2 !~ / or / || $2 ~ /\(or/ {
	print $0
	next
}
# Otherwise emit one line per alternative, each paired with field 1.
{
	n = split($2, a, / or /)
	for(i = 1; i <= n; i++)
		printf "%s\t%s\n", $1, a[i]
}

86
src/cmd/dict/thesaurus.c Normal file
View file

@ -0,0 +1,86 @@
#include <u.h>
#include <libc.h>
#include <bio.h>
#include "dict.h"
/*
 * Print a thesaurus entry.
 *   cmd == 'r': copy the raw bytes to the output.
 *   cmd == 'h': print only the headword; stop at the first '*' code
 *               other than "*L", or at a blank followed by '*'.
 *   otherwise:  print the entry, interpreting the codes below.
 * Codes:
 *   *L  starts the entry (forces a new line unless cmd is 'h')
 *   *Sx prints x in parentheses
 *   #cn prints letter c (a, e, o or c) with accent number n (1-5);
 *       other letters are printed unaccented
 *   +, < are dropped
 * A code cut short by the end of the entry is ignored; no byte at or
 * beyond e.end is read.
 */
void
thesprintentry(Entry e, int cmd)
{
	char *p, *pe;
	int c, i;

	p = e.start;
	pe = e.end;
	while(p < pe) {
		c = *p++;
		if(cmd == 'r') {
			outchar(c);
			continue;
		}
		switch(c) {
		case '*':
			if(p >= pe)
				break;		/* truncated code */
			c = *p++;
			if(cmd == 'h' && c != 'L') {
				outnl(0);
				return;
			}
			if(c == 'L' && cmd != 'h')
				outnl(0);
			if(c == 'S') {
				outchar('(');
				if(p < pe)
					outchar(*p++);
				outchar(')');
			}
			break;
		case '#':
			if(pe - p < 2) {
				/* truncated code: nothing left to print */
				p = pe;
				break;
			}
			c = *p++;
			i = *p++ - '0' - 1;
			if(i < 0 || i > 4)
				break;
			switch(c) {
			case 'a': outrune(L"áàâäa"[i]); break;
			case 'e': outrune(L"éèêëe"[i]); break;
			case 'o': outrune(L"óòôöo"[i]); break;
			case 'c': outrune(L"ccccç"[i]); break;
			default: outchar(c); break;
			}
			break;
		case '+':
		case '<':
			break;
		case ' ':
			if(cmd == 'h' && p < pe && *p == '*') {
				outnl(0);
				return;
			}
			/* fall through */
		default:
			outchar(c);
		}
	}
	outnl(0);
}
/*
 * Return the offset of the first line at or after fromoff that
 * starts with "*L", or -1 if the seek fails or no such line exists.
 */
long
thesnextoff(long fromoff)
{
	long pos;
	char *line;

	pos = Bseek(bdict, fromoff, 0);
	if(pos < 0)
		return -1;
	while((line = Brdline(bdict, '\n')) != 0) {
		if(line[0] != '*')
			continue;
		if(line[1] != 'L')
			continue;
		/* Boffset is just past the line; back up to its start */
		return (Boffset(bdict)-Blinelen(bdict));
	}
	return -1;
}
/*
 * Print the pronunciation key; the thesaurus has none, so just say so.
 */
void
thesprintkey(void)
{
Bprint(bout, "No key\n");
}

577
src/cmd/dict/utils.c Normal file
View file

@ -0,0 +1,577 @@
#include <u.h>
#include <libc.h>
#include <bio.h>
#include "dict.h"
/*
 * The known dictionaries.  Each entry gives, in order: the short name,
 * a description, the data file, the index file, and the three
 * format-specific routines (next-offset, print-entry, print-key).
 * Several names may share one data file and differ only in the index.
 * The list ends with an all-zero entry.
 * NOTE(review): field meanings are read off the values here; check
 * them against the Dict declaration in dict.h.
 */
Dict dicts[] = {
{"oed", "Oxford English Dictionary, 2nd Ed.",
"dict/oed2", "dict/oed2index",
oednextoff, oedprintentry, oedprintkey},
{"ahd", "American Heritage Dictionary, 2nd College Ed.",
"ahd/DICT.DB", "ahd/index",
ahdnextoff, ahdprintentry, ahdprintkey},
{"pgw", "Project Gutenberg Webster Dictionary",
"dict/pgw", "dict/pgwindex",
pgwnextoff, pgwprintentry, pgwprintkey},
{"thesaurus", "Collins Thesaurus",
"dict/thesaurus", "dict/thesindex",
thesnextoff, thesprintentry, thesprintkey},
{"ce", "Gendai Chinese->English",
"dict/world/sansdata/sandic24.dat",
"dict/world/sansdata/ceindex",
worldnextoff, worldprintentry, worldprintkey},
{"ceh", "Gendai Chinese->English (Hanzi index)",
"dict/world/sansdata/sandic24.dat",
"dict/world/sansdata/cehindex",
worldnextoff, worldprintentry, worldprintkey},
{"ec", "Gendai English->Chinese",
"dict/world/sansdata/sandic24.dat",
"dict/world/sansdata/ecindex",
worldnextoff, worldprintentry, worldprintkey},
{"dae", "Gyldendal Danish->English",
"dict/world/gylddata/sandic30.dat",
"dict/world/gylddata/daeindex",
worldnextoff, worldprintentry, worldprintkey},
{"eda", "Gyldendal English->Danish",
"dict/world/gylddata/sandic29.dat",
"dict/world/gylddata/edaindex",
worldnextoff, worldprintentry, worldprintkey},
{"due", "Wolters-Noordhoff Dutch->English",
"dict/world/woltdata/sandic07.dat",
"dict/world/woltdata/deindex",
worldnextoff, worldprintentry, worldprintkey},
{"edu", "Wolters-Noordhoff English->Dutch",
"dict/world/woltdata/sandic06.dat",
"dict/world/woltdata/edindex",
worldnextoff, worldprintentry, worldprintkey},
{"fie", "WSOY Finnish->English",
"dict/world/werndata/sandic32.dat",
"dict/world/werndata/fieindex",
worldnextoff, worldprintentry, worldprintkey},
{"efi", "WSOY English->Finnish",
"dict/world/werndata/sandic31.dat",
"dict/world/werndata/efiindex",
worldnextoff, worldprintentry, worldprintkey},
{"fe", "Collins French->English",
"dict/fe", "dict/feindex",
pcollnextoff, pcollprintentry, pcollprintkey},
{"ef", "Collins English->French",
"dict/ef", "dict/efindex",
pcollnextoff, pcollprintentry, pcollprintkey},
{"ge", "Collins German->English",
"dict/ge", "dict/geindex",
pcollgnextoff, pcollgprintentry, pcollgprintkey},
{"eg", "Collins English->German",
"dict/eg", "dict/egindex",
pcollgnextoff, pcollgprintentry, pcollgprintkey},
{"ie", "Collins Italian->English",
"dict/ie", "dict/ieindex",
pcollnextoff, pcollprintentry, pcollprintkey},
{"ei", "Collins English->Italian",
"dict/ei", "dict/eiindex",
pcollnextoff, pcollprintentry, pcollprintkey},
{"je", "Sanshusha Japanese->English",
"dict/world/sansdata/sandic18.dat",
"dict/world/sansdata/jeindex",
worldnextoff, worldprintentry, worldprintkey},
{"jek", "Sanshusha Japanese->English (Kanji index)",
"dict/world/sansdata/sandic18.dat",
"dict/world/sansdata/jekindex",
worldnextoff, worldprintentry, worldprintkey},
{"ej", "Sanshusha English->Japanese",
"dict/world/sansdata/sandic18.dat",
"dict/world/sansdata/ejindex",
worldnextoff, worldprintentry, worldprintkey},
{"tjeg", "Sanshusha technical Japanese->English,German",
"dict/world/sansdata/sandic16.dat",
"dict/world/sansdata/tjegindex",
worldnextoff, worldprintentry, worldprintkey},
{"tjegk", "Sanshusha technical Japanese->English,German (Kanji index)",
"dict/world/sansdata/sandic16.dat",
"dict/world/sansdata/tjegkindex",
worldnextoff, worldprintentry, worldprintkey},
{"tegj", "Sanshusha technical English->German,Japanese",
"dict/world/sansdata/sandic16.dat",
"dict/world/sansdata/tegjindex",
worldnextoff, worldprintentry, worldprintkey},
{"tgje", "Sanshusha technical German->Japanese,English",
"dict/world/sansdata/sandic16.dat",
"dict/world/sansdata/tgjeindex",
worldnextoff, worldprintentry, worldprintkey},
{"ne", "Kunnskapforlaget Norwegian->English",
"dict/world/kunndata/sandic28.dat",
"dict/world/kunndata/neindex",
worldnextoff, worldprintentry, worldprintkey},
{"en", "Kunnskapforlaget English->Norwegian",
"dict/world/kunndata/sandic27.dat",
"dict/world/kunndata/enindex",
worldnextoff, worldprintentry, worldprintkey},
{"re", "Leon Ungier Russian->English",
"dict/re", "dict/reindex",
simplenextoff, simpleprintentry, simpleprintkey},
{"er", "Leon Ungier English->Russian",
"dict/re", "dict/erindex",
simplenextoff, simpleprintentry, simpleprintkey},
{"se", "Collins Spanish->English",
"dict/se", "dict/seindex",
pcollnextoff, pcollprintentry, pcollprintkey},
{"es", "Collins English->Spanish",
"dict/es", "dict/esindex",
pcollnextoff, pcollprintentry, pcollprintkey},
{"swe", "Esselte Studium Swedish->English",
"dict/world/essedata/sandic34.dat",
"dict/world/essedata/sweindex",
worldnextoff, worldprintentry, worldprintkey},
{"esw", "Esselte Studium English->Swedish",
"dict/world/essedata/sandic33.dat",
"dict/world/essedata/eswindex",
worldnextoff, worldprintentry, worldprintkey},
{"movie", "Movies -- by title",
"movie/data", "dict/movtindex",
movienextoff, movieprintentry, movieprintkey},
{"moviea", "Movies -- by actor",
"movie/data", "dict/movaindex",
movienextoff, movieprintentry, movieprintkey},
{"movied", "Movies -- by director",
"movie/data", "dict/movdindex",
movienextoff, movieprintentry, movieprintkey},
{"slang", "English Slang",
"dict/slang", "dict/slangindex",
slangnextoff, slangprintentry, slangprintkey},
{"robert", "Robert Électronique",
"dict/robert/_pointers", "dict/robert/_index",
robertnextoff, robertindexentry, robertprintkey},
{"robertv", "Robert Électronique - formes des verbes",
"dict/robert/flex.rob", "dict/robert/_flexindex",
robertnextflex, robertflexentry, robertprintkey},
{0, 0, 0, 0, 0}
};
/*
 * One accent and the characters it combines with.
 */
typedef struct Lig Lig;
struct Lig {
Rune start; /* accent rune */
Rune pairs[100]; /* <char,accented version> pairs */
};
/*
 * Accent table, indexed by accent code minus LIGS.  Each pairs list
 * is a sequence of <plain rune, accented rune> ending with a 0;
 * accents with an empty list have no precomposed forms here.
 */
static Lig ligtab[Nligs] = {
[LACU-LIGS] {0xb4, {0x41, 0xc1, 0x61, 0xe1, 0x43, 0x106, 0x63, 0x107, 0x45, 0xc9, 0x65, 0xe9, 0x67, 0x123, 0x49, 0xcd, 0x69, 0xed, 0x131, 0xed, 0x4c, 0x139, 0x6c, 0x13a, 0x4e, 0x143, 0x6e, 0x144, 0x4f, 0xd3, 0x6f, 0xf3, 0x52, 0x154, 0x72, 0x155, 0x53, 0x15a, 0x73, 0x15b, 0x55, 0xda, 0x75, 0xfa, 0x59, 0xdd, 0x79, 0xfd, 0x5a, 0x179, 0x7a, 0x17a, 0}},
[LGRV-LIGS] {0x2cb, {0x41, 0xc0, 0x61, 0xe0, 0x45, 0xc8, 0x65, 0xe8, 0x49, 0xcc, 0x69, 0xec, 0x131, 0xec, 0x4f, 0xd2, 0x6f, 0xf2, 0x55, 0xd9, 0x75, 0xf9, 0}},
[LUML-LIGS] {0xa8, {0x41, 0xc4, 0x61, 0xe4, 0x45, 0xcb, 0x65, 0xeb, 0x49, 0xcf, 0x69, 0xef, 0x4f, 0xd6, 0x6f, 0xf6, 0x55, 0xdc, 0x75, 0xfc, 0x59, 0x178, 0x79, 0xff, 0}},
[LCED-LIGS] {0xb8, {0x43, 0xc7, 0x63, 0xe7, 0x47, 0x122, 0x4b, 0x136, 0x6b, 0x137, 0x4c, 0x13b, 0x6c, 0x13c, 0x4e, 0x145, 0x6e, 0x146, 0x52, 0x156, 0x72, 0x157, 0x53, 0x15e, 0x73, 0x15f, 0x54, 0x162, 0x74, 0x163, 0}},
[LTIL-LIGS] {0x2dc, {0x41, 0xc3, 0x61, 0xe3, 0x49, 0x128, 0x69, 0x129, 0x131, 0x129, 0x4e, 0xd1, 0x6e, 0xf1, 0x4f, 0xd5, 0x6f, 0xf5, 0x55, 0x168, 0x75, 0x169, 0}},
[LBRV-LIGS] {0x2d8, {0x41, 0x102, 0x61, 0x103, 0x45, 0x114, 0x65, 0x115, 0x47, 0x11e, 0x67, 0x11f, 0x49, 0x12c, 0x69, 0x12d, 0x131, 0x12d, 0x4f, 0x14e, 0x6f, 0x14f, 0x55, 0x16c, 0x75, 0x16d, 0}},
[LRNG-LIGS] {0x2da, {0x41, 0xc5, 0x61, 0xe5, 0x55, 0x16e, 0x75, 0x16f, 0}},
[LDOT-LIGS] {0x2d9, {0x43, 0x10a, 0x63, 0x10b, 0x45, 0x116, 0x65, 0x117, 0x47, 0x120, 0x67, 0x121, 0x49, 0x130, 0x4c, 0x13f, 0x6c, 0x140, 0x5a, 0x17b, 0x7a, 0x17c, 0}},
[LDTB-LIGS] {0x2e, {0}},
[LFRN-LIGS] {0x2322, {0x41, 0xc2, 0x61, 0xe2, 0x43, 0x108, 0x63, 0x109, 0x45, 0xca, 0x65, 0xea, 0x47, 0x11c, 0x67, 0x11d, 0x48, 0x124, 0x68, 0x125, 0x49, 0xce, 0x69, 0xee, 0x131, 0xee, 0x4a, 0x134, 0x6a, 0x135, 0x4f, 0xd4, 0x6f, 0xf4, 0x53, 0x15c, 0x73, 0x15d, 0x55, 0xdb, 0x75, 0xfb, 0x57, 0x174, 0x77, 0x175, 0x59, 0x176, 0x79, 0x177, 0}},
[LFRB-LIGS] {0x32f, {0}},
[LOGO-LIGS] {0x2db, {0x41, 0x104, 0x61, 0x105, 0x45, 0x118, 0x65, 0x119, 0x49, 0x12e, 0x69, 0x12f, 0x131, 0x12f, 0x55, 0x172, 0x75, 0x173, 0}},
[LMAC-LIGS] {0xaf, {0x41, 0x100, 0x61, 0x101, 0x45, 0x112, 0x65, 0x113, 0x49, 0x12a, 0x69, 0x12b, 0x131, 0x12b, 0x4f, 0x14c, 0x6f, 0x14d, 0x55, 0x16a, 0x75, 0x16b, 0}},
[LHCK-LIGS] {0x2c7, {0x43, 0x10c, 0x63, 0x10d, 0x44, 0x10e, 0x64, 0x10f, 0x45, 0x11a, 0x65, 0x11b, 0x4c, 0x13d, 0x6c, 0x13e, 0x4e, 0x147, 0x6e, 0x148, 0x52, 0x158, 0x72, 0x159, 0x53, 0x160, 0x73, 0x161, 0x54, 0x164, 0x74, 0x165, 0x5a, 0x17d, 0x7a, 0x17e, 0}},
[LASP-LIGS] {0x2bd, {0}},
[LLEN-LIGS] {0x2bc, {0}},
[LBRB-LIGS] {0x32e, {0}}
};
/*
 * Expansions of the multi-rune codes, indexed by code minus MULTI.
 * Each row is a 0-terminated sequence of runes, written out by
 * callers with outrunes().
 */
Rune multitab[Nmulti][5] = {
[MAAS-MULTI] {0x2bd, 0x3b1, 0},
[MALN-MULTI] {0x2bc, 0x3b1, 0},
[MAND-MULTI] {0x61, 0x6e, 0x64, 0},
[MAOQ-MULTI] {0x61, 0x2f, 0x71, 0},
[MBRA-MULTI] {0x3c, 0x7c, 0},
[MDD-MULTI] {0x2e, 0x2e, 0},
[MDDD-MULTI] {0x2e, 0x2e, 0x2e, 0},
[MEAS-MULTI] {0x2bd, 0x3b5, 0},
[MELN-MULTI] {0x2bc, 0x3b5, 0},
[MEMM-MULTI] {0x2014, 0x2014, 0},
[MHAS-MULTI] {0x2bd, 0x3b7, 0},
[MHLN-MULTI] {0x2bc, 0x3b7, 0},
[MIAS-MULTI] {0x2bd, 0x3b9, 0},
[MILN-MULTI] {0x2bc, 0x3b9, 0},
[MLCT-MULTI] {0x63, 0x74, 0},
[MLFF-MULTI] {0x66, 0x66, 0},
[MLFFI-MULTI] {0x66, 0x66, 0x69, 0},
[MLFFL-MULTI] {0x66, 0x66, 0x6c, 0},
[MLFL-MULTI] {0x66, 0x6c, 0},
[MLFI-MULTI] {0x66, 0x69, 0},
[MLLS-MULTI] {0x26b, 0x26b, 0},
[MLST-MULTI] {0x73, 0x74, 0},
[MOAS-MULTI] {0x2bd, 0x3bf, 0},
[MOLN-MULTI] {0x2bc, 0x3bf, 0},
[MOR-MULTI] {0x6f, 0x72, 0},
[MRAS-MULTI] {0x2bd, 0x3c1, 0},
[MRLN-MULTI] {0x2bc, 0x3c1, 0},
[MTT-MULTI] {0x7e, 0x7e, 0},
[MUAS-MULTI] {0x2bd, 0x3c5, 0},
[MULN-MULTI] {0x2bc, 0x3c5, 0},
[MWAS-MULTI] {0x2bd, 0x3c9, 0},
[MWLN-MULTI] {0x2bc, 0x3c9, 0},
[MOE-MULTI] {0x6f, 0x65, 0},
[MES-MULTI] {0x20, 0x20, 0},
};
/*
 * Helpers for case and accent folding.  risupper tests for ASCII
 * A-Z only, and rtolower is meaningful only for such runes.
 */
#define risupper(r) (0x41 <= (r) && (r) <= 0x5a)
#define rislatin1(r) (0xC0 <= (r) && (r) <= 0xFF)
#define rtolower(r) ((r)-'A'+'a')
static Rune latin_fold_tab[] =
{
/* Table to fold latin 1 characters to ASCII equivalents
based at Rune value 0xc0
À Á Â Ã Ä Å Æ Ç
È É Ê Ë Ì Í Î Ï
Ð Ñ Ò Ó Ô Õ Ö ×
Ø Ù Ú Û Ü Ý Þ ß
à á â ã ä å æ ç
è é ê ë ì í î ï
ð ñ ò ó ô õ ö ÷
ø ù ú û ü ý þ ÿ
A 0 entry means the character is left as it is.
*/
'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
'd', 'n', 'o', 'o', 'o', 'o', 'o', 0 ,
'o', 'u', 'u', 'u', 'u', 'y', 0 , 0 ,
'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
'd', 'n', 'o', 'o', 'o', 'o', 'o', 0 ,
'o', 'u', 'u', 'u', 'u', 'y', 0 , 'y',
};
/* NOTE(review): presumably a stack of translation tables managed by changett(); not used in the code visible here */
static Rune *ttabstack[20];
static int ntt;
/*
 * Binary search of tab, an array of n Assoc's sorted by key
 * (in strcmp order).  Returns the val stored with key,
 * or -1 if key is not in the table.
 */
long
lookassoc(Assoc *tab, int n, char *key)
{
	long lo, hi, mid;
	int cmp;

	/* key, if present, lies strictly between lo and hi */
	lo = -1;
	hi = n;
	while(hi - lo > 1){
		mid = (hi+lo)/2;
		cmp = strcmp(key, tab[mid].key);
		if(cmp == 0)
			return tab[mid].val;
		if(cmp < 0)
			hi = mid;
		else
			lo = mid;
	}
	return -1;
}
/*
 * Binary search of tab, an array of n Nassoc's sorted by numeric key.
 * Returns the val stored with key, or -1 if key is not in the table.
 */
long
looknassoc(Nassoc *tab, int n, long key)
{
	long lo, hi, mid;

	/* key, if present, lies strictly between lo and hi */
	lo = -1;
	hi = n;
	while(hi - lo > 1){
		mid = (hi+lo)/2;
		if(key == tab[mid].key)
			return tab[mid].val;
		if(key < tab[mid].key)
			hi = mid;
		else
			lo = mid;
	}
	return -1;
}
/*
 * Print a formatted message on standard error, prefixed by the
 * program name and followed by a newline.
 */
void
err(char *fmt, ...)
{
	va_list args;
	char msg[1000];

	va_start(args, fmt);
	vsnprint(msg, sizeof(msg), fmt, args);
	va_end(args);
	fprint(2, "%s: %s\n", argv0, msg);
}
/*
 * Write rune r to bout unless output is inhibited.  The line length
 * is counted, and once it exceeds breaklen the next blank is replaced
 * by a newline.
 */
void
outrune(long r)
{
	if(outinhibit)
		return;
	linelen++;
	if(r == 0x20 && linelen > breaklen) {
		Bputc(bout, '\n');
		linelen = 0;
		return;
	}
	Bputrune(bout, r);
}
/*
 * Write the 0-terminated rune string rp with outrune().
 */
void
outrunes(Rune *rp)
{
	for(; *rp != 0; rp++)
		outrune(*rp);
}
/*
 * Same as outrune, for an argument known to be a single char:
 * past breaklen a blank is written as a newline instead.
 */
void
outchar(int c)
{
	if(outinhibit)
		return;
	linelen++;
	if(c == ' ' && linelen > breaklen) {
		linelen = 0;
		Bputc(bout, '\n');
		return;
	}
	Bputc(bout, c);
}
/*
 * Write the 0-terminated string s with outchar().
 */
void
outchars(char *s)
{
	for(; *s != 0; s++)
		outchar(*s);
}
/*
 * Formatted output through outchars(), so that line breaking and
 * output inhibition apply.  The formatted text is limited to the
 * size of the local buffer.
 */
void
outprint(char *fmt, ...)
{
	va_list args;
	char text[1000];

	va_start(args, fmt);
	vsnprint(text, sizeof(text), fmt, args);
	va_end(args);
	outchars(text);
}
/*
 * Write the text from b up to (not including) e, turning newlines
 * into blanks and squeezing a blank that follows a blank.
 */
void
outpiece(char *b, char *e)
{
	int ch, prev;

	for(prev = 0; b < e; prev = ch) {
		ch = *b++;
		if(ch == '\n')
			ch = ' ';
		if(ch == ' ' && prev == ' ')
			continue;
		outchar(ch);
	}
}
/*
 * Go to a new line if not already at the start of one.
 *	ind == 0: end the current line, if anything is on it.
 *	ind == 1: leave the output at a two-column indent, starting
 *		a fresh line unless the current one holds at most two
 *		characters.
 *	ind > 1: as for 1, but always start a fresh line and leave
 *		a blank line before it.
 * Slight hack: a current line of only one or two characters is
 * assumed to consist of blanks.
 * Does nothing while output is inhibited.
 */
void
outnl(int ind)
{
	if(outinhibit)
		return;

	if(ind == 0){
		if(linelen != 0){
			Bputc(bout, '\n');
			linelen = 0;
		}
		return;
	}

	/*
	 * Every path below must leave exactly two columns used,
	 * because linelen is set to 2 at the end: hence two blanks
	 * after a newline or on an empty line, and one blank when
	 * one column is already taken.
	 */
	if(ind > 1){
		if(linelen > 2)
			Bputc(bout, '\n');
		Bprint(bout, "\n  ");
	}else if(linelen == 0)
		Bprint(bout, "  ");
	else if(linelen == 1)
		Bputc(bout, ' ');
	else if(linelen != 2)
		Bprint(bout, "\n  ");
	linelen = 2;
}
/*
 * Fold the null-terminated rune string rp in place, using the
 * sort(1) notion of folding: latin1 accented characters become
 * their unaccented counterparts (where latin_fold_tab has one)
 * and uppercase becomes lowercase.
 */
void
fold(Rune *rp)
{
	Rune r;

	for(; *rp != 0; rp++){
		r = *rp;
		/* a zero table entry means "no unaccented form": keep r */
		if(rislatin1(r) && latin_fold_tab[r-0xc0] != 0)
			r = latin_fold_tab[r-0xc0];
		if(risupper(r))
			r = rtolower(r);
		*rp = r;
	}
}
/*
 * Like fold, but reads the UTF string old and writes the folded
 * result, null-terminated, into new (assumed to be big enough).
 * old is a regular expression, but we know that folding leaves
 * its metacharacters untouched.
 */
void
foldre(char *new, char *old)
{
	Rune r, plain;

	while(*old != 0){
		old += chartorune(&r, old);
		if(rislatin1(r)){
			/* a zero table entry means "no unaccented form" */
			plain = latin_fold_tab[r-0xc0];
			if(plain != 0)
				r = plain;
		}
		if(risupper(r))
			r = rtolower(r);
		new += runetochar(new, &r);
	}
	*new = 0;
}
/*
 * Compare the null-terminated rune strings s and t.  Returns:
 *	-2 if s strictly precedes t
 *	-1 if s is a proper prefix of t
 *	 0 if s and t are identical
 *	 1 if t is a proper prefix of s
 *	 2 if t strictly precedes s
 */
int
acomp(Rune *s, Rune *t)
{
	int cs, ct;

	/* step over the common prefix; reaching both ends means equal */
	for(; *s == *t; s++, t++)
		if(*s == 0)
			return 0;

	cs = *s;
	ct = *t;
	if(cs == 0)
		return -1;
	if(ct == 0)
		return 1;
	return cs < ct ? -2 : 2;
}
/*
 * Copy the null-terminated rune string 'from' into 'to',
 * terminator included.
 */
void
runescpy(Rune *to, Rune *from)
{
	do
		*to = *from++;
	while(*to++ != 0);
}
/*
 * Convert the run of decimal digits at the start of r to a long.
 * Conversion stops at the first rune that is not '0'..'9'; the
 * result is 0 if there are no leading digits.  No sign handling
 * and no overflow detection.
 */
long
runetol(Rune *r)
{
	int c;
	long n;

	n = 0;
	while((c = *r++) >= '0' && c <= '9')
		n = n*10 + (c - '0');
	return n;
}
/*
 * Look for a rune that is the accented form of r under accent
 * acc.  acc must lie in [LIGS..LIGE-1]; its entry in ligtab is
 * a zero-terminated list of (base, accented) rune pairs.
 * Returns the accented rune, or NONE if acc is out of range or
 * r has no pair.
 */
Rune
liglookup(Rune acc, Rune r)
{
	Rune *pair;

	if(LIGS <= acc && acc < LIGE){
		pair = ligtab[acc-LIGS].pairs;
		while(pair[0] != 0){
			if(pair[0] == r)
				return pair[1];
			pair += 2;
		}
	}
	return NONE;
}
/*
 * Maintain the translation table stack (a translation table
 * is an array of Runes indexed by bytes or 7-bit bytes).
 *	curtab == 0: reset the stack and return 0.
 *	starting != 0: push curtab and return newtab.
 *	starting == 0: pop the most recently pushed table and return it.
 * When the stack is full (push) or empty (pop) nothing is
 * changed and curtab is returned; the condition is reported
 * through err only if debug is set.
 */
Rune *
changett(Rune *curtab, Rune *newtab, int starting)
{
	if(curtab == 0){
		ntt = 0;
		return 0;
	}

	if(!starting){
		/* pop */
		if(ntt != 0)
			return ttabstack[--ntt];
		if(debug)
			err("translation stack underflow");
		return curtab;
	}

	/* push */
	if(ntt < asize(ttabstack)){
		ttabstack[ntt++] = curtab;
		return newtab;
	}
	if(debug)
		err("translation stack overflow");
	return curtab;
}

184
src/cmd/dict/world.c Normal file
View file

@ -0,0 +1,184 @@
#include <u.h>
#include <libc.h>
#include <bio.h>
#include "dict.h"
#include "kuten.h"
/*
* Routines for handling dictionaries in the "Languages of the World"
* format. worldnextoff *must* be called with <address of valid entry>+1.
*/
/* 16-bit value stored high byte first: p[0] is the high byte, p[1] the low byte. */
#define GSHORT(p) (((p)[0]<<8)|(p)[1])
/* Byte-at-a-time decoder used by worldprintentry; defined further down in this file. */
static void putchar(int, int*);
/* Marks a chartab slot (or a failed two-byte lookup in putchar) that has no rune. */
#define NONE 0xffff
/* adapted from jhelling@cs.ruu.nl (Jeroen Hellingman) */
/*
 * Byte-to-rune translation table, indexed by the input byte in putchar.
 * Each pair of lines holds 16 entries; the label gives the index of
 * the first one.  Entries 0x20 through 0x7e map to themselves.
 * NONE marks bytes with no rune; putchar treats those separately.
 */
static Rune chartab[] = {
/*00*/ NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
NONE, NONE, L'\n', 0xe6, 0xf8, 0xe5, 0xe4, 0xf6,
/*10*/ NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
NONE, NONE, NONE, 0xc6, 0xd8, 0xc5, 0xc4, 0xd6,
/*20*/ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, '\'',
0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
/*30*/ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
/*40*/ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
0x48, 0x49, 0x4a, 0x4b, L'L', 0x4d, 0x4e, 0x4f,
/*50*/ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
0x58, 0x59, 0x5a, 0x5b, L'\\', 0x5d, 0x5e, 0x5f,
/*60*/ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
/*70*/ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, NONE,
/*80*/ 0xc7, 0xfc, 0xe9, 0xe2, 0xe4, 0xe0, 0xe5, 0xe7,
0xea, 0xeb, 0xe8, 0xef, 0xee, 0xec, 0xc4, 0xc5,
/*90*/ 0xc9, 0xe6, 0xc6, 0xf4, 0xf6, 0xf2, 0xfb, 0xf9,
0xff, 0xd6, 0xdc, 0xa2, 0xa3, 0xa5, 0x20a7, 0x283,
/*a0*/ 0xe1, 0xed, 0xf3, 0xfa, 0xf1, 0xd1, 0xaa, 0xba,
0xbf, 0x2310, 0xac, 0xbd, 0xbc, 0xa1, 0xab, 0xbb,
/*b0*/ 0x254, 0x259, 0xf0, 0x283, 0x292, 0x14b, 0x251, 0x7a,
0x26a, 0xf0, 0x292, 0xe3, 0x153, 0x169, 0x28c, 0x265,
/*c0*/ 0x280, 0xeb, 0x6c, 0x28c, 0xf5, 0xf1, 0x152, NONE,
NONE, 0x53, 0x73, 0x5a, 0x7a, NONE, NONE, NONE,
/*d0*/ 0xdf, NONE, NONE, 0x101, 0x12b, 0x16b, 0x113, 0x14d,
NONE, NONE, NONE, 0x20, NONE, NONE, NONE, NONE,
/*e0*/ 0x3b1, 0x3b2, 0x3b3, 0x3c0, 0x3a3, 0x3c3, 0xb5, 0x3c4,
0x3a6, 0x398, 0x3a9, 0x3b4, 0x221e, 0xd8, 0x3b5, 0x2229,
/*f0*/ 0x2261, 0xb1, 0x2265, 0x2264, 0x2320, 0x2321, 0xf7, 0x2248,
0xb0, 0x2219, 0xb7, NONE, NONE, NONE, NONE, NONE,
};
/*
 * Decoder modes kept in state[0] by putchar.  Each ...lo value is
 * its ...hi value plus one, so putchar steps between the two halves
 * of a two-byte character with state[0]++ and state[0]--.
 */
enum{ Utf, Kanahi, Kanalo=Kanahi+1, GBhi, GBlo=GBhi+1, };
/*
 * Print the entry e.  An entry begins with a 6-byte header whose
 * first two bytes give nh, the length of the headword; the text
 * follows the header.
 *	cmd 'h': print only the first nh bytes of text (decoded)
 *	cmd 'r': print all of the text as raw bytes
 *	otherwise: print all of the text, decoded by putchar
 * The output always ends with outnl(0).
 *
 * The header is only read if the entry is long enough to hold
 * it, and nh is never allowed to carry the print limit past
 * e.end, so a truncated or corrupt entry cannot cause reads
 * outside [e.start, e.end).
 */
void
worldprintentry(Entry e, int cmd)
{
	int nh, state[3];
	uchar *p, *pe;

	p = (uchar *)e.start;
	pe = (uchar *)e.end;

	/* no room for a header: there is no text to print */
	if(pe - p < 6){
		outnl(0);
		return;
	}

	nh = GSHORT(p);
	p += 6;
	/* shorten the limit to the headword, but never lengthen it */
	if(cmd == 'h' && nh < pe - p)
		pe = p+nh;

	/* decoder starts in Utf mode, no saved byte, output enabled */
	state[0] = Utf;
	state[1] = 0;
	state[2] = 0;
	while(p < pe){
		if(cmd == 'r')
			outchar(*p++);
		else
			putchar(*p++, state);
	}
	outnl(0);
}
/*
 * Return the offset just past the entry that starts at
 * fromoff-1 (callers pass <address of entry>+1).  The entry's
 * 6-byte header holds three 16-bit lengths, nh, np and nd; the
 * result is the entry start plus 6 plus those three lengths.
 * Returns -1 if the seek fails or the header cannot be read in full.
 */
long
worldnextoff(long fromoff)
{
	uchar hdr[6];
	long start;
	int nh, np, nd;

	start = fromoff-1;
	if(Bseek(bdict, start, 0) < 0 || Bread(bdict, hdr, 6) != 6)
		return -1;

	nh = GSHORT(hdr);
	np = GSHORT(hdr+2);
	nd = GSHORT(hdr+4);
	return start + 6 + nh + np + nd;
}
/*
 * Decode one input byte c and emit whatever it produces.
 * state carries the decoder across calls:
 *	state[0]  current mode (Utf, Kanahi, Kanalo, GBhi or GBlo)
 *	state[1]  first byte of a two-byte character, saved by the
 *	          ...hi modes for use in the matching ...lo mode
 *	state[2]  when nonzero, characters that chartab maps to a rune
 *	          are not printed (set by byte 0xfb, cleared by 0xfc)
 * CANS2JH, S2J, tabjis208, tabgb2312, JIS208MAX and GB2312MAX are
 * defined outside this file (presumably in kuten.h -- confirm).
 */
static void
putchar(int c, int *state)
{
/* xflag is never changed below, so every if(!xflag) test succeeds */
int xflag = 0;
Rune r;
int hi, lo;
switch(state[0]){
case Kanahi:
case GBhi:
/* first byte of a two-byte character: save it and move to the ...lo mode */
if(CANS2JH(c) || c == 0xff){
state[0]++;
state[1] = c;
break;
}
/* fall through */
case Utf:
/* 0xfe and 0xff select the two-byte modes */
if(c == 0xfe){
state[0] = Kanahi;
break;
}else if(c == 0xff){
state[0] = GBhi;
break;
}
r = chartab[c];
if(r < 0x80 && state[2] == 0)
outchar(r);
else if(r == NONE){
/*
 * Bytes with no rune.  With xflag zero, 0xfb and 0xfc only
 * set and clear state[2], the other listed bytes are dropped
 * silently, and anything else is printed as a \xx escape.
 */
switch(c){
case 0xfb:
if(!xflag){
state[2] = 1;
break;
}
case 0xfc:
if(!xflag){
state[2] = 0;
break;
}
case 0x10:
case 0xc7: case 0xc8:
case 0xd8: case 0xd9: case 0xda:
case 0xdc: case 0xdd: case 0xde: case 0xdf:
case 0xfd:
if(!xflag)
break;
/* fall through */
default:
outprint("\\%.2ux", c);
}
}else if(state[2] == 0)
outrune(r);
break;
case Kanalo:
case GBlo:
/* the pair 0xff 0xff returns the decoder to Utf mode */
if(state[1] == 0xff && c == 0xff){
state[0] = Utf;
break;
}
/* back to the ...hi mode, ready for the next two-byte character */
state[0]--;
hi = state[1];
lo = c;
S2J(hi, lo); /* convert to JIS */
r = hi*100 + lo - 3232; /* convert to jis208 */
/* state[0] now names the ...hi mode, which selects the lookup table */
if(state[0] == Kanahi && r < JIS208MAX)
r = tabjis208[r];
else if(state[0] == GBhi && r < GB2312MAX)
r = tabgb2312[r];
else
r = NONE;
/* no rune for this pair: show both original bytes as \xx escapes */
if(r == NONE)
outprint("\\%.2ux\\%.2ux", state[1], c);
else
outrune(r);
break;
}
}
/*
 * Print the pronunciation key for this dictionary format.
 * There is none, so this just writes a fixed message to bout.
 */
void
worldprintkey(void)
{
Bprint(bout, "No pronunciation key.\n");
}