mirror of
https://github.com/9fans/plan9port.git
synced 2025-01-24 11:41:58 +00:00
add dict
This commit is contained in:
parent
091f74d0a0
commit
0870887793
27 changed files with 8343 additions and 0 deletions
139
src/cmd/dict/ahd.c
Normal file
139
src/cmd/dict/ahd.c
Normal file
|
@ -0,0 +1,139 @@
|
|||
#include <u.h>
|
||||
#include <libc.h>
|
||||
#include <bio.h>
|
||||
#include "dict.h"
|
||||
|
||||
/*
|
||||
* American Heritage Dictionary (encrypted)
|
||||
*/
|
||||
|
||||
static Rune intab[256] = {
|
||||
[0x82] 0xe9,
|
||||
[0x85] 0xe0,
|
||||
[0x89] 0xeb,
|
||||
[0x8a] 0xe8,
|
||||
[0xa4] 0xf1,
|
||||
[0xf8] 0xb0,
|
||||
[0xf9] 0xb7,
|
||||
};
|
||||
|
||||
static char tag[64];
|
||||
|
||||
enum{
|
||||
Run, Openper, Openat, Closeat
|
||||
};
|
||||
|
||||
void
|
||||
ahdprintentry(Entry e, int cmd)
|
||||
{
|
||||
static int inited;
|
||||
long addr;
|
||||
char *p, *t = tag;
|
||||
int obreaklen;
|
||||
int c, state = Run;
|
||||
|
||||
if(!inited){
|
||||
for(c=0; c<256; c++)
|
||||
if(intab[c] == 0)
|
||||
intab[c] = c;
|
||||
inited = 1;
|
||||
}
|
||||
obreaklen = breaklen;
|
||||
breaklen = 80;
|
||||
addr = e.doff;
|
||||
for(p=e.start; p<e.end; p++){
|
||||
c = intab[(*p ^ (addr++>>1))&0xff];
|
||||
switch(state){
|
||||
case Run:
|
||||
if(c == '%'){
|
||||
t = tag;
|
||||
state = Openper;
|
||||
break;
|
||||
}
|
||||
Putchar:
|
||||
if(c == '\n')
|
||||
outnl(0);
|
||||
else if(c < Runeself)
|
||||
outchar(c);
|
||||
else
|
||||
outrune(c);
|
||||
break;
|
||||
|
||||
case Openper:
|
||||
if(c == '@')
|
||||
state = Openat;
|
||||
else{
|
||||
outchar('%');
|
||||
state = Run;
|
||||
goto Putchar;
|
||||
}
|
||||
break;
|
||||
|
||||
case Openat:
|
||||
if(c == '@')
|
||||
state = Closeat;
|
||||
else if(t < &tag[sizeof tag-1])
|
||||
*t++ = c;
|
||||
break;
|
||||
|
||||
case Closeat:
|
||||
if(c == '%'){
|
||||
*t = 0;
|
||||
switch(cmd){
|
||||
case 'h':
|
||||
if(strcmp("EH", tag) == 0)
|
||||
goto out;
|
||||
break;
|
||||
case 'r':
|
||||
outprint("%%@%s@%%", tag);
|
||||
break;
|
||||
}
|
||||
state = Run;
|
||||
}else{
|
||||
if(t < &tag[sizeof tag-1])
|
||||
*t++ = '@';
|
||||
if(t < &tag[sizeof tag-1])
|
||||
*t++ = c;
|
||||
state = Openat;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
out:
|
||||
outnl(0);
|
||||
breaklen = obreaklen;
|
||||
}
|
||||
|
||||
long
|
||||
ahdnextoff(long fromoff)
|
||||
{
|
||||
static char *patterns[] = { "%@NL@%", "%@2@%", 0 };
|
||||
int c, k = 0, state = 0;
|
||||
char *pat = patterns[0];
|
||||
long defoff = -1;
|
||||
|
||||
if(Bseek(bdict, fromoff, 0) < 0)
|
||||
return -1;
|
||||
while((c = Bgetc(bdict)) >= 0){
|
||||
c ^= (fromoff++>>1)&0xff;
|
||||
if(c != pat[state]){
|
||||
state = 0;
|
||||
continue;
|
||||
}
|
||||
if(pat[++state])
|
||||
continue;
|
||||
if(pat = patterns[++k]){ /* assign = */
|
||||
state = 0;
|
||||
defoff = fromoff-6;
|
||||
continue;
|
||||
}
|
||||
return fromoff-5;
|
||||
}
|
||||
return defoff;
|
||||
}
|
||||
|
||||
void
|
||||
ahdprintkey(void)
|
||||
{
|
||||
Bprint(bout, "No pronunciations.\n");
|
||||
}
|
29
src/cmd/dict/canonind.awk
Normal file
29
src/cmd/dict/canonind.awk
Normal file
|
@ -0,0 +1,29 @@
|
|||
# turn output of mkindex into form needed by dict
|
||||
BEGIN {
	if(ARGC != 2) {
		print "Usage: awk -F' ' -f canonind.awk rawindex > index"
		exit 1
	}
	# read the raw index ourselves; blank ARGV[1] so awk does not reread it
	file = ARGV[1]
	ARGV[1] = ""
	while ((getline < file) > 0) {
		# $1 is the offset; every remaining field is a headword
		for(i = 2; i <= NF; i++) {
			w = $i
			if(length(w) == 0)
				continue
			b = index(w, "(")
			e = index(w, ")")
			if(b && e && b < e) {
				# "ab(cd)ef" yields both "abef" and "abcdef"
				w1 = substr(w, 1, b-1)
				w2 = substr(w, b+1, e-b-1)
				w3 = substr(w, e+1)
				printf "%s%s\t%d\n", w1, w3, $1 > "junk"
				printf "%s%s%s\t%d\n", w1, w2, w3, $1 > "junk"
			} else
				printf "%s\t%d\n", w, $1 > "junk"
		}
	}
	# flush and close the temporary file before sort reads it;
	# otherwise buffered output may still be missing from "junk"
	close("junk")
	system("sort -u -t' ' +0f -1 +0 -1 +1n -2 < junk")
	system("rm junk")
	exit 0
}
|
56
src/cmd/dict/comfix.awk
Normal file
56
src/cmd/dict/comfix.awk
Normal file
|
@ -0,0 +1,56 @@
|
|||
# when raw index has a lot of entries like
|
||||
# 1578324 problematico, a, ci, che
|
||||
# apply this algorithm:
|
||||
# treat things after comma as suffixes
|
||||
# for each suffix:
|
||||
# if single letter, replace last letter
|
||||
# else search backwards for beginning of suffix
|
||||
# and if it leads to an old suffix of approximately
|
||||
#			the same length, replace that suffix
|
||||
# This will still leave some commas to fix by hand
|
||||
# Usage: awk -F' ' -f comfix.awk rawindex > newrawindex
|
||||
|
||||
# Lines that are not "offset<sep>words", or whose word field has no
# comma, pass through unchanged.
NF != 2 || index($2, ",") == 0 {
	print $0
	next
}

# Otherwise the first comma-separated piece is the base word and each
# later piece is a suffix to graft onto it.
{
	n = split($2, a, /,[ ]*/)
	w = a[1]
	printf "%s\t%s\n", $1, w
	for(i = 2; i <= n; i++) {
		suf = a[i]
		m = matchsuflen(w, suf)
		if(!m) {
			# no plausible graft point: leave the comma for hand fixing
			printf "%s\t%s\n", $1, w ", " suf
			continue
		}
		nw = substr(w, 1, length(w)-m) suf
		printf "%s\t%s\n", $1, nw
	}
}
|
||||
|
||||
# Return how many trailing characters of w should be replaced by suf,
# or 0 if no sensible replacement point exists.
# A one-letter suffix always replaces the last letter.  Otherwise look
# backwards in w for the first letter of suf; accept the position only
# if the tail found there differs in length from suf by at most 3.
function matchsuflen(w, suf,		wlen,suflen,c,pat,k,d)
{
	suflen = length(suf)
	if(suflen == 1)
		return 1
	wlen = length(w)
	c = substr(suf, 1, 1)
	k = 1
	while(k <= wlen && substr(w, wlen-k+1, 1) != c)
		k++
	if(k > wlen)
		return 0
	d = (k < suflen) ? suflen-k : k-suflen
	return (d > 3) ? 0 : k
}
|
681
src/cmd/dict/dict.c
Normal file
681
src/cmd/dict/dict.c
Normal file
|
@ -0,0 +1,681 @@
|
|||
#include <u.h>
|
||||
#include <libc.h>
|
||||
#include <bio.h>
|
||||
#include <regexp.h>
|
||||
#include <ctype.h>
|
||||
#include "dict.h"
|
||||
|
||||
/*
|
||||
* Assumed index file structure: lines of form
|
||||
* [^\t]+\t[0-9]+
|
||||
* First field is key, second is byte offset into dictionary.
|
||||
* Should be sorted with args -u -t' ' +0f -1 +0 -1 +1n -2
|
||||
*/
|
||||
typedef struct Addr Addr;
|
||||
|
||||
struct Addr {
|
||||
int n; /* number of offsets */
|
||||
int cur; /* current position within doff array */
|
||||
int maxn; /* actual current size of doff array */
|
||||
ulong doff[1]; /* doff[maxn], with 0..n-1 significant */
|
||||
};
|
||||
|
||||
Biobuf binbuf;
|
||||
Biobuf boutbuf;
|
||||
Biobuf *bin = &binbuf; /* user cmd input */
|
||||
Biobuf *bout = &boutbuf; /* output */
|
||||
Biobuf *bdict; /* dictionary */
|
||||
Biobuf *bindex; /* index file */
|
||||
long indextop; /* index offset at end of file */
|
||||
int lastcmd; /* last executed command */
|
||||
Addr *dot; /* "current" address */
|
||||
Dict *dict; /* current dictionary */
|
||||
int linelen;
|
||||
int breaklen = 60;
|
||||
int outinhibit;
|
||||
int debug;
|
||||
|
||||
void execcmd(int);
|
||||
int getpref(char*, Rune*);
|
||||
Entry getentry(int);
|
||||
int getfield(Rune*);
|
||||
long locate(Rune*);
|
||||
int parseaddr(char*, char**);
|
||||
int parsecmd(char*);
|
||||
int search(char*, int);
|
||||
long seeknextline(Biobuf*, long);
|
||||
void setdotnext(void);
|
||||
void setdotprev(void);
|
||||
void sortaddr(Addr*);
|
||||
void usage(void);
|
||||
|
||||
enum {
|
||||
Plen=300, /* max length of a search pattern */
|
||||
Fieldlen=200, /* max length of an index field */
|
||||
Aslots=10, /* initial number of slots in an address */
|
||||
};
|
||||
|
||||
void
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
int i, cmd, kflag;
|
||||
char *line, *p;
|
||||
|
||||
Binit(&binbuf, 0, OREAD);
|
||||
Binit(&boutbuf, 1, OWRITE);
|
||||
kflag = 0;
|
||||
line = 0;
|
||||
dict = 0;
|
||||
p = getenv("PLAN9");
|
||||
if(p == nil)
|
||||
p = "/usr/local/plan9";
|
||||
if(chdir(p) < 0)
|
||||
sysfatal("chdir %s: %r", p);
|
||||
|
||||
for(i=0; dicts[i].name; i++){
|
||||
if(access(dicts[i].path, 0)>=0 && access(dicts[i].indexpath, 0)>=0){
|
||||
dict = &dicts[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
ARGBEGIN {
|
||||
case 'd':
|
||||
p = ARGF();
|
||||
dict = 0;
|
||||
if(p) {
|
||||
for(i=0; dicts[i].name; i++)
|
||||
if(strcmp(p, dicts[i].name)==0) {
|
||||
dict = &dicts[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(!dict)
|
||||
usage();
|
||||
break;
|
||||
case 'c':
|
||||
line = ARGF();
|
||||
if(!line)
|
||||
usage();
|
||||
break;
|
||||
case 'k':
|
||||
kflag++;
|
||||
break;
|
||||
case 'D':
|
||||
debug++;
|
||||
break;
|
||||
default:
|
||||
usage();
|
||||
ARGEND }
|
||||
if(dict == 0){
|
||||
err("no dictionaries present on this system");
|
||||
exits("nodict");
|
||||
}
|
||||
|
||||
if(kflag) {
|
||||
(*dict->printkey)();
|
||||
exits(0);
|
||||
}
|
||||
if(argc > 1)
|
||||
usage();
|
||||
else if(argc == 1) {
|
||||
if(line)
|
||||
usage();
|
||||
p = argv[0];
|
||||
line = malloc(strlen(p)+5);
|
||||
sprint(line, "/%s/P\n", p);
|
||||
}
|
||||
bdict = Bopen(dict->path, OREAD);
|
||||
if(!bdict) {
|
||||
err("can't open dictionary %s/%s", p, dict->path);
|
||||
exits("nodict");
|
||||
}
|
||||
bindex = Bopen(dict->indexpath, OREAD);
|
||||
if(!bindex) {
|
||||
err("can't open index %s/%s", p, dict->indexpath);
|
||||
exits("noindex");
|
||||
}
|
||||
indextop = Bseek(bindex, 0L, 2);
|
||||
|
||||
dot = malloc(sizeof(Addr)+(Aslots-1)*sizeof(ulong));
|
||||
dot->n = 0;
|
||||
dot->cur = 0;
|
||||
dot->maxn = Aslots;
|
||||
lastcmd = 0;
|
||||
|
||||
if(line) {
|
||||
cmd = parsecmd(line);
|
||||
if(cmd)
|
||||
execcmd(cmd);
|
||||
} else {
|
||||
for(;;) {
|
||||
Bprint(bout, "*");
|
||||
Bflush(bout);
|
||||
line = Brdline(bin, '\n');
|
||||
linelen = 0;
|
||||
if(!line)
|
||||
break;
|
||||
cmd = parsecmd(line);
|
||||
if(cmd) {
|
||||
execcmd(cmd);
|
||||
lastcmd = cmd;
|
||||
}
|
||||
}
|
||||
}
|
||||
exits(0);
|
||||
}
|
||||
|
||||
void
|
||||
usage(void)
|
||||
{
|
||||
int i;
|
||||
char *a, *b;
|
||||
|
||||
Bprint(bout, "Usage: %s [-d dict] [-k] [-c cmd] [word]\n", argv0);
|
||||
Bprint(bout, "dictionaries (brackets mark dictionaries not present on this system):\n");
|
||||
for(i = 0; dicts[i].name; i++){
|
||||
a = b = "";
|
||||
if(access(dicts[i].path, 0)<0 || access(dicts[i].indexpath, 0)<0){
|
||||
a = "[";
|
||||
b = "]";
|
||||
}
|
||||
Bprint(bout, " %s%s\t%s%s\n", a, dicts[i].name, dicts[i].desc, b);
|
||||
}
|
||||
exits("usage");
|
||||
}
|
||||
|
||||
int
|
||||
parsecmd(char *line)
|
||||
{
|
||||
char *e;
|
||||
int cmd, ans;
|
||||
|
||||
if(parseaddr(line, &e) >= 0)
|
||||
line = e;
|
||||
else
|
||||
return 0;
|
||||
cmd = *line;
|
||||
ans = cmd;
|
||||
if(isupper(cmd))
|
||||
cmd = tolower(cmd);
|
||||
if(!(cmd == 'a' || cmd == 'h' || cmd == 'p' || cmd == 'r' ||
|
||||
cmd == '\n')) {
|
||||
err("unknown command %c", cmd);
|
||||
return 0;
|
||||
}
|
||||
if(cmd == '\n')
|
||||
switch(lastcmd) {
|
||||
case 0: ans = 'H'; break;
|
||||
case 'H': ans = 'p'; break;
|
||||
default : ans = lastcmd; break;
|
||||
}
|
||||
else if(line[1] != '\n' && line[1] != 0)
|
||||
err("extra stuff after command %c ignored", cmd);
|
||||
return ans;
|
||||
}
|
||||
|
||||
void
|
||||
execcmd(int cmd)
|
||||
{
|
||||
Entry e;
|
||||
int cur, doall;
|
||||
|
||||
if(isupper(cmd)) {
|
||||
doall = 1;
|
||||
cmd = tolower(cmd);
|
||||
cur = 0;
|
||||
} else {
|
||||
doall = 0;
|
||||
cur = dot->cur;
|
||||
}
|
||||
|
||||
if(debug && doall && cmd == 'a')
|
||||
Bprint(bout, "%d entries, cur=%d\n", dot->n, cur+1);
|
||||
for(;;){
|
||||
if(cur >= dot->n)
|
||||
break;
|
||||
if(doall) {
|
||||
Bprint(bout, "%d\t", cur+1);
|
||||
linelen += 4 + (cur >= 10);
|
||||
}
|
||||
switch(cmd) {
|
||||
case 'a':
|
||||
Bprint(bout, "#%lud\n", dot->doff[cur]);
|
||||
break;
|
||||
case 'h':
|
||||
case 'p':
|
||||
case 'r':
|
||||
e = getentry(cur);
|
||||
(*dict->printentry)(e, cmd);
|
||||
break;
|
||||
}
|
||||
cur++;
|
||||
if(doall) {
|
||||
if(cmd == 'p' || cmd == 'r') {
|
||||
Bputc(bout, '\n');
|
||||
linelen = 0;
|
||||
}
|
||||
} else
|
||||
break;
|
||||
}
|
||||
if(cur >= dot->n)
|
||||
cur = 0;
|
||||
dot->cur = cur;
|
||||
}
|
||||
|
||||
/*
|
||||
* Address syntax: ('.' | '/' re '/' | '!' re '!' | number | '#' number) ('+' | '-')*
|
||||
* Answer goes in dot.
|
||||
* Return -1 if address starts, but get error.
|
||||
* Return 0 if no address.
|
||||
*/
|
||||
int
|
||||
parseaddr(char *line, char **eptr)
|
||||
{
|
||||
int delim, plen;
|
||||
ulong v;
|
||||
char *e;
|
||||
char pat[Plen];
|
||||
|
||||
if(*line == '/' || *line == '!') {
|
||||
/* anchored regular expression match; '!' means no folding */
|
||||
if(*line == '/') {
|
||||
delim = '/';
|
||||
e = strpbrk(line+1, "/\n");
|
||||
} else {
|
||||
delim = '!';
|
||||
e = strpbrk(line+1, "!\n");
|
||||
}
|
||||
plen = e-line-1;
|
||||
if(plen >= Plen-3) {
|
||||
err("pattern too big");
|
||||
return -1;
|
||||
}
|
||||
pat[0] = '^';
|
||||
memcpy(pat+1, line+1, plen);
|
||||
pat[plen+1] = '$';
|
||||
pat[plen+2] = 0;
|
||||
if(*e == '\n')
|
||||
line = e;
|
||||
else
|
||||
line = e+1;
|
||||
if(!search(pat, delim == '/')) {
|
||||
err("pattern not found");
|
||||
return -1;
|
||||
}
|
||||
} else if(*line == '#') {
|
||||
/* absolute byte offset into dictionary */
|
||||
line++;
|
||||
if(!isdigit(*line))
|
||||
return -1;
|
||||
v = strtoul(line, &e, 10);
|
||||
line = e;
|
||||
dot->doff[0] = v;
|
||||
dot->n = 1;
|
||||
dot->cur = 0;
|
||||
} else if(isdigit(*line)) {
|
||||
v = strtoul(line, &e, 10);
|
||||
line = e;
|
||||
if(v < 1 || v > dot->n)
|
||||
err(".%d not in range [1,%d], ignored",
|
||||
v, dot->n);
|
||||
else
|
||||
dot->cur = v-1;
|
||||
} else if(*line == '.') {
|
||||
line++;
|
||||
} else {
|
||||
*eptr = line;
|
||||
return 0;
|
||||
}
|
||||
while(*line == '+' || *line == '-') {
|
||||
if(*line == '+')
|
||||
setdotnext();
|
||||
else
|
||||
setdotprev();
|
||||
line++;
|
||||
}
|
||||
*eptr = line;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Index file is sorted by folded field1.
|
||||
* Method: find pre, a folded prefix of r.e. pat,
|
||||
* and then low = offset to beginning of
|
||||
* line in index file where first match of prefix occurs.
|
||||
* Then go through index until prefix no longer matches,
|
||||
* adding each line that matches real pattern to dot.
|
||||
* Finally, sort dot offsets (uniquing).
|
||||
* We know pat len < Plen, and that it is surrounded by ^..$
|
||||
*/
|
||||
int
|
||||
search(char *pat, int dofold)
|
||||
{
|
||||
int needre, prelen, match, n;
|
||||
Reprog *re;
|
||||
long ioff, v;
|
||||
Rune pre[Plen];
|
||||
Rune lit[Plen];
|
||||
Rune entry[Fieldlen];
|
||||
char fpat[Plen];
|
||||
|
||||
prelen = getpref(pat+1, pre);
|
||||
if(pat[prelen+1] == 0 || pat[prelen+1] == '$') {
|
||||
runescpy(lit, pre);
|
||||
if(dofold)
|
||||
fold(lit);
|
||||
needre = 0;
|
||||
SET(re);
|
||||
} else {
|
||||
needre = 1;
|
||||
if(dofold) {
|
||||
foldre(fpat, pat);
|
||||
re = regcomp(fpat);
|
||||
} else
|
||||
re = regcomp(pat);
|
||||
}
|
||||
fold(pre);
|
||||
ioff = locate(pre);
|
||||
if(ioff < 0)
|
||||
return 0;
|
||||
dot->n = 0;
|
||||
Bseek(bindex, ioff, 0);
|
||||
for(;;) {
|
||||
if(!getfield(entry))
|
||||
break;
|
||||
if(dofold)
|
||||
fold(entry);
|
||||
if(needre)
|
||||
match = rregexec(re, entry, 0, 0);
|
||||
else
|
||||
match = (acomp(lit, entry) == 0);
|
||||
if(match) {
|
||||
if(!getfield(entry))
|
||||
break;
|
||||
v = runetol(entry);
|
||||
if(dot->n >= dot->maxn) {
|
||||
n = 2*dot->maxn;
|
||||
dot = realloc(dot,
|
||||
sizeof(Addr)+(n-1)*sizeof(long));
|
||||
if(!dot) {
|
||||
err("out of memory");
|
||||
exits("nomem");
|
||||
}
|
||||
dot->maxn = n;
|
||||
}
|
||||
dot->doff[dot->n++] = v;
|
||||
} else {
|
||||
if(!dofold)
|
||||
fold(entry);
|
||||
if(*pre) {
|
||||
n = acomp(pre, entry);
|
||||
if(n < -1 || (!needre && n < 0))
|
||||
break;
|
||||
}
|
||||
/* get to next index entry */
|
||||
if(!getfield(entry))
|
||||
break;
|
||||
}
|
||||
}
|
||||
sortaddr(dot);
|
||||
dot->cur = 0;
|
||||
return dot->n;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return offset in index file of first line whose folded
|
||||
* first field has pre as a prefix. -1 if none found.
|
||||
*/
|
||||
long
|
||||
locate(Rune *pre)
|
||||
{
|
||||
long top, bot, mid;
|
||||
Rune entry[Fieldlen];
|
||||
|
||||
if(*pre == 0)
|
||||
return 0;
|
||||
bot = 0;
|
||||
top = indextop;
|
||||
if(debug>1)
|
||||
fprint(2, "locate looking for prefix %S\n", pre);
|
||||
for(;;) {
|
||||
/*
|
||||
* Loop invariant: foldkey(bot) < pre <= foldkey(top)
|
||||
* and bot < top, and bot,top point at beginning of lines
|
||||
*/
|
||||
mid = (top+bot) / 2;
|
||||
mid = seeknextline(bindex, mid);
|
||||
if(debug > 1)
|
||||
fprint(2, "bot=%ld, mid=%ld->%ld, top=%ld\n",
|
||||
bot, (top+bot) / 2, mid, top);
|
||||
if(mid == top || !getfield(entry))
|
||||
break;
|
||||
if(debug > 1)
|
||||
fprint(2, "key=%S\n", entry);
|
||||
/*
|
||||
* here mid is strictly between bot and top
|
||||
*/
|
||||
fold(entry);
|
||||
if(acomp(pre, entry) <= 0)
|
||||
top = mid;
|
||||
else
|
||||
bot = mid;
|
||||
}
|
||||
/*
|
||||
* bot < top, but they don't necessarily point at successive lines
|
||||
* Use linear search from bot to find first line that pre is a
|
||||
* prefix of
|
||||
*/
|
||||
while((bot = seeknextline(bindex, bot)) <= top) {
|
||||
if(!getfield(entry))
|
||||
return -1;
|
||||
if(debug > 1)
|
||||
fprint(2, "key=%S\n", entry);
|
||||
fold(entry);
|
||||
switch(acomp(pre, entry)) {
|
||||
case -2:
|
||||
return -1;
|
||||
case -1:
|
||||
case 0:
|
||||
return bot;
|
||||
case 1:
|
||||
case 2:
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* Get prefix of non re-metacharacters, runified, into pre,
|
||||
* and return length
|
||||
*/
|
||||
int
|
||||
getpref(char *pat, Rune *pre)
|
||||
{
|
||||
int n, r;
|
||||
char *p;
|
||||
|
||||
p = pat;
|
||||
while(*p) {
|
||||
n = chartorune(pre, p);
|
||||
r = *pre;
|
||||
switch(r) {
|
||||
case 0x2e: case 0x2a: case 0x2b: case 0x3f:
|
||||
case 0x5b: case 0x5d: case 0x28: case ')':
|
||||
case 0x7c: case 0x5e: case 0x24:
|
||||
*pre = 0;
|
||||
return p-pat;
|
||||
case L'\\':
|
||||
p += n;
|
||||
p += chartorune(++pre, p);
|
||||
pre++;
|
||||
break;
|
||||
default:
|
||||
p += n;
|
||||
pre++;
|
||||
}
|
||||
}
|
||||
return p-pat;
|
||||
}
|
||||
|
||||
long
|
||||
seeknextline(Biobuf *b, long off)
|
||||
{
|
||||
long c;
|
||||
|
||||
Bseek(b, off, 0);
|
||||
do {
|
||||
c = Bgetrune(b);
|
||||
} while(c>=0 && c!='\n');
|
||||
return Boffset(b);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get next field out of index file (either tab- or nl- terminated)
|
||||
* Answer in *rp, assumed to be Fieldlen long.
|
||||
* Return 0 if read error first.
|
||||
*/
|
||||
int
|
||||
getfield(Rune *rp)
|
||||
{
|
||||
long c;
|
||||
int n;
|
||||
|
||||
for(n=Fieldlen; n-- > 0; ) {
|
||||
if ((c = Bgetrune(bindex)) < 0)
|
||||
return 0;
|
||||
if(c == '\t' || c == '\n') {
|
||||
*rp = L'\0';
|
||||
return 1;
|
||||
}
|
||||
*rp++ = c;
|
||||
}
|
||||
err("word too long");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * A compare longs function suitable for qsort.
 * Returns -1, 0 or 1 as *av is less than, equal to or greater
 * than *bv.  The values are compared directly rather than by
 * subtraction, which could overflow for operands of opposite sign.
 */
static int
longcmp(const void *av, const void *bv)
{
	long a, b;

	a = *(const long*)av;
	b = *(const long*)bv;
	return (a > b) - (a < b);
}
|
||||
|
||||
void
|
||||
sortaddr(Addr *a)
|
||||
{
|
||||
int i, j;
|
||||
long v;
|
||||
|
||||
if(a->n <= 1)
|
||||
return;
|
||||
|
||||
qsort(a->doff, a->n, sizeof(long), longcmp);
|
||||
|
||||
/* remove duplicates */
|
||||
for(i=0, j=0; j < a->n; j++) {
|
||||
v = a->doff[j];
|
||||
if(i > 0 && v == a->doff[i-1])
|
||||
continue;
|
||||
a->doff[i++] = v;
|
||||
}
|
||||
a->n = i;
|
||||
}
|
||||
|
||||
Entry
|
||||
getentry(int i)
|
||||
{
|
||||
long b, e, n;
|
||||
static Entry ans;
|
||||
static int anslen = 0;
|
||||
|
||||
b = dot->doff[i];
|
||||
e = (*dict->nextoff)(b+1);
|
||||
ans.doff = b;
|
||||
if(e < 0) {
|
||||
err("couldn't seek to entry");
|
||||
ans.start = 0;
|
||||
ans.end = 0;
|
||||
} else {
|
||||
n = e-b;
|
||||
if(n+1 > anslen) {
|
||||
ans.start = realloc(ans.start, n+1);
|
||||
if(!ans.start) {
|
||||
err("out of memory");
|
||||
exits("nomem");
|
||||
}
|
||||
anslen = n+1;
|
||||
}
|
||||
Bseek(bdict, b, 0);
|
||||
n = Bread(bdict, ans.start, n);
|
||||
ans.end = ans.start + n;
|
||||
*ans.end = 0;
|
||||
}
|
||||
return ans;
|
||||
}
|
||||
|
||||
void
|
||||
setdotnext(void)
|
||||
{
|
||||
long b;
|
||||
|
||||
b = (*dict->nextoff)(dot->doff[dot->cur]+1);
|
||||
if(b < 0) {
|
||||
err("couldn't find a next entry");
|
||||
return;
|
||||
}
|
||||
dot->doff[0] = b;
|
||||
dot->n = 1;
|
||||
dot->cur = 0;
|
||||
}
|
||||
|
||||
void
|
||||
setdotprev(void)
|
||||
{
|
||||
int tryback;
|
||||
long here, last, p;
|
||||
|
||||
if(dot->cur < 0 || dot->cur >= dot->n)
|
||||
return;
|
||||
tryback = 2000;
|
||||
here = dot->doff[dot->cur];
|
||||
last = 0;
|
||||
while(last == 0) {
|
||||
p = here - tryback;
|
||||
if(p < 0)
|
||||
p = 0;
|
||||
for(;;) {
|
||||
p = (*dict->nextoff)(p+1);
|
||||
if(p < 0)
|
||||
return; /* shouldn't happen */
|
||||
if(p >= here)
|
||||
break;
|
||||
last = p;
|
||||
}
|
||||
if(!last) {
|
||||
if(here - tryback < 0) {
|
||||
err("can't find a previous entry");
|
||||
return;
|
||||
}
|
||||
tryback = 2*tryback;
|
||||
}
|
||||
}
|
||||
dot->doff[0] = last;
|
||||
dot->n = 1;
|
||||
dot->cur = 0;
|
||||
}
|
160
src/cmd/dict/dict.h
Normal file
160
src/cmd/dict/dict.h
Normal file
|
@ -0,0 +1,160 @@
|
|||
/* Runes for special purposes (0xe800-0xfdff is Private Use Area) */
|
||||
enum { NONE=0xe800, /* Emit nothing */
|
||||
TAGS, /* Start of tag */
|
||||
TAGE, /* End of tag */
|
||||
SPCS, /* Start of special character name */
|
||||
PAR, /* Newline, indent */
|
||||
LIGS, /* Start of ligature codes */
|
||||
LACU=LIGS, /* Acute (´) ligatures */
|
||||
LGRV, /* Grave (ˋ) ligatures */
|
||||
LUML, /* Umlaut (¨) ligatures */
|
||||
LCED, /* Cedilla (¸) ligatures */
|
||||
LTIL, /* Tilde (˜) ligatures */
|
||||
LBRV, /* Breve (˘) ligatures */
|
||||
LRNG, /* Ring (˚) ligatures */
|
||||
LDOT, /* Dot (˙) ligatures */
|
||||
LDTB, /* Dot below (.) ligatures */
|
||||
LFRN, /* Frown (⌢) ligatures */
|
||||
LFRB, /* Frown below (̯) ligatures */
|
||||
LOGO, /* Ogonek (˛) ligatures */
|
||||
LMAC, /* Macron (¯) ligatures */
|
||||
LHCK, /* Hacek (ˇ) ligatures */
|
||||
LASP, /* Asper (ʽ) ligatures */
|
||||
LLEN, /* Lenis (ʼ) ligatures */
|
||||
LBRB, /* Breve below (̮) ligatures */
|
||||
LIGE, /* End of ligature codes */
|
||||
MULTI, /* Start of multi-rune codes */
|
||||
MAAS=MULTI, /* ʽα */
|
||||
MALN, /* ʼα */
|
||||
MAND, /* and */
|
||||
MAOQ, /* a/q */
|
||||
MBRA, /* <| */
|
||||
MDD, /* .. */
|
||||
MDDD, /* ... */
|
||||
MEAS, /* ʽε */
|
||||
MELN, /* ʼε */
|
||||
MEMM, /* —— */
|
||||
MHAS, /* ʽη */
|
||||
MHLN, /* ʼη */
|
||||
MIAS, /* ʽι */
|
||||
MILN, /* ʼι */
|
||||
MLCT, /* ct */
|
||||
MLFF, /* ff */
|
||||
MLFFI, /* ffi */
|
||||
MLFFL, /* ffl */
|
||||
MLFL, /* fl */
|
||||
MLFI, /* fi */
|
||||
MLLS, /* ll with swing */
|
||||
MLST, /* st */
|
||||
MOAS, /* ʽο */
|
||||
MOLN, /* ʼο */
|
||||
MOR, /* or */
|
||||
MRAS, /* ʽρ */
|
||||
MRLN, /* ʼρ */
|
||||
MTT, /* ~~ */
|
||||
MUAS, /* ʽυ */
|
||||
MULN, /* ʼυ */
|
||||
MWAS, /* ʽω */
|
||||
MWLN, /* ʼω */
|
||||
MOE, /* oe */
|
||||
MES, /* em space */
|
||||
MULTIE, /* End of multi-rune codes */
|
||||
};
|
||||
#define Nligs (LIGE-LIGS)
|
||||
#define Nmulti (MULTIE-MULTI)
|
||||
|
||||
typedef struct Entry Entry;
|
||||
typedef struct Assoc Assoc;
|
||||
typedef struct Nassoc Nassoc;
|
||||
typedef struct Dict Dict;
|
||||
|
||||
struct Entry {
|
||||
char *start; /* entry starts at start */
|
||||
char *end; /* and finishes just before end */
|
||||
long doff; /* dictionary offset (for debugging) */
|
||||
};
|
||||
|
||||
struct Assoc {
|
||||
char *key;
|
||||
long val;
|
||||
};
|
||||
|
||||
struct Nassoc {
|
||||
long key;
|
||||
long val;
|
||||
};
|
||||
|
||||
struct Dict {
|
||||
char *name; /* dictionary name */
|
||||
char *desc; /* description */
|
||||
char *path; /* path to dictionary data */
|
||||
char *indexpath; /* path to index data */
|
||||
long (*nextoff)(long); /* function to find next entry offset from arg */
|
||||
void (*printentry)(Entry, int); /* function to print entry */
|
||||
void (*printkey)(void); /* function to print pronunciation key */
|
||||
};
|
||||
|
||||
int acomp(Rune*, Rune*);
|
||||
Rune *changett(Rune *, Rune *, int);
|
||||
void err(char*, ...);
|
||||
void fold(Rune *);
|
||||
void foldre(char*, char*);
|
||||
Rune liglookup(Rune, Rune);
|
||||
long lookassoc(Assoc*, int, char*);
|
||||
long looknassoc(Nassoc*, int, long);
|
||||
void outprint(char*, ...);
|
||||
void outrune(long);
|
||||
void outrunes(Rune *);
|
||||
void outchar(int);
|
||||
void outchars(char *);
|
||||
void outnl(int);
|
||||
void outpiece(char *, char *);
|
||||
void runescpy(Rune*, Rune*);
|
||||
long runetol(Rune*);
|
||||
|
||||
long oednextoff(long);
|
||||
void oedprintentry(Entry, int);
|
||||
void oedprintkey(void);
|
||||
long ahdnextoff(long);
|
||||
void ahdprintentry(Entry, int);
|
||||
void ahdprintkey(void);
|
||||
long pcollnextoff(long);
|
||||
void pcollprintentry(Entry, int);
|
||||
void pcollprintkey(void);
|
||||
long pcollgnextoff(long);
|
||||
void pcollgprintentry(Entry, int);
|
||||
void pcollgprintkey(void);
|
||||
long movienextoff(long);
|
||||
void movieprintentry(Entry, int);
|
||||
void movieprintkey(void);
|
||||
long pgwnextoff(long);
|
||||
void pgwprintentry(Entry,int);
|
||||
void pgwprintkey(void);
|
||||
long slangnextoff(long);
|
||||
void slangprintentry(Entry, int);
|
||||
void slangprintkey(void);
|
||||
long robertnextoff(long);
|
||||
void robertindexentry(Entry, int);
|
||||
void robertprintkey(void);
|
||||
long robertnextflex(long);
|
||||
void robertflexentry(Entry, int);
|
||||
long simplenextoff(long);
|
||||
void simpleprintentry(Entry, int);
|
||||
void simpleprintkey(void);
|
||||
long thesnextoff(long);
|
||||
void thesprintentry(Entry, int);
|
||||
void thesprintkey(void);
|
||||
long worldnextoff(long);
|
||||
void worldprintentry(Entry, int);
|
||||
void worldprintkey(void);
|
||||
|
||||
extern Biobuf *bdict;
|
||||
extern Biobuf *bout;
|
||||
extern int linelen;
|
||||
extern int breaklen;
|
||||
extern int outinhibit;
|
||||
extern int debug;
|
||||
extern Rune multitab[][5];
|
||||
extern Dict dicts[];
|
||||
|
||||
#define asize(a) (sizeof (a)/sizeof(a[0]))
|
15
src/cmd/dict/egfix
Executable file
15
src/cmd/dict/egfix
Executable file
|
@ -0,0 +1,15 @@
|
|||
#!/bin/rc
|
||||
|
||||
sed '
|
||||
s/[ ]+$//
|
||||
/ /!d
|
||||
/, /{; h; s/,.*//; p; g; s/ .*, / /; }
|
||||
' $1 |
|
||||
sed '
|
||||
/\(/{; h; s/\([^)]+\)//; p; g; s/[()]//g; }
|
||||
' |
|
||||
sed '
|
||||
s/ +/ /
|
||||
s/[ ]+$//
|
||||
s/ +/ /g
|
||||
'
|
8
src/cmd/dict/egfix2
Executable file
8
src/cmd/dict/egfix2
Executable file
|
@ -0,0 +1,8 @@
|
|||
#!/bin/rc
|
||||
|
||||
awk '
|
||||
BEGIN { FS = " |, " }
|
||||
{ for(i=2; i<=NF; i++)print $i " " $1 }
|
||||
' $1 |
|
||||
tr A-Z a-z |
|
||||
sort -u -t' ' +0f -1 +0 -1 +1n -2
|
1108
src/cmd/dict/gb2312.c
Normal file
1108
src/cmd/dict/gb2312.c
Normal file
File diff suppressed because it is too large
Load diff
23
src/cmd/dict/gefix
Executable file
23
src/cmd/dict/gefix
Executable file
|
@ -0,0 +1,23 @@
|
|||
#!/bin/rc
|
||||
|
||||
sed '
|
||||
s/[ ]+$//
|
||||
/ /!d
|
||||
s/\\N''349''//g
|
||||
s/''//g
|
||||
s/ -/ /
|
||||
s/-$//
|
||||
/\([^,) ]+(\)|$)/{; h; s///; p; g; s/\(//; s/\)//; }
|
||||
/\(r, s\)$/{; s///; p; s/$/r/; p; s/r$/s/; }
|
||||
' $1 |
|
||||
sed '
|
||||
/\([^,) ]+(\)|$)/{; h; s///; p; g; s/\(//; s/\)//; }
|
||||
/\(r, s\)$/{; s///; p; s/$/r/; p; s/r$/s/; }
|
||||
' |
|
||||
sed '/ß/{; p; s/ß/ss/g; }' |
|
||||
awk '
|
||||
BEGIN { FS = " |, " }
|
||||
{ for(i=2; i<=NF; i++)print $i " " $1 }
|
||||
' |
|
||||
tr A-Z a-z |
|
||||
sort -u -t' ' +0f -1 +0 -1 +1n -2
|
8
src/cmd/dict/getneeds
Executable file
8
src/cmd/dict/getneeds
Executable file
|
@ -0,0 +1,8 @@
|
|||
#!/bin/rc
|
||||
for (x in spec tag aux status) {
|
||||
grep ' '^$x^' ' $1 > junk1
|
||||
sort +4 -5 +3n -4 junk1 > junk2
|
||||
awk '{if ($5 != prev) print $0; prev = $5}' junk2 > junk3
|
||||
sort -n +2 -3 junk3 > need$x
|
||||
rm junk*
|
||||
}
|
1059
src/cmd/dict/jis208.c
Normal file
1059
src/cmd/dict/jis208.c
Normal file
File diff suppressed because it is too large
Load diff
114
src/cmd/dict/kuten.h
Normal file
114
src/cmd/dict/kuten.h
Normal file
|
@ -0,0 +1,114 @@
|
|||
/*
|
||||
following astonishing goo courtesy of kogure.
|
||||
*/
|
||||
/*
|
||||
* MicroSoft Kanji Encoding (SJIS) Transformation
|
||||
*/
|
||||
|
||||
/*
|
||||
* void
|
||||
* J2S(unsigned char *_h, unsigned char *_l)
|
||||
* JIS X 208 to MS kanji transformation.
|
||||
*
|
||||
* Calling/Exit State:
|
||||
* _h and _l should be in their valid range.
|
||||
* No return value.
|
||||
*/
|
||||
#define J2S(_h, _l)	{						\
	/* lower: 21-7e >> 40-9d,9e-fb >> 40-7e,(skip 7f),80-fc */	\
	(_l) += ((_h) % 2) ? 0x1f : 0x7d;				\
	(_h)--;								\
	if ((_l) > 0x7e)						\
		(_l)++;							\
	/* upper: 21-7e >> 81-af >> 81-9f,(skip a0-df),e0-ef */		\
	(_h) = (_h) / 2 + 0x71;						\
	if ((_h) > 0x9f)						\
		(_h) += 0x40;						\
}
|
||||
|
||||
/*
|
||||
* void
|
||||
* S2J(unsigned char *_h, unsigned char *_l)
|
||||
* MS kanji to JIS X 208 transformation.
|
||||
*
|
||||
* Calling/Exit State:
|
||||
* _h and _l should be in valid range.
|
||||
* No return value.
|
||||
*/
|
||||
#define S2J(_h, _l)	{						\
	/* lower: 40-7e,80-fc >> 21-5f,61-dd >> 21-7e,7f-dc */		\
	(_l) -= 0x1f;							\
	if ((_l) > 0x60)						\
		(_l)--;							\
	/* upper: 81-9f,e0-ef >> 00-1e,5f-6e >> 00-2e >> 21-7d */	\
	(_h) -= 0x81;							\
	if ((_h) > 0x5e)						\
		(_h) -= 0x40;						\
	(_h) *= 2;							\
	(_h) += 0x21;							\
	/* upper: ,21-7d >> ,22-7e ; lower: ,7f-dc >> ,21-7e */		\
	if ((_l) > 0x7e) {						\
		(_h)++;							\
		(_l) -= 0x5e;						\
	}								\
}
|
||||
|
||||
/*
|
||||
* int
|
||||
* ISJKANA(const unsigned char *_b)
|
||||
* Tests given byte is in the range of JIS X 0201 katakana.
|
||||
*
|
||||
* Calling/Exit State:
|
||||
* Returns 1 if it is, or 0 otherwise.
|
||||
*/
|
||||
#define ISJKANA(_b) (0xa0 <= (_b) && (_b) < 0xe0)
|
||||
|
||||
/*
|
||||
* int
|
||||
* CANS2JH(const unsigned char *_h)
|
||||
* Tests given byte is in the range of valid first byte of MS
|
||||
* kanji code; either acts as a subroutine of CANS2J() macro
|
||||
* or can be used to parse MS kanji encoded strings.
|
||||
*
|
||||
* Calling/Exit State:
|
||||
* Returns 1 if it is, or 0 otherwise.
|
||||
*/
|
||||
#define CANS2JH(_h) ((0x81 <= (_h) && (_h) < 0xf0) && !ISJKANA(_h))
|
||||
|
||||
/*
|
||||
* int
|
||||
* CANS2JL(const unsigned char *_l)
|
||||
* Tests given byte is in the range of valid second byte of MS
|
||||
* kanji code; acts as a subroutine of CANS2J() macro.
|
||||
*
|
||||
* Calling/Exit State:
|
||||
* Returns 1 if it is, or 0 otherwise.
|
||||
*/
|
||||
#define CANS2JL(_l) (0x40 <= (_l) && (_l) < 0xfd && (_l) != 0x7f)
|
||||
|
||||
/*
|
||||
* int
|
||||
 * CANS2J(const unsigned char *_h, const unsigned char *_l)
|
||||
* Tests given bytes form a MS kanji code point which can be
|
||||
* transformed to a valid JIS X 208 code point.
|
||||
*
|
||||
* Calling/Exit State:
|
||||
* Returns 1 if they are, or 0 otherwise.
|
||||
*/
|
||||
#define CANS2J(_h, _l) (CANS2JH(_h) && CANS2JL(_l))
|
||||
|
||||
/*
|
||||
* int
|
||||
* CANJ2SB(const unsigned char *_b)
|
||||
* Tests given bytes is in the range of valid 94 graphic
|
||||
* character set; acts as a subroutine of CANJ2S() macro.
|
||||
*
|
||||
* Calling/Exit State:
|
||||
* Returns 1 if it is, or 0 otherwise.
|
||||
*/
|
||||
#define CANJ2SB(_b) (0x21 <= (_b) && (_b) < 0x7f)
|
||||
|
||||
/*
 * int
 * CANJ2S(int _h, int _l)
 *	Tests whether the given pair of byte values forms a valid
 *	JIS X 0208 code point (which can be transformed to MS kanji):
 *	both bytes must pass CANJ2SB().
 *
 * Calling/Exit State:
 *	Returns 1 if they do, or 0 otherwise.
 */
#define	CANJ2S(_h, _l)	(CANJ2SB(_h) && CANJ2SB(_l))
|
||||
|
||||
#define JIS208MAX 8407
|
||||
#define GB2312MAX 8795
|
||||
#define BIG5MAX 13973
|
||||
|
||||
extern Rune tabjis208[JIS208MAX]; /* runes indexed by kuten */
|
||||
extern Rune tabgb2312[GB2312MAX];
|
||||
extern Rune tabbig5[BIG5MAX];
|
18
src/cmd/dict/mkfile
Normal file
18
src/cmd/dict/mkfile
Normal file
|
@ -0,0 +1,18 @@
|
|||
PLAN9=../../..
<$PLAN9/src/mkhdr

TARG=dict

# dictionary-specific object files, linked into both dict and mkindex
LFILES=oed.$O ahd.$O pcollins.$O pcollinsg.$O movie.$O slang.$O robert.$O\
	world.$O jis208.$O gb2312.$O thesaurus.$O simple.$O pgw.$O

OFILES=dict.$O\
	$LFILES\
	utils.$O

HFILES=dict.h kuten.h

LDFLAGS=$LDFLAGS -lbio -l9 -lregexp9 -lfmt -lutf
<$PLAN9/src/mkone

# helper program built from mkindex.c (not part of the default target list above)
mkindex: mkindex.$O $LFILES utils.$O
	$LD $LDFLAGS -o $target $prereq
|
106
src/cmd/dict/mkindex.c
Normal file
106
src/cmd/dict/mkindex.c
Normal file
|
@ -0,0 +1,106 @@
|
|||
#include <u.h>
|
||||
#include <libc.h>
|
||||
#include <bio.h>
|
||||
#include "dict.h"
|
||||
|
||||
/*
|
||||
* Use this to start making an index for a new dictionary.
|
||||
* Get the dictionary-specific nextoff and printentry(_,'h')
|
||||
* commands working, add a record to the dicts[] array below,
|
||||
* and run this program to get a list of offset,headword
|
||||
* pairs
|
||||
*/
|
||||
Biobuf boutbuf;
|
||||
Biobuf *bdict;
|
||||
Biobuf *bout = &boutbuf;
|
||||
int linelen;
|
||||
int breaklen = 2000;
|
||||
int outinhibit;
|
||||
int debug;
|
||||
|
||||
Dict *dict; /* current dictionary */
|
||||
|
||||
Entry getentry(long);
|
||||
|
||||
void
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
int i;
|
||||
long a, ae;
|
||||
char *p;
|
||||
Entry e;
|
||||
|
||||
Binit(&boutbuf, 1, OWRITE);
|
||||
dict = &dicts[0];
|
||||
ARGBEGIN {
|
||||
case 'd':
|
||||
p = ARGF();
|
||||
dict = 0;
|
||||
if(p) {
|
||||
for(i=0; dicts[i].name; i++)
|
||||
if(strcmp(p, dicts[i].name)==0) {
|
||||
dict = &dicts[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(!dict) {
|
||||
err("unknown dictionary: %s", p);
|
||||
exits("nodict");
|
||||
}
|
||||
break;
|
||||
case 'D':
|
||||
debug++;
|
||||
break;
|
||||
ARGEND }
|
||||
USED(argc,argv);
|
||||
bdict = Bopen(dict->path, OREAD);
|
||||
ae = Bseek(bdict, 0, 2);
|
||||
if(!bdict) {
|
||||
err("can't open dictionary %s", dict->path);
|
||||
exits("nodict");
|
||||
}
|
||||
for(a = 0; a < ae; a = (*dict->nextoff)(a+1)) {
|
||||
linelen = 0;
|
||||
e = getentry(a);
|
||||
Bprint(bout, "%ld\t", a);
|
||||
linelen = 4; /* only has to be approx right */
|
||||
(*dict->printentry)(e, 'h');
|
||||
}
|
||||
exits(0);
|
||||
}
|
||||
|
||||
Entry
|
||||
getentry(long b)
|
||||
{
|
||||
long e, n, dtop;
|
||||
static Entry ans;
|
||||
static int anslen = 0;
|
||||
|
||||
e = (*dict->nextoff)(b+1);
|
||||
ans.doff = b;
|
||||
if(e < 0) {
|
||||
dtop = Bseek(bdict, 0L, 2);
|
||||
if(b < dtop) {
|
||||
e = dtop;
|
||||
} else {
|
||||
err("couldn't seek to entry");
|
||||
ans.start = 0;
|
||||
ans.end = 0;
|
||||
}
|
||||
}
|
||||
n = e-b;
|
||||
if(n) {
|
||||
if(n > anslen) {
|
||||
ans.start = realloc(ans.start, n);
|
||||
if(!ans.start) {
|
||||
err("out of memory");
|
||||
exits("nomem");
|
||||
}
|
||||
anslen = n;
|
||||
}
|
||||
Bseek(bdict, b, 0);
|
||||
n = Bread(bdict, ans.start, n);
|
||||
ans.end = ans.start + n;
|
||||
}
|
||||
return ans;
|
||||
}
|
328
src/cmd/dict/movie.c
Normal file
328
src/cmd/dict/movie.c
Normal file
|
@ -0,0 +1,328 @@
|
|||
#include <u.h>
|
||||
#include <libc.h>
|
||||
#include <bio.h>
|
||||
#include "dict.h"
|
||||
|
||||
/* Possible tags */
|
||||
enum {
|
||||
BEG, /* beginning of entry */
|
||||
AB, /* abstract */
|
||||
AN, /* database serial number */
|
||||
AS, /* author (one at a time) */
|
||||
AU, /* all authors */
|
||||
AW, /* award_awardee */
|
||||
BW, /* bw or c */
|
||||
CA, /* cast: character_actor */
|
||||
CN, /* cinematography */
|
||||
CO, /* country */
|
||||
CR, /* miscellaneous job_name */
|
||||
DE, /* topic keyword */
|
||||
DR, /* director */
|
||||
ED, /* editor */
|
||||
MP, /* MPAA rating (R, PG, etc.) */
|
||||
NT, /* note */
|
||||
PR, /* producer and for ...*/
|
||||
PS, /* producer (repeats info in PR) */
|
||||
RA, /* rating (letter) */
|
||||
RD, /* release date */
|
||||
RT, /* running time */
|
||||
RV, /* review citation */
|
||||
ST, /* production or release company (repeats info in PR) */
|
||||
TI, /* title[; original foreign title] */
|
||||
TX, /* paragraph of descriptive text */
|
||||
VD, /* video information (format_time_company; or "Not Avail.") */
|
||||
NTAG /* number of tags */
|
||||
};
|
||||
|
||||
/* Assoc tables must be sorted on first field */
|
||||
|
||||
static char *tagtab[] = {
|
||||
[BEG] "$$",
|
||||
[AB] "AB",
|
||||
[AN] "AN",
|
||||
[AS] "AS",
|
||||
[AU] "AU",
|
||||
[AW] "AW",
|
||||
[BW] "BW",
|
||||
[CA] "CA",
|
||||
[CN] "CN",
|
||||
[CO] "CO",
|
||||
[CR] "CR",
|
||||
[DE] "DE",
|
||||
[DR] "DR",
|
||||
[ED] "ED",
|
||||
[MP] "MP",
|
||||
[NT] "NT",
|
||||
[PR] "PR",
|
||||
[PS] "PS",
|
||||
[RA] "RA",
|
||||
[RD] "RD",
|
||||
[RT] "RT",
|
||||
[RV] "RV",
|
||||
[ST] "ST",
|
||||
[TI] "TI",
|
||||
[TX] "TX",
|
||||
[VD] "VD",
|
||||
};
|
||||
|
||||
static char *mget(int, char *, char *, char **);
|
||||
#if 0
|
||||
static void moutall(int, char *, char *);
|
||||
#endif
|
||||
static void moutall2(int, char *, char *);
|
||||
|
||||
void
|
||||
movieprintentry(Entry ent, int cmd)
|
||||
{
|
||||
char *p, *e, *ps, *pe, *pn;
|
||||
int n;
|
||||
|
||||
ps = ent.start;
|
||||
pe = ent.end;
|
||||
if(cmd == 'r') {
|
||||
Bwrite(bout, ps, pe-ps);
|
||||
return;
|
||||
}
|
||||
p = mget(TI, ps, pe, &e);
|
||||
if(p) {
|
||||
outpiece(p, e);
|
||||
outnl(0);
|
||||
}
|
||||
if(cmd == 'h')
|
||||
return;
|
||||
outnl(2);
|
||||
n = 0;
|
||||
p = mget(RD, ps, pe, &e);
|
||||
if(p) {
|
||||
outchars("Released: ");
|
||||
outpiece(p, e);
|
||||
n++;
|
||||
}
|
||||
p = mget(CO, ps, pe, &e);
|
||||
if(p) {
|
||||
if(n)
|
||||
outchars(", ");
|
||||
outpiece(p, e);
|
||||
n++;
|
||||
}
|
||||
p = mget(RT, ps, pe, &e);
|
||||
if(p) {
|
||||
if(n)
|
||||
outchars(", ");
|
||||
outchars("Running time: ");
|
||||
outpiece(p, e);
|
||||
n++;
|
||||
}
|
||||
p = mget(MP, ps, pe, &e);
|
||||
if(p) {
|
||||
if(n)
|
||||
outchars(", ");
|
||||
outpiece(p, e);
|
||||
n++;
|
||||
}
|
||||
p = mget(BW, ps, pe, &e);
|
||||
if(p) {
|
||||
if(n)
|
||||
outchars(", ");
|
||||
if(*p == 'c' || *p == 'C')
|
||||
outchars("Color");
|
||||
else
|
||||
outchars("B&W");
|
||||
n++;
|
||||
}
|
||||
if(n) {
|
||||
outchar('.');
|
||||
outnl(1);
|
||||
}
|
||||
p = mget(VD, ps, pe, &e);
|
||||
if(p) {
|
||||
outchars("Video: ");
|
||||
outpiece(p, e);
|
||||
outnl(1);
|
||||
}
|
||||
p = mget(AU, ps, pe, &e);
|
||||
if(p) {
|
||||
outchars("By: ");
|
||||
moutall2(AU, ps, pe);
|
||||
outnl(1);
|
||||
}
|
||||
p = mget(DR, ps, pe, &e);
|
||||
if(p) {
|
||||
outchars("Director: ");
|
||||
outpiece(p, e);
|
||||
outnl(1);
|
||||
}
|
||||
p = mget(PR, ps, pe, &e);
|
||||
if(p) {
|
||||
outchars("Producer: ");
|
||||
outpiece(p, e);
|
||||
outnl(1);
|
||||
}
|
||||
p = mget(CN, ps, pe, &e);
|
||||
if(p) {
|
||||
outchars("Cinematograpy: ");
|
||||
outpiece(p, e);
|
||||
outnl(1);
|
||||
}
|
||||
p = mget(CR, ps, pe, &e);
|
||||
if(p) {
|
||||
outchars("Other Credits: ");
|
||||
moutall2(CR, ps, pe);
|
||||
}
|
||||
outnl(2);
|
||||
p = mget(CA, ps, pe, &e);
|
||||
if(p) {
|
||||
outchars("Cast: ");
|
||||
moutall2(CA, ps, pe);
|
||||
}
|
||||
outnl(2);
|
||||
p = mget(AW, ps, pe, &e);
|
||||
if(p) {
|
||||
outchars("Awards: ");
|
||||
moutall2(AW, ps, pe);
|
||||
outnl(2);
|
||||
}
|
||||
p = mget(NT, ps, pe, &e);
|
||||
if(p) {
|
||||
outpiece(p, e);
|
||||
outnl(2);
|
||||
}
|
||||
p = mget(AB, ps, pe, &e);
|
||||
if(p) {
|
||||
outpiece(p, e);
|
||||
outnl(2);
|
||||
}
|
||||
pn = ps;
|
||||
n = 0;
|
||||
while((p = mget(TX, pn, pe, &pn)) != 0) {
|
||||
if(n++)
|
||||
outnl(1);
|
||||
outpiece(p, pn);
|
||||
}
|
||||
outnl(0);
|
||||
}
|
||||
|
||||
long
|
||||
movienextoff(long fromoff)
|
||||
{
|
||||
long a;
|
||||
char *p;
|
||||
|
||||
a = Bseek(bdict, fromoff, 0);
|
||||
if(a < 0)
|
||||
return -1;
|
||||
for(;;) {
|
||||
p = Brdline(bdict, '\n');
|
||||
if(!p)
|
||||
break;
|
||||
if(p[0] == '$' && p[1] == '$')
|
||||
return (Boffset(bdict)-Blinelen(bdict));
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
void
|
||||
movieprintkey(void)
|
||||
{
|
||||
Bprint(bout, "No key\n");
|
||||
}
|
||||
|
||||
/*
 * write a comma-separated list of all values of tag found between b and e
 * (compiled out; movieprintentry uses moutall2 instead)
 */
#if 0
static void
moutall(int tag, char *b, char *e)
{
	char *val, *next;
	int first;

	first = 1;
	next = b;
	for(;;) {
		val = mget(tag, next, e, &next);
		if(val == 0)
			break;
		if(!first)
			outchars(", ");
		first = 0;
		outpiece(val, next);
	}
}
#endif
|
||||
|
||||
/*
 * Write a comma-separated list of all values of tag found between
 * b and e, where each value is expected to have the form
 *	field1_field2
 * and is printed as 'field2 (field1)'.  The split is made at the
 * last underscore, because field1 sometimes contains underscores.
 * A value with no underscore is printed as is.
 */
static void
moutall2(int tag, char *b, char *e)
{
	char *val, *next, *us;
	int n, self;

	next = b;
	for(n = 0; (val = mget(tag, next, e, &next)) != 0; n++) {
		if(n > 0)
			outchars(", ");

		/* scan backwards for the last underscore in the value */
		for(us = next-1; us >= val && *us != '_'; us--)
			;
		if(us < val) {
			outpiece(val, next);
			continue;
		}

		/*
		 * Hack to fix cast list Himself/Herself:
		 * those values are printed in their original order.
		 */
		self = strncmp(us+1, "Himself", 7) == 0 ||
			strncmp(us+1, "Herself", 7) == 0;
		if(self) {
			outpiece(val, us);
			outchars(" (");
			outpiece(us+1, next);
		} else {
			outpiece(us+1, next);
			outchars(" (");
			outpiece(val, us);
		}
		outchar(')');
	}
}
|
||||
|
||||
/*
|
||||
* Starting from b, find next line beginning with tagtab[tag].
|
||||
* Don't go past e, but assume *e==0.
|
||||
* Return pointer to beginning of value (after tag), and set
|
||||
* eptr to point at newline that ends the value
|
||||
*/
|
||||
static char *
|
||||
mget(int tag, char *b, char *e, char **eptr)
|
||||
{
|
||||
char *p, *t, *ans;
|
||||
|
||||
if(tag < 0 || tag >= NTAG)
|
||||
return 0;
|
||||
t = tagtab[tag];
|
||||
ans = 0;
|
||||
for(p = b;;) {
|
||||
p = strchr(p, '\n');
|
||||
if(!p || ++p >= e) {
|
||||
if(ans)
|
||||
*eptr = e-1;
|
||||
break;
|
||||
}
|
||||
if(!ans) {
|
||||
if(p[0] == t[0] && p[1] == t[1])
|
||||
ans = p+3;
|
||||
} else {
|
||||
if(p[0] != ' ') {
|
||||
*eptr = p-1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return ans;
|
||||
}
|
1425
src/cmd/dict/oed.c
Normal file
1425
src/cmd/dict/oed.c
Normal file
File diff suppressed because it is too large
Load diff
226
src/cmd/dict/pcollins.c
Normal file
226
src/cmd/dict/pcollins.c
Normal file
|
@ -0,0 +1,226 @@
|
|||
#include <u.h>
|
||||
#include <libc.h>
|
||||
#include <bio.h>
|
||||
#include "dict.h"
|
||||
|
||||
/*
|
||||
* Routines for handling dictionaries in the "Paperback Collins"
|
||||
* format (with tags surrounded by >....<)
|
||||
*/
|
||||
enum {
|
||||
Buflen=1000,
|
||||
};
|
||||
|
||||
/* More special runes */
|
||||
enum {
|
||||
B = MULTIE+1, /* bold */
|
||||
H, /* headword start */
|
||||
I, /* italics */
|
||||
Ps, /* pronunciation start */
|
||||
Pe, /* pronunciation end */
|
||||
R, /* roman */
|
||||
X, /* headword end */
|
||||
};
|
||||
|
||||
/* Assoc tables must be sorted on first field */
|
||||
|
||||
static Assoc tagtab[] = {
|
||||
{"AA", 0xc5},
|
||||
{"AC", LACU},
|
||||
{"B", B},
|
||||
{"CE", LCED},
|
||||
{"CI", LFRN},
|
||||
{"Di", 0x131},
|
||||
{"EL", 0x2d},
|
||||
{"GR", LGRV},
|
||||
{"H", H},
|
||||
{"I", I},
|
||||
{"OE", 0x152},
|
||||
{"R", R},
|
||||
{"TI", LTIL},
|
||||
{"UM", LUML},
|
||||
{"X", X},
|
||||
{"[", Ps},
|
||||
{"]", Pe},
|
||||
{"ac", LACU},
|
||||
{"ce", LCED},
|
||||
{"ci", LFRN},
|
||||
{"gr", LGRV},
|
||||
{"oe", 0x153},
|
||||
{"supe", 0x65}, /* should be raised */
|
||||
{"supo", 0x6f}, /* should be raised */
|
||||
{"ti", LTIL},
|
||||
{"um", LUML},
|
||||
{"{", Ps},
|
||||
{"~", 0x7e},
|
||||
{"~~", MTT},
|
||||
};
|
||||
|
||||
static Rune normtab[128] = {
|
||||
/*0*/ /*1*/ /*2*/ /*3*/ /*4*/ /*5*/ /*6*/ /*7*/
|
||||
/*00*/ NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
|
||||
NONE, NONE, 0x20, NONE, NONE, NONE, NONE, NONE,
|
||||
/*10*/ NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
|
||||
NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
|
||||
/*20*/ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, '\'',
|
||||
0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
|
||||
/*30*/ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
|
||||
0x38, 0x39, 0x3a, 0x3b, TAGE, 0x3d, TAGS, 0x3f,
|
||||
/*40*/ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
|
||||
0x48, 0x49, 0x4a, 0x4b, L'L', 0x4d, 0x4e, 0x4f,
|
||||
/*50*/ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
|
||||
0x58, 0x59, 0x5a, 0x5b, L'\\', 0x5d, 0x5e, 0x5f,
|
||||
/*60*/ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
|
||||
0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
|
||||
/*70*/ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
|
||||
0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, NONE,
|
||||
};
|
||||
|
||||
static char *gettag(char *, char *);
|
||||
|
||||
static Entry curentry;
|
||||
static char tag[Buflen];
|
||||
#define cursize (curentry.end-curentry.start)
|
||||
|
||||
void
|
||||
pcollprintentry(Entry e, int cmd)
|
||||
{
|
||||
char *p, *pe;
|
||||
long r, rprev, t, rlig;
|
||||
int saveoi;
|
||||
Rune *transtab;
|
||||
|
||||
p = e.start;
|
||||
pe = e.end;
|
||||
transtab = normtab;
|
||||
rprev = NONE;
|
||||
changett(0, 0, 0);
|
||||
curentry = e;
|
||||
saveoi = 0;
|
||||
if(cmd == 'h')
|
||||
outinhibit = 1;
|
||||
while(p < pe) {
|
||||
if(cmd == 'r') {
|
||||
outchar(*p++);
|
||||
continue;
|
||||
}
|
||||
r = transtab[(*p++)&0x7F];
|
||||
if(r < NONE) {
|
||||
/* Emit the rune, but buffer in case of ligature */
|
||||
if(rprev != NONE)
|
||||
outrune(rprev);
|
||||
rprev = r;
|
||||
} else if(r == TAGS) {
|
||||
p = gettag(p, pe);
|
||||
t = lookassoc(tagtab, asize(tagtab), tag);
|
||||
if(t == -1) {
|
||||
if(debug && !outinhibit)
|
||||
err("tag %ld %d %s",
|
||||
e.doff, cursize, tag);
|
||||
continue;
|
||||
}
|
||||
if(t < NONE) {
|
||||
if(rprev != NONE)
|
||||
outrune(rprev);
|
||||
rprev = t;
|
||||
} else if(t >= LIGS && t < LIGE) {
|
||||
/* handle possible ligature */
|
||||
rlig = liglookup(t, rprev);
|
||||
if(rlig != NONE)
|
||||
rprev = rlig; /* overwrite rprev */
|
||||
else {
|
||||
/* could print accent, but let's not */
|
||||
if(rprev != NONE) outrune(rprev);
|
||||
rprev = NONE;
|
||||
}
|
||||
} else if(t >= MULTI && t < MULTIE) {
|
||||
if(rprev != NONE) {
|
||||
outrune(rprev);
|
||||
rprev = NONE;
|
||||
}
|
||||
outrunes(multitab[t-MULTI]);
|
||||
} else {
|
||||
if(rprev != NONE) {
|
||||
outrune(rprev);
|
||||
rprev = NONE;
|
||||
}
|
||||
switch(t){
|
||||
case H:
|
||||
if(cmd == 'h')
|
||||
outinhibit = 0;
|
||||
else
|
||||
outnl(0);
|
||||
break;
|
||||
case X:
|
||||
if(cmd == 'h')
|
||||
outinhibit = 1;
|
||||
else
|
||||
outchars(". ");
|
||||
break;
|
||||
case Ps:
|
||||
/* don't know enough of pron. key yet */
|
||||
saveoi = outinhibit;
|
||||
outinhibit = 1;
|
||||
break;
|
||||
case Pe:
|
||||
outinhibit = saveoi;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if(cmd == 'h')
|
||||
outinhibit = 0;
|
||||
outnl(0);
|
||||
}
|
||||
|
||||
long
|
||||
pcollnextoff(long fromoff)
|
||||
{
|
||||
long a;
|
||||
char *p;
|
||||
|
||||
a = Bseek(bdict, fromoff, 0);
|
||||
if(a < 0)
|
||||
return -1;
|
||||
for(;;) {
|
||||
p = Brdline(bdict, '\n');
|
||||
if(!p)
|
||||
break;
|
||||
if(p[0] == '>' && p[1] == 'H' && p[2] == '<')
|
||||
return (Boffset(bdict)-Blinelen(bdict));
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
void
|
||||
pcollprintkey(void)
|
||||
{
|
||||
Bprint(bout, "No pronunciation key yet\n");
|
||||
}
|
||||
|
||||
/*
|
||||
* f points just after '>'; fe points at end of entry.
|
||||
* Expect next characters from bin to match:
|
||||
* [^ <]+<
|
||||
* tag
|
||||
* Accumulate the tag in tag[].
|
||||
* Return pointer to after final '<'.
|
||||
*/
|
||||
static char *
|
||||
gettag(char *f, char *fe)
|
||||
{
|
||||
char *t;
|
||||
int c, i;
|
||||
|
||||
t = tag;
|
||||
i = Buflen;
|
||||
while(--i > 0) {
|
||||
c = *f++;
|
||||
if(c == '<' || f == fe)
|
||||
break;
|
||||
*t++ = c;
|
||||
}
|
||||
*t = 0;
|
||||
return f;
|
||||
}
|
248
src/cmd/dict/pcollinsg.c
Normal file
248
src/cmd/dict/pcollinsg.c
Normal file
|
@ -0,0 +1,248 @@
|
|||
#include <u.h>
|
||||
#include <libc.h>
|
||||
#include <bio.h>
|
||||
#include "dict.h"
|
||||
|
||||
/*
|
||||
* Routines for handling dictionaries in the "Paperback Collins"
|
||||
* `German' format (with tags surrounded by \5⋯\6 and \xba⋯\xba)
|
||||
*/
|
||||
|
||||
/*
|
||||
* \5...\6 escapes (fonts, mostly)
|
||||
*
|
||||
* h headword (helvetica 7 pt)
|
||||
* c clause (helvetica 7 pt)
|
||||
* 3 helvetica 7 pt
|
||||
* 4 helvetica 6.5 pt
|
||||
* s helvetica 8 pt
|
||||
* x helvetica 8 pt
|
||||
* y helvetica 5 pt
|
||||
* m helvetica 30 pt
|
||||
* 1 roman 6 pt
|
||||
* 9 roman 4.5 pt
|
||||
* p roman 7 pt
|
||||
* q roman 4.5 pt
|
||||
* 2 italic 6 pt
|
||||
* 7 italic 4.5 pt
|
||||
* b bold 6 pt
|
||||
* a `indent 0:4 left'
|
||||
* k `keep 9'
|
||||
* l `size 12'
|
||||
*/
|
||||
|
||||
enum {
|
||||
IBASE=0x69, /* dotless i */
|
||||
Taglen=32,
|
||||
};
|
||||
|
||||
static Rune intab[256] = {
|
||||
/*0*/ /*1*/ /*2*/ /*3*/ /*4*/ /*5*/ /*6*/ /*7*/
|
||||
/*00*/ NONE, NONE, NONE, NONE, NONE, TAGS, TAGE, NONE,
|
||||
NONE, NONE, NONE, NONE, NONE, 0x20, NONE, NONE,
|
||||
/*10*/ NONE, 0x2d, 0x20, 0x20, NONE, NONE, NONE, NONE,
|
||||
0x20, NONE, NONE, NONE, 0x20, NONE, NONE, 0x2d,
|
||||
/*20*/ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, '\'',
|
||||
0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
|
||||
/*30*/ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
|
||||
0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
|
||||
/*40*/ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
|
||||
0x48, 0x49, 0x4a, 0x4b, L'L', 0x4d, 0x4e, 0x4f,
|
||||
/*50*/ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
|
||||
0x58, 0x59, 0x5a, 0x5b, L'\\', 0x5d, 0x5e, 0x5f,
|
||||
/*60*/ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
|
||||
0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
|
||||
/*70*/ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
|
||||
0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, NONE,
|
||||
/*80*/ NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
|
||||
NONE, NONE, 0x20, NONE, NONE, NONE, NONE, NONE,
|
||||
/*90*/ 0xdf, 0xe6, NONE, MOE, NONE, NONE, NONE, 0xf8,
|
||||
NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
|
||||
/*A0*/ NONE, NONE, 0x22, 0xa3, NONE, NONE, NONE, NONE,
|
||||
NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
|
||||
/*B0*/ NONE, NONE, NONE, NONE, NONE, NONE, NONE, 0x7e,
|
||||
NONE, IBASE, SPCS, NONE, NONE, NONE, NONE, NONE,
|
||||
/*C0*/ NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
|
||||
NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
|
||||
/*D0*/ NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
|
||||
NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
|
||||
/*E0*/ NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
|
||||
NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
|
||||
/*F0*/ 0x20, 0x20, NONE, NONE, NONE, NONE, NONE, NONE,
|
||||
NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
|
||||
};
|
||||
|
||||
static Nassoc numtab[] = {
|
||||
{1, 0x2b},
|
||||
{4, 0x3d},
|
||||
{7, 0xb0},
|
||||
{11, 0x2248},
|
||||
{69, 0x2666},
|
||||
{114, 0xae},
|
||||
{340, 0x25b},
|
||||
{341, 0x254},
|
||||
{342, 0x28c},
|
||||
{343, 0x259},
|
||||
{345, 0x292},
|
||||
{346, 0x283},
|
||||
{347, 0x275},
|
||||
{348, 0x28a},
|
||||
{349, 0x2c8},
|
||||
{351, 0x26a},
|
||||
{352, 0x25c},
|
||||
{354, 0x251},
|
||||
{355, 0x7e},
|
||||
{356, 0x252},
|
||||
{384, 0x273},
|
||||
{445, 0xf0}, /* BUG -- should be script eth */
|
||||
};
|
||||
|
||||
static Nassoc overtab[] = {
|
||||
{0x2c, LCED},
|
||||
{0x2f, LACU},
|
||||
{0x3a, LUML},
|
||||
{L'\\', LGRV},
|
||||
{0x5e, LFRN},
|
||||
{0x7e, LTIL},
|
||||
};
|
||||
|
||||
static uchar *reach(uchar*, int);
|
||||
|
||||
static Entry curentry;
|
||||
static char tag[Taglen];
|
||||
|
||||
void
|
||||
pcollgprintentry(Entry e, int cmd)
|
||||
{
|
||||
uchar *p, *pe;
|
||||
int r, rprev = NONE, rx, over = 0, font;
|
||||
char buf[16];
|
||||
|
||||
p = (uchar *)e.start;
|
||||
pe = (uchar *)e.end;
|
||||
curentry = e;
|
||||
if(cmd == 'h')
|
||||
outinhibit = 1;
|
||||
while(p < pe){
|
||||
if(cmd == 'r'){
|
||||
outchar(*p++);
|
||||
continue;
|
||||
}
|
||||
switch(r = intab[*p++]){ /* assign = */
|
||||
case TAGS:
|
||||
if(rprev != NONE){
|
||||
outrune(rprev);
|
||||
rprev = NONE;
|
||||
}
|
||||
p = reach(p, 0x06);
|
||||
font = tag[0];
|
||||
if(cmd == 'h')
|
||||
outinhibit = (font != 'h');
|
||||
break;
|
||||
|
||||
case TAGE: /* an extra one */
|
||||
break;
|
||||
|
||||
case SPCS:
|
||||
p = reach(p, 0xba);
|
||||
r = looknassoc(numtab, asize(numtab), strtol(tag,0,0));
|
||||
if(r < 0){
|
||||
if(rprev != NONE){
|
||||
outrune(rprev);
|
||||
rprev = NONE;
|
||||
}
|
||||
sprint(buf, "\\N'%s'", tag);
|
||||
outchars(buf);
|
||||
break;
|
||||
}
|
||||
/* else fall through */
|
||||
|
||||
default:
|
||||
if(over){
|
||||
rx = looknassoc(overtab, asize(overtab), r);
|
||||
if(rx > 0)
|
||||
rx = liglookup(rx, rprev);
|
||||
if(rx > 0 && rx != NONE)
|
||||
outrune(rx);
|
||||
else{
|
||||
outrune(rprev);
|
||||
if(r == ':')
|
||||
outrune(0xa8);
|
||||
else{
|
||||
outrune(0x5e);
|
||||
outrune(r);
|
||||
}
|
||||
}
|
||||
over = 0;
|
||||
rprev = NONE;
|
||||
}else if(r == '^'){
|
||||
over = 1;
|
||||
}else{
|
||||
if(rprev != NONE)
|
||||
outrune(rprev);
|
||||
rprev = r;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
if(rprev != NONE)
|
||||
outrune(rprev);
|
||||
if(cmd == 'h')
|
||||
outinhibit = 0;
|
||||
outnl(0);
|
||||
}
|
||||
|
||||
long
|
||||
pcollgnextoff(long fromoff)
|
||||
{
|
||||
int c, state = 0, defoff = -1;
|
||||
|
||||
if(Bseek(bdict, fromoff, 0) < 0)
|
||||
return -1;
|
||||
while((c = Bgetc(bdict)) >= 0){
|
||||
if(c == '\r')
|
||||
defoff = Boffset(bdict);
|
||||
switch(state){
|
||||
case 0:
|
||||
if(c == 0x05)
|
||||
state = 1;
|
||||
break;
|
||||
case 1:
|
||||
if(c == 'h')
|
||||
state = 2;
|
||||
else
|
||||
state = 0;
|
||||
break;
|
||||
case 2:
|
||||
if(c == 0x06)
|
||||
return (Boffset(bdict)-3);
|
||||
else
|
||||
state = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return defoff;
|
||||
}
|
||||
|
||||
void
|
||||
pcollgprintkey(void)
|
||||
{
|
||||
Bprint(bout, "No pronunciation key yet\n");
|
||||
}
|
||||
|
||||
static uchar *
|
||||
reach(uchar *p, int tagchar)
|
||||
{
|
||||
int c; char *q=tag;
|
||||
|
||||
while(p < (uchar *)curentry.end){
|
||||
c = *p++;
|
||||
if(c == tagchar)
|
||||
break;
|
||||
*q++ = c;
|
||||
if(q >= &tag[sizeof tag-1])
|
||||
break;
|
||||
}
|
||||
*q = 0;
|
||||
return p;
|
||||
}
|
1165
src/cmd/dict/pgw.c
Normal file
1165
src/cmd/dict/pgw.c
Normal file
File diff suppressed because it is too large
Load diff
6
src/cmd/dict/rev.awk
Normal file
6
src/cmd/dict/rev.awk
Normal file
|
@ -0,0 +1,6 @@
|
|||
# Swap the two fields of each input line; flag any line that
# does not have exactly two fields.
{
	if(NF == 2)
		printf "%s\t%s\n", $2, $1
	else
		print "ERROR " $0
}
|
312
src/cmd/dict/robert.c
Normal file
312
src/cmd/dict/robert.c
Normal file
|
@ -0,0 +1,312 @@
|
|||
#include <u.h>
|
||||
#include <libc.h>
|
||||
#include <bio.h>
|
||||
#include "dict.h"
|
||||
|
||||
/*
|
||||
* Robert Électronique.
|
||||
*/
|
||||
|
||||
enum
|
||||
{
|
||||
CIT = MULTIE+1, /* citation ptr followed by long int and ascii label */
|
||||
BROM, /* bold roman */
|
||||
ITON, /* start italic */
|
||||
ROM, /* roman */
|
||||
SYM, /* symbol font? */
|
||||
HEL, /* helvetica */
|
||||
BHEL, /* helvetica bold */
|
||||
SMALL, /* smaller? */
|
||||
ITOFF, /* end italic */
|
||||
SUP, /* following character is superscript */
|
||||
SUB /* following character is subscript */
|
||||
};
|
||||
|
||||
static Rune intab[256] = {
|
||||
/*0*/ /*1*/ /*2*/ /*3*/ /*4*/ /*5*/ /*6*/ /*7*/
|
||||
/*00*/ NONE, 0x263a, 0x263b, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022,
|
||||
0x25d8, 0x298, L'\n', 0x2642, 0x2640, 0x266a, 0x266b, 0x203b,
|
||||
/*10*/ 0x21e8, 0x21e6, 0x2195, 0x203c, 0xb6, 0xa7, 0x2043, 0x21a8,
|
||||
0x2191, 0x2193, 0x2192, 0x2190, 0x2319, 0x2194, 0x25b4, 0x25be,
|
||||
/*20*/ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, L'\'',
|
||||
0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
|
||||
/*30*/ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
|
||||
0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
|
||||
/*40*/ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
|
||||
0x48, 0x49, 0x4a, 0x4b, L'L', 0x4d, 0x4e, 0x4f,
|
||||
/*50*/ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
|
||||
0x58, 0x59, 0x5a, 0x5b, L'\\', 0x5d, 0x5e, 0x5f,
|
||||
/*60*/ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
|
||||
0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
|
||||
/*70*/ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
|
||||
0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
|
||||
/*80*/ 0xc7, 0xfc, 0xe9, 0xe2, 0xe4, 0xe0, 0xe5, 0xe7,
|
||||
0xea, 0xeb, 0xe8, 0xef, 0xee, 0xec, 0xc4, 0xc5,
|
||||
/*90*/ 0xc9, 0xe6, 0xc6, 0xf4, 0xf6, 0xf2, 0xfb, 0xf9,
|
||||
0xff, 0xd6, 0xdc, 0xa2, 0xa3, 0xa5, 0x20a7, 0x283,
|
||||
/*a0*/ 0xe1, 0xed, 0xf3, 0xfa, 0xf1, 0xd1, 0xaa, 0xba,
|
||||
0xbf, 0x2310, 0xac, 0xbd, 0xbc, 0xa1, 0xab, 0xbb,
|
||||
/*b0*/ NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
|
||||
NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
|
||||
/*c0*/ NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
|
||||
CIT, BROM, NONE, ITON, ROM, SYM, HEL, BHEL,
|
||||
/*d0*/ NONE, SMALL, ITOFF, SUP, SUB, NONE, NONE, NONE,
|
||||
NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
|
||||
/*e0*/ 0x3b1, 0xdf, 0x3b3, 0x3c0, 0x3a3, 0x3c3, 0xb5, 0x3c4,
|
||||
0x3a6, 0x398, 0x3a9, 0x3b4, 0x221e, 0xd8, 0x3b5, 0x2229,
|
||||
/*f0*/ 0x2261, 0xb1, 0x2265, 0x2264, 0x2320, 0x2321, 0xf7, 0x2248,
|
||||
0xb0, 0x2219, 0xb7, 0x221a, 0x207f, 0xb2, 0x220e, 0xa0,
|
||||
};
|
||||
|
||||
static Rune suptab[] = {
|
||||
['0'] 0x2070, ['1'] 0x2071, ['2'] 0x2072, ['3'] 0x2073,
|
||||
['4'] 0x2074, ['5'] 0x2075, ['6'] 0x2076, ['7'] 0x2077,
|
||||
['8'] 0x2078, ['9'] 0x2079, ['+'] 0x207a, ['-'] 0x207b,
|
||||
['='] 0x207c, ['('] 0x207d, [')'] 0x207e, ['a'] 0xaa,
|
||||
['n'] 0x207f, ['o'] 0xba
|
||||
};
|
||||
|
||||
static Rune subtab[] = {
|
||||
['0'] 0x2080, ['1'] 0x2081, ['2'] 0x2082, ['3'] 0x2083,
|
||||
['4'] 0x2084, ['5'] 0x2085, ['6'] 0x2086, ['7'] 0x2087,
|
||||
['8'] 0x2088, ['9'] 0x2089, ['+'] 0x208a, ['-'] 0x208b,
|
||||
['='] 0x208c, ['('] 0x208d, [')'] 0x208e
|
||||
};
|
||||
|
||||
#define GSHORT(p) (((p)[0]<<8) | (p)[1])
|
||||
#define GLONG(p) (((p)[0]<<24) | ((p)[1]<<16) | ((p)[2]<<8) | (p)[3])
|
||||
|
||||
static char cfile[] = "/lib/dict/robert/cits.rob";
|
||||
static char dfile[] = "/lib/dict/robert/defs.rob";
|
||||
static char efile[] = "/lib/dict/robert/etym.rob";
|
||||
static char kfile[] = "/lib/dict/robert/_phon";
|
||||
|
||||
static Biobuf * cb;
|
||||
static Biobuf * db;
|
||||
static Biobuf * eb;
|
||||
|
||||
static Biobuf * Bouvrir(char*);
|
||||
static void citation(int, int);
|
||||
static void robertprintentry(Entry*, Entry*, int);
|
||||
|
||||
void
|
||||
robertindexentry(Entry e, int cmd)
|
||||
{
|
||||
uchar *p = (uchar *)e.start;
|
||||
long ea, el, da, dl, fa;
|
||||
Entry def, etym;
|
||||
|
||||
ea = GLONG(&p[0]);
|
||||
el = GSHORT(&p[4]);
|
||||
da = GLONG(&p[6]);
|
||||
dl = GSHORT(&p[10]);
|
||||
fa = GLONG(&p[12]);
|
||||
USED(fa);
|
||||
|
||||
if(db == 0)
|
||||
db = Bouvrir(dfile);
|
||||
def.start = malloc(dl+1);
|
||||
def.end = def.start + dl;
|
||||
def.doff = da;
|
||||
Bseek(db, da, 0);
|
||||
Bread(db, def.start, dl);
|
||||
*def.end = 0;
|
||||
if(cmd == 'h'){
|
||||
robertprintentry(&def, 0, cmd);
|
||||
}else{
|
||||
if(eb == 0)
|
||||
eb = Bouvrir(efile);
|
||||
etym.start = malloc(el+1);
|
||||
etym.end = etym.start + el;
|
||||
etym.doff = ea;
|
||||
Bseek(eb, ea, 0);
|
||||
Bread(eb, etym.start, el);
|
||||
*etym.end = 0;
|
||||
robertprintentry(&def, &etym, cmd);
|
||||
free(etym.start);
|
||||
}
|
||||
free(def.start);
|
||||
}
|
||||
|
||||
static void
|
||||
robertprintentry(Entry *def, Entry *etym, int cmd)
|
||||
{
|
||||
uchar *p, *pe;
|
||||
Rune r; int c, n;
|
||||
int baseline = 0;
|
||||
int lineno = 0;
|
||||
int cit = 0;
|
||||
|
||||
p = (uchar *)def->start;
|
||||
pe = (uchar *)def->end;
|
||||
while(p < pe){
|
||||
if(cmd == 'r'){
|
||||
outchar(*p++);
|
||||
continue;
|
||||
}
|
||||
c = *p++;
|
||||
switch(r = intab[c]){ /* assign = */
|
||||
case BROM:
|
||||
case ITON:
|
||||
case ROM:
|
||||
case SYM:
|
||||
case HEL:
|
||||
case BHEL:
|
||||
case SMALL:
|
||||
case ITOFF:
|
||||
case NONE:
|
||||
if(debug)
|
||||
outprint("\\%.2ux", c);
|
||||
baseline = 0;
|
||||
break;
|
||||
|
||||
case SUP:
|
||||
baseline = 1;
|
||||
break;
|
||||
|
||||
case SUB:
|
||||
baseline = -1;
|
||||
break;
|
||||
|
||||
case CIT:
|
||||
n = p[0] | (p[1]<<8) | (p[2]<<16) | (p[3]<<24);
|
||||
p += 4;
|
||||
if(debug)
|
||||
outprint("[%d]", n);
|
||||
while(*p == ' ' || ('0'<=*p && *p<='9') || *p == '.'){
|
||||
if(debug)
|
||||
outchar(*p);
|
||||
++p;
|
||||
}
|
||||
++cit;
|
||||
outnl(2);
|
||||
citation(n, cmd);
|
||||
baseline = 0;
|
||||
break;
|
||||
|
||||
case '\n':
|
||||
outnl(0);
|
||||
baseline = 0;
|
||||
++lineno;
|
||||
break;
|
||||
|
||||
default:
|
||||
if(baseline > 0 && r < nelem(suptab))
|
||||
r = suptab[r];
|
||||
else if(baseline < 0 && r < nelem(subtab))
|
||||
r = subtab[r];
|
||||
if(cit){
|
||||
outchar('\n');
|
||||
cit = 0;
|
||||
}
|
||||
outrune(r);
|
||||
baseline = 0;
|
||||
break;
|
||||
}
|
||||
if(r == '\n'){
|
||||
if(cmd == 'h')
|
||||
break;
|
||||
if(lineno == 1 && etym)
|
||||
robertprintentry(etym, 0, cmd);
|
||||
}
|
||||
}
|
||||
outnl(0);
|
||||
}
|
||||
|
||||
static void
|
||||
citation(int addr, int cmd)
|
||||
{
|
||||
Entry cit;
|
||||
|
||||
if(cb == 0)
|
||||
cb = Bouvrir(cfile);
|
||||
Bseek(cb, addr, 0);
|
||||
cit.start = Brdline(cb, 0xc8);
|
||||
cit.end = cit.start + Blinelen(cb) - 1;
|
||||
cit.doff = addr;
|
||||
*cit.end = 0;
|
||||
robertprintentry(&cit, 0, cmd);
|
||||
}
|
||||
|
||||
/*
 * Index records are 16 bytes long: return the offset of the first
 * 16-byte boundary that lies strictly after fromoff.
 */
long
robertnextoff(long fromoff)
{
	return (fromoff | 15) + 1;
}
|
||||
|
||||
void
|
||||
robertprintkey(void)
|
||||
{
|
||||
Biobuf *db;
|
||||
char *l;
|
||||
|
||||
db = Bouvrir(kfile);
|
||||
while(l = Brdline(db, '\n')) /* assign = */
|
||||
Bwrite(bout, l, Blinelen(db));
|
||||
Bterm(db);
|
||||
}
|
||||
|
||||
void
|
||||
robertflexentry(Entry e, int cmd)
|
||||
{
|
||||
uchar *p, *pe;
|
||||
Rune r; int c;
|
||||
int lineno = 1;
|
||||
|
||||
p = (uchar *)e.start;
|
||||
pe = (uchar *)e.end;
|
||||
while(p < pe){
|
||||
if(cmd == 'r'){
|
||||
Bputc(bout, *p++);
|
||||
continue;
|
||||
}
|
||||
c = *p++;
|
||||
r = intab[c];
|
||||
if(r == '$')
|
||||
r = '\n';
|
||||
if(r == '\n'){
|
||||
++lineno;
|
||||
if(cmd == 'h' && lineno > 2)
|
||||
break;
|
||||
}
|
||||
if(cmd == 'h' && lineno < 2)
|
||||
continue;
|
||||
if(r > MULTIE){
|
||||
if(debug)
|
||||
Bprint(bout, "\\%.2ux", c);
|
||||
continue;
|
||||
}
|
||||
if(r < Runeself)
|
||||
Bputc(bout, r);
|
||||
else
|
||||
Bputrune(bout, r);
|
||||
}
|
||||
outnl(0);
|
||||
}
|
||||
|
||||
long
|
||||
robertnextflex(long fromoff)
|
||||
{
|
||||
int c;
|
||||
|
||||
if(Bseek(bdict, fromoff, 0) < 0)
|
||||
return -1;
|
||||
while((c = Bgetc(bdict)) >= 0){
|
||||
if(c == '$')
|
||||
return Boffset(bdict);
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
static Biobuf *
|
||||
Bouvrir(char *fichier)
|
||||
{
|
||||
Biobuf *db;
|
||||
|
||||
db = Bopen(fichier, OREAD);
|
||||
if(db == 0){
|
||||
fprint(2, "%s: impossible d'ouvrir %s: %r\n", argv0, fichier);
|
||||
exits("ouvrir");
|
||||
}
|
||||
return db;
|
||||
}
|
46
src/cmd/dict/simple.c
Normal file
46
src/cmd/dict/simple.c
Normal file
|
@ -0,0 +1,46 @@
|
|||
#include <u.h>
|
||||
#include <libc.h>
|
||||
#include <bio.h>
|
||||
#include "dict.h"
|
||||
|
||||
/*
|
||||
* Routines for handling dictionaries in UTF, headword
|
||||
* separated from entry by tab, entries separated by newline.
|
||||
*/
|
||||
|
||||
void
|
||||
simpleprintentry(Entry e, int cmd)
|
||||
{
|
||||
uchar *p, *pe;
|
||||
|
||||
p = (uchar *)e.start;
|
||||
pe = (uchar *)e.end;
|
||||
while(p < pe){
|
||||
if(*p == '\t'){
|
||||
if(cmd == 'h')
|
||||
break;
|
||||
else
|
||||
outchar(' '), ++p;
|
||||
}else if(*p == '\n')
|
||||
break;
|
||||
else
|
||||
outchar(*p++);
|
||||
}
|
||||
outnl(0);
|
||||
}
|
||||
|
||||
long
|
||||
simplenextoff(long fromoff)
|
||||
{
|
||||
if(Bseek(bdict, fromoff, 0) < 0)
|
||||
return -1;
|
||||
if(Brdline(bdict, '\n') == 0)
|
||||
return -1;
|
||||
return Boffset(bdict);
|
||||
}
|
||||
|
||||
void
|
||||
simpleprintkey(void)
|
||||
{
|
||||
Bprint(bout, "No pronunciation key.\n");
|
||||
}
|
203
src/cmd/dict/slang.c
Normal file
203
src/cmd/dict/slang.c
Normal file
|
@ -0,0 +1,203 @@
|
|||
#include <u.h>
|
||||
#include <libc.h>
|
||||
#include <bio.h>
|
||||
#include "dict.h"
|
||||
|
||||
/* Codes for the two-letter tags that can start a line of an entry */
enum {
	DF = 0,		/* definition */
	DX = 1,		/* definition/example */
	ET = 2,		/* etymology */
	EX = 3,		/* example */
	LA = 4,		/* label */
	ME = 5,		/* main entry */
	NU = 6,		/* sense number */
	PR = 7,		/* pronunciation */
	PS = 8,		/* grammar part */
	XR = 9,		/* cross reference */
	XX = 10,	/* cross reference (whole entry) */
};
|
||||
|
||||
/* Assoc tables must be sorted on first field */
|
||||
|
||||
static Assoc tagtab[] = {
|
||||
{"df", DF},
|
||||
{"dx", DX},
|
||||
{"et", ET},
|
||||
{"ex", EX},
|
||||
{"la", LA},
|
||||
{"me", ME},
|
||||
{"nu", NU},
|
||||
{"pr", PR},
|
||||
{"ps", PS},
|
||||
{"xr", XR},
|
||||
{"xx", XX},
|
||||
};
|
||||
static long sget(char *, char *, char **, char **);
|
||||
static void soutpiece(char *, char *);
|
||||
|
||||
void
|
||||
slangprintentry(Entry e, int cmd)
|
||||
{
|
||||
char *p, *pe, *vs, *ve;
|
||||
long t;
|
||||
|
||||
p = e.start;
|
||||
pe = e.end;
|
||||
if(cmd == 'h') {
|
||||
t = sget(p, pe, &vs, &ve);
|
||||
if(t == ME)
|
||||
soutpiece(vs, ve);
|
||||
outnl(0);
|
||||
return;
|
||||
}
|
||||
while(p < pe) {
|
||||
switch(sget(p, pe, &vs, &ve)) {
|
||||
case DF:
|
||||
soutpiece(vs, ve);
|
||||
outchars(". ");
|
||||
break;
|
||||
case DX:
|
||||
soutpiece(vs, ve);
|
||||
outchars(". ");
|
||||
break;
|
||||
case ET:
|
||||
outchars("[");
|
||||
soutpiece(vs, ve);
|
||||
outchars("] ");
|
||||
break;
|
||||
case EX:
|
||||
outchars("E.g., ");
|
||||
soutpiece(vs, ve);
|
||||
outchars(". ");
|
||||
break;
|
||||
case LA:
|
||||
outchars("(");
|
||||
soutpiece(vs, ve);
|
||||
outchars(") ");
|
||||
break;
|
||||
case ME:
|
||||
outnl(0);
|
||||
soutpiece(vs, ve);
|
||||
outnl(0);
|
||||
break;
|
||||
case NU:
|
||||
outnl(2);
|
||||
soutpiece(vs, ve);
|
||||
outchars(". ");
|
||||
break;
|
||||
case PR:
|
||||
outchars("[");
|
||||
soutpiece(vs, ve);
|
||||
outchars("] ");
|
||||
break;
|
||||
case PS:
|
||||
outnl(1);
|
||||
soutpiece(vs, ve);
|
||||
outchars(". ");
|
||||
break;
|
||||
case XR:
|
||||
outchars("See ");
|
||||
soutpiece(vs, ve);
|
||||
outchars(". ");
|
||||
break;
|
||||
case XX:
|
||||
outchars("See ");
|
||||
soutpiece(vs, ve);
|
||||
outchars(". ");
|
||||
break;
|
||||
default:
|
||||
ve = pe; /* will end loop */
|
||||
break;
|
||||
}
|
||||
p = ve;
|
||||
}
|
||||
outnl(0);
|
||||
}
|
||||
|
||||
long
|
||||
slangnextoff(long fromoff)
|
||||
{
|
||||
long a;
|
||||
char *p;
|
||||
|
||||
a = Bseek(bdict, fromoff, 0);
|
||||
if(a < 0)
|
||||
return -1;
|
||||
for(;;) {
|
||||
p = Brdline(bdict, '\n');
|
||||
if(!p)
|
||||
break;
|
||||
if(p[0] == 'm' && p[1] == 'e' && p[2] == ' ')
|
||||
return (Boffset(bdict)-Blinelen(bdict));
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
void
|
||||
slangprintkey(void)
|
||||
{
|
||||
Bprint(bout, "No key\n");
|
||||
}
|
||||
|
||||
/*
|
||||
* Starting from b, find next line beginning with a tag.
|
||||
* Don't go past e, but assume *e==0.
|
||||
* Return tag value, or -1 if no more tags before e.
|
||||
* Set pvb to beginning of value (after tag).
|
||||
* Set pve to point at newline that ends the value.
|
||||
*/
|
||||
static long
|
||||
sget(char *b, char *e, char **pvb, char **pve)
|
||||
{
|
||||
char *p;
|
||||
char buf[3];
|
||||
long t, tans;
|
||||
|
||||
buf[2] = 0;
|
||||
tans = -1;
|
||||
for(p = b;;) {
|
||||
if(p[2] == ' ') {
|
||||
buf[0] = p[0];
|
||||
buf[1] = p[1];
|
||||
t = lookassoc(tagtab, asize(tagtab), buf);
|
||||
if(t < 0) {
|
||||
if(debug)
|
||||
err("tag %s\n", buf);
|
||||
p += 3;
|
||||
} else {
|
||||
if(tans < 0) {
|
||||
p += 3;
|
||||
tans = t;
|
||||
*pvb = p;
|
||||
} else {
|
||||
*pve = p;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
p = strchr(p, '\n');
|
||||
if(!p || ++p >= e) {
|
||||
if(tans >= 0)
|
||||
*pve = e-1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return tans;
|
||||
}
|
||||
|
||||
/*
 * Output the bytes in [b, e): newlines become blanks, a blank that
 * directly follows another blank is dropped, and '@' is never
 * printed.
 */
static void
soutpiece(char *b, char *e)
{
	int ch, prev;

	for(prev = 0; b < e; prev = ch){
		ch = *b++;
		if(ch == '\n')
			ch = ' ';
		if(ch == '@')
			continue;
		if(ch == ' ' && prev == ' ')
			continue;
		outchar(ch);
	}
}
|
13
src/cmd/dict/t.awk
Normal file
13
src/cmd/dict/t.awk
Normal file
|
@ -0,0 +1,13 @@
|
|||
# Two-field records whose second field lists alternatives joined by
# " or " are expanded into one "field1<TAB>alternative" line each,
# unless the field contains "(or".  Every other record is copied
# through unchanged.
{
	if(NF != 2 || $2 !~ / or / || $2 ~ /\(or/){
		print $0
		next
	}
	cnt = split($2, alt, / or /)
	for(k = 1; k <= cnt; k++)
		printf "%s\t%s\n", $1, alt[k]
}
|
86
src/cmd/dict/thesaurus.c
Normal file
86
src/cmd/dict/thesaurus.c
Normal file
|
@ -0,0 +1,86 @@
|
|||
#include <u.h>
|
||||
#include <libc.h>
|
||||
#include <bio.h>
|
||||
#include "dict.h"
|
||||
|
||||
void
|
||||
thesprintentry(Entry e, int cmd)
|
||||
{
|
||||
char *p, *pe;
|
||||
int c, i;
|
||||
|
||||
p = e.start;
|
||||
pe = e.end;
|
||||
while(p < pe) {
|
||||
c = *p++;
|
||||
if(cmd == 'r') {
|
||||
outchar(c);
|
||||
continue;
|
||||
}
|
||||
switch(c) {
|
||||
case '*':
|
||||
c = *p++;
|
||||
if(cmd == 'h' && c != 'L') {
|
||||
outnl(0);
|
||||
return;
|
||||
}
|
||||
if(c == 'L' && cmd != 'h')
|
||||
outnl(0);
|
||||
if(c == 'S') {
|
||||
outchar('(');
|
||||
outchar(*p++);
|
||||
outchar(')');
|
||||
}
|
||||
break;
|
||||
case '#':
|
||||
c = *p++;
|
||||
i = *p++ - '0' - 1;
|
||||
if(i < 0 || i > 4)
|
||||
break;
|
||||
switch(c) {
|
||||
case 'a': outrune(L"áàâäa"[i]); break;
|
||||
case 'e': outrune(L"éèêëe"[i]); break;
|
||||
case 'o': outrune(L"óòôöo"[i]); break;
|
||||
case 'c': outrune(L"ccccç"[i]); break;
|
||||
default: outchar(c); break;
|
||||
}
|
||||
break;
|
||||
case '+':
|
||||
case '<':
|
||||
break;
|
||||
case ' ':
|
||||
if(cmd == 'h' && *p == '*') {
|
||||
outnl(0);
|
||||
return;
|
||||
}
|
||||
default:
|
||||
outchar(c);
|
||||
}
|
||||
}
|
||||
outnl(0);
|
||||
}
|
||||
|
||||
long
|
||||
thesnextoff(long fromoff)
|
||||
{
|
||||
long a;
|
||||
char *p;
|
||||
|
||||
a = Bseek(bdict, fromoff, 0);
|
||||
if(a < 0)
|
||||
return -1;
|
||||
for(;;) {
|
||||
p = Brdline(bdict, '\n');
|
||||
if(!p)
|
||||
break;
|
||||
if(p[0] == '*' && p[1] == 'L')
|
||||
return (Boffset(bdict)-Blinelen(bdict));
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
void
|
||||
thesprintkey(void)
|
||||
{
|
||||
Bprint(bout, "No key\n");
|
||||
}
|
577
src/cmd/dict/utils.c
Normal file
577
src/cmd/dict/utils.c
Normal file
|
@ -0,0 +1,577 @@
|
|||
#include <u.h>
|
||||
#include <libc.h>
|
||||
#include <bio.h>
|
||||
#include "dict.h"
|
||||
|
||||
Dict dicts[] = {
|
||||
{"oed", "Oxford English Dictionary, 2nd Ed.",
|
||||
"dict/oed2", "dict/oed2index",
|
||||
oednextoff, oedprintentry, oedprintkey},
|
||||
{"ahd", "American Heritage Dictionary, 2nd College Ed.",
|
||||
"ahd/DICT.DB", "ahd/index",
|
||||
ahdnextoff, ahdprintentry, ahdprintkey},
|
||||
{"pgw", "Project Gutenberg Webster Dictionary",
|
||||
"dict/pgw", "dict/pgwindex",
|
||||
pgwnextoff, pgwprintentry, pgwprintkey},
|
||||
{"thesaurus", "Collins Thesaurus",
|
||||
"dict/thesaurus", "dict/thesindex",
|
||||
thesnextoff, thesprintentry, thesprintkey},
|
||||
|
||||
{"ce", "Gendai Chinese->English",
|
||||
"dict/world/sansdata/sandic24.dat",
|
||||
"dict/world/sansdata/ceindex",
|
||||
worldnextoff, worldprintentry, worldprintkey},
|
||||
{"ceh", "Gendai Chinese->English (Hanzi index)",
|
||||
"dict/world/sansdata/sandic24.dat",
|
||||
"dict/world/sansdata/cehindex",
|
||||
worldnextoff, worldprintentry, worldprintkey},
|
||||
{"ec", "Gendai English->Chinese",
|
||||
"dict/world/sansdata/sandic24.dat",
|
||||
"dict/world/sansdata/ecindex",
|
||||
worldnextoff, worldprintentry, worldprintkey},
|
||||
|
||||
{"dae", "Gyldendal Danish->English",
|
||||
"dict/world/gylddata/sandic30.dat",
|
||||
"dict/world/gylddata/daeindex",
|
||||
worldnextoff, worldprintentry, worldprintkey},
|
||||
{"eda", "Gyldendal English->Danish",
|
||||
"dict/world/gylddata/sandic29.dat",
|
||||
"dict/world/gylddata/edaindex",
|
||||
worldnextoff, worldprintentry, worldprintkey},
|
||||
|
||||
{"due", "Wolters-Noordhoff Dutch->English",
|
||||
"dict/world/woltdata/sandic07.dat",
|
||||
"dict/world/woltdata/deindex",
|
||||
worldnextoff, worldprintentry, worldprintkey},
|
||||
{"edu", "Wolters-Noordhoff English->Dutch",
|
||||
"dict/world/woltdata/sandic06.dat",
|
||||
"dict/world/woltdata/edindex",
|
||||
worldnextoff, worldprintentry, worldprintkey},
|
||||
|
||||
{"fie", "WSOY Finnish->English",
|
||||
"dict/world/werndata/sandic32.dat",
|
||||
"dict/world/werndata/fieindex",
|
||||
worldnextoff, worldprintentry, worldprintkey},
|
||||
{"efi", "WSOY English->Finnish",
|
||||
"dict/world/werndata/sandic31.dat",
|
||||
"dict/world/werndata/efiindex",
|
||||
worldnextoff, worldprintentry, worldprintkey},
|
||||
|
||||
{"fe", "Collins French->English",
|
||||
"dict/fe", "dict/feindex",
|
||||
pcollnextoff, pcollprintentry, pcollprintkey},
|
||||
{"ef", "Collins English->French",
|
||||
"dict/ef", "dict/efindex",
|
||||
pcollnextoff, pcollprintentry, pcollprintkey},
|
||||
|
||||
{"ge", "Collins German->English",
|
||||
"dict/ge", "dict/geindex",
|
||||
pcollgnextoff, pcollgprintentry, pcollgprintkey},
|
||||
{"eg", "Collins English->German",
|
||||
"dict/eg", "dict/egindex",
|
||||
pcollgnextoff, pcollgprintentry, pcollgprintkey},
|
||||
|
||||
{"ie", "Collins Italian->English",
|
||||
"dict/ie", "dict/ieindex",
|
||||
pcollnextoff, pcollprintentry, pcollprintkey},
|
||||
{"ei", "Collins English->Italian",
|
||||
"dict/ei", "dict/eiindex",
|
||||
pcollnextoff, pcollprintentry, pcollprintkey},
|
||||
|
||||
{"je", "Sanshusha Japanese->English",
|
||||
"dict/world/sansdata/sandic18.dat",
|
||||
"dict/world/sansdata/jeindex",
|
||||
worldnextoff, worldprintentry, worldprintkey},
|
||||
{"jek", "Sanshusha Japanese->English (Kanji index)",
|
||||
"dict/world/sansdata/sandic18.dat",
|
||||
"dict/world/sansdata/jekindex",
|
||||
worldnextoff, worldprintentry, worldprintkey},
|
||||
{"ej", "Sanshusha English->Japanese",
|
||||
"dict/world/sansdata/sandic18.dat",
|
||||
"dict/world/sansdata/ejindex",
|
||||
worldnextoff, worldprintentry, worldprintkey},
|
||||
|
||||
{"tjeg", "Sanshusha technical Japanese->English,German",
|
||||
"dict/world/sansdata/sandic16.dat",
|
||||
"dict/world/sansdata/tjegindex",
|
||||
worldnextoff, worldprintentry, worldprintkey},
|
||||
{"tjegk", "Sanshusha technical Japanese->English,German (Kanji index)",
|
||||
"dict/world/sansdata/sandic16.dat",
|
||||
"dict/world/sansdata/tjegkindex",
|
||||
worldnextoff, worldprintentry, worldprintkey},
|
||||
{"tegj", "Sanshusha technical English->German,Japanese",
|
||||
"dict/world/sansdata/sandic16.dat",
|
||||
"dict/world/sansdata/tegjindex",
|
||||
worldnextoff, worldprintentry, worldprintkey},
|
||||
{"tgje", "Sanshusha technical German->Japanese,English",
|
||||
"dict/world/sansdata/sandic16.dat",
|
||||
"dict/world/sansdata/tgjeindex",
|
||||
worldnextoff, worldprintentry, worldprintkey},
|
||||
|
||||
{"ne", "Kunnskapforlaget Norwegian->English",
|
||||
"dict/world/kunndata/sandic28.dat",
|
||||
"dict/world/kunndata/neindex",
|
||||
worldnextoff, worldprintentry, worldprintkey},
|
||||
{"en", "Kunnskapforlaget English->Norwegian",
|
||||
"dict/world/kunndata/sandic27.dat",
|
||||
"dict/world/kunndata/enindex",
|
||||
worldnextoff, worldprintentry, worldprintkey},
|
||||
|
||||
{"re", "Leon Ungier Russian->English",
|
||||
"dict/re", "dict/reindex",
|
||||
simplenextoff, simpleprintentry, simpleprintkey},
|
||||
{"er", "Leon Ungier English->Russian",
|
||||
"dict/re", "dict/erindex",
|
||||
simplenextoff, simpleprintentry, simpleprintkey},
|
||||
|
||||
{"se", "Collins Spanish->English",
|
||||
"dict/se", "dict/seindex",
|
||||
pcollnextoff, pcollprintentry, pcollprintkey},
|
||||
{"es", "Collins English->Spanish",
|
||||
"dict/es", "dict/esindex",
|
||||
pcollnextoff, pcollprintentry, pcollprintkey},
|
||||
|
||||
{"swe", "Esselte Studium Swedish->English",
|
||||
"dict/world/essedata/sandic34.dat",
|
||||
"dict/world/essedata/sweindex",
|
||||
worldnextoff, worldprintentry, worldprintkey},
|
||||
{"esw", "Esselte Studium English->Swedish",
|
||||
"dict/world/essedata/sandic33.dat",
|
||||
"dict/world/essedata/eswindex",
|
||||
worldnextoff, worldprintentry, worldprintkey},
|
||||
|
||||
{"movie", "Movies -- by title",
|
||||
"movie/data", "dict/movtindex",
|
||||
movienextoff, movieprintentry, movieprintkey},
|
||||
{"moviea", "Movies -- by actor",
|
||||
"movie/data", "dict/movaindex",
|
||||
movienextoff, movieprintentry, movieprintkey},
|
||||
{"movied", "Movies -- by director",
|
||||
"movie/data", "dict/movdindex",
|
||||
movienextoff, movieprintentry, movieprintkey},
|
||||
|
||||
{"slang", "English Slang",
|
||||
"dict/slang", "dict/slangindex",
|
||||
slangnextoff, slangprintentry, slangprintkey},
|
||||
|
||||
{"robert", "Robert Électronique",
|
||||
"dict/robert/_pointers", "dict/robert/_index",
|
||||
robertnextoff, robertindexentry, robertprintkey},
|
||||
{"robertv", "Robert Électronique - formes des verbes",
|
||||
"dict/robert/flex.rob", "dict/robert/_flexindex",
|
||||
robertnextflex, robertflexentry, robertprintkey},
|
||||
|
||||
{0, 0, 0, 0, 0}
|
||||
};
|
||||
|
||||
typedef struct Lig Lig;
|
||||
struct Lig {
|
||||
Rune start; /* accent rune */
|
||||
Rune pairs[100]; /* <char,accented version> pairs */
|
||||
};
|
||||
|
||||
static Lig ligtab[Nligs] = {
|
||||
[LACU-LIGS] {0xb4, {0x41, 0xc1, 0x61, 0xe1, 0x43, 0x106, 0x63, 0x107, 0x45, 0xc9, 0x65, 0xe9, 0x67, 0x123, 0x49, 0xcd, 0x69, 0xed, 0x131, 0xed, 0x4c, 0x139, 0x6c, 0x13a, 0x4e, 0x143, 0x6e, 0x144, 0x4f, 0xd3, 0x6f, 0xf3, 0x52, 0x154, 0x72, 0x155, 0x53, 0x15a, 0x73, 0x15b, 0x55, 0xda, 0x75, 0xfa, 0x59, 0xdd, 0x79, 0xfd, 0x5a, 0x179, 0x7a, 0x17a, 0}},
|
||||
[LGRV-LIGS] {0x2cb, {0x41, 0xc0, 0x61, 0xe0, 0x45, 0xc8, 0x65, 0xe8, 0x49, 0xcc, 0x69, 0xec, 0x131, 0xec, 0x4f, 0xd2, 0x6f, 0xf2, 0x55, 0xd9, 0x75, 0xf9, 0}},
|
||||
[LUML-LIGS] {0xa8, {0x41, 0xc4, 0x61, 0xe4, 0x45, 0xcb, 0x65, 0xeb, 0x49, 0xcf, 0x69, 0xef, 0x4f, 0xd6, 0x6f, 0xf6, 0x55, 0xdc, 0x75, 0xfc, 0x59, 0x178, 0x79, 0xff, 0}},
|
||||
[LCED-LIGS] {0xb8, {0x43, 0xc7, 0x63, 0xe7, 0x47, 0x122, 0x4b, 0x136, 0x6b, 0x137, 0x4c, 0x13b, 0x6c, 0x13c, 0x4e, 0x145, 0x6e, 0x146, 0x52, 0x156, 0x72, 0x157, 0x53, 0x15e, 0x73, 0x15f, 0x54, 0x162, 0x74, 0x163, 0}},
|
||||
[LTIL-LIGS] {0x2dc, {0x41, 0xc3, 0x61, 0xe3, 0x49, 0x128, 0x69, 0x129, 0x131, 0x129, 0x4e, 0xd1, 0x6e, 0xf1, 0x4f, 0xd5, 0x6f, 0xf5, 0x55, 0x168, 0x75, 0x169, 0}},
|
||||
[LBRV-LIGS] {0x2d8, {0x41, 0x102, 0x61, 0x103, 0x45, 0x114, 0x65, 0x115, 0x47, 0x11e, 0x67, 0x11f, 0x49, 0x12c, 0x69, 0x12d, 0x131, 0x12d, 0x4f, 0x14e, 0x6f, 0x14f, 0x55, 0x16c, 0x75, 0x16d, 0}},
|
||||
[LRNG-LIGS] {0x2da, {0x41, 0xc5, 0x61, 0xe5, 0x55, 0x16e, 0x75, 0x16f, 0}},
|
||||
[LDOT-LIGS] {0x2d9, {0x43, 0x10a, 0x63, 0x10b, 0x45, 0x116, 0x65, 0x117, 0x47, 0x120, 0x67, 0x121, 0x49, 0x130, 0x4c, 0x13f, 0x6c, 0x140, 0x5a, 0x17b, 0x7a, 0x17c, 0}},
|
||||
[LDTB-LIGS] {0x2e, {0}},
|
||||
[LFRN-LIGS] {0x2322, {0x41, 0xc2, 0x61, 0xe2, 0x43, 0x108, 0x63, 0x109, 0x45, 0xca, 0x65, 0xea, 0x47, 0x11c, 0x67, 0x11d, 0x48, 0x124, 0x68, 0x125, 0x49, 0xce, 0x69, 0xee, 0x131, 0xee, 0x4a, 0x134, 0x6a, 0x135, 0x4f, 0xd4, 0x6f, 0xf4, 0x53, 0x15c, 0x73, 0x15d, 0x55, 0xdb, 0x75, 0xfb, 0x57, 0x174, 0x77, 0x175, 0x59, 0x176, 0x79, 0x177, 0}},
|
||||
[LFRB-LIGS] {0x32f, {0}},
|
||||
[LOGO-LIGS] {0x2db, {0x41, 0x104, 0x61, 0x105, 0x45, 0x118, 0x65, 0x119, 0x49, 0x12e, 0x69, 0x12f, 0x131, 0x12f, 0x55, 0x172, 0x75, 0x173, 0}},
|
||||
[LMAC-LIGS] {0xaf, {0x41, 0x100, 0x61, 0x101, 0x45, 0x112, 0x65, 0x113, 0x49, 0x12a, 0x69, 0x12b, 0x131, 0x12b, 0x4f, 0x14c, 0x6f, 0x14d, 0x55, 0x16a, 0x75, 0x16b, 0}},
|
||||
[LHCK-LIGS] {0x2c7, {0x43, 0x10c, 0x63, 0x10d, 0x44, 0x10e, 0x64, 0x10f, 0x45, 0x11a, 0x65, 0x11b, 0x4c, 0x13d, 0x6c, 0x13e, 0x4e, 0x147, 0x6e, 0x148, 0x52, 0x158, 0x72, 0x159, 0x53, 0x160, 0x73, 0x161, 0x54, 0x164, 0x74, 0x165, 0x5a, 0x17d, 0x7a, 0x17e, 0}},
|
||||
[LASP-LIGS] {0x2bd, {0}},
|
||||
[LLEN-LIGS] {0x2bc, {0}},
|
||||
[LBRB-LIGS] {0x32e, {0}}
|
||||
};
|
||||
|
||||
Rune multitab[Nmulti][5] = {
|
||||
[MAAS-MULTI] {0x2bd, 0x3b1, 0},
|
||||
[MALN-MULTI] {0x2bc, 0x3b1, 0},
|
||||
[MAND-MULTI] {0x61, 0x6e, 0x64, 0},
|
||||
[MAOQ-MULTI] {0x61, 0x2f, 0x71, 0},
|
||||
[MBRA-MULTI] {0x3c, 0x7c, 0},
|
||||
[MDD-MULTI] {0x2e, 0x2e, 0},
|
||||
[MDDD-MULTI] {0x2e, 0x2e, 0x2e, 0},
|
||||
[MEAS-MULTI] {0x2bd, 0x3b5, 0},
|
||||
[MELN-MULTI] {0x2bc, 0x3b5, 0},
|
||||
[MEMM-MULTI] {0x2014, 0x2014, 0},
|
||||
[MHAS-MULTI] {0x2bd, 0x3b7, 0},
|
||||
[MHLN-MULTI] {0x2bc, 0x3b7, 0},
|
||||
[MIAS-MULTI] {0x2bd, 0x3b9, 0},
|
||||
[MILN-MULTI] {0x2bc, 0x3b9, 0},
|
||||
[MLCT-MULTI] {0x63, 0x74, 0},
|
||||
[MLFF-MULTI] {0x66, 0x66, 0},
|
||||
[MLFFI-MULTI] {0x66, 0x66, 0x69, 0},
|
||||
[MLFFL-MULTI] {0x66, 0x66, 0x6c, 0},
|
||||
[MLFL-MULTI] {0x66, 0x6c, 0},
|
||||
[MLFI-MULTI] {0x66, 0x69, 0},
|
||||
[MLLS-MULTI] {0x26b, 0x26b, 0},
|
||||
[MLST-MULTI] {0x73, 0x74, 0},
|
||||
[MOAS-MULTI] {0x2bd, 0x3bf, 0},
|
||||
[MOLN-MULTI] {0x2bc, 0x3bf, 0},
|
||||
[MOR-MULTI] {0x6f, 0x72, 0},
|
||||
[MRAS-MULTI] {0x2bd, 0x3c1, 0},
|
||||
[MRLN-MULTI] {0x2bc, 0x3c1, 0},
|
||||
[MTT-MULTI] {0x7e, 0x7e, 0},
|
||||
[MUAS-MULTI] {0x2bd, 0x3c5, 0},
|
||||
[MULN-MULTI] {0x2bc, 0x3c5, 0},
|
||||
[MWAS-MULTI] {0x2bd, 0x3c9, 0},
|
||||
[MWLN-MULTI] {0x2bc, 0x3c9, 0},
|
||||
[MOE-MULTI] {0x6f, 0x65, 0},
|
||||
[MES-MULTI] {0x20, 0x20, 0},
|
||||
};
|
||||
|
||||
#define risupper(r) (0x41 <= (r) && (r) <= 0x5a)
|
||||
#define rislatin1(r) (0xC0 <= (r) && (r) <= 0xFF)
|
||||
#define rtolower(r) ((r)-'A'+'a')
|
||||
|
||||
static Rune latin_fold_tab[] =
|
||||
{
|
||||
/* Table to fold latin 1 characters to ASCII equivalents
|
||||
based at Rune value 0xc0
|
||||
|
||||
À Á Â Ã Ä Å Æ Ç
|
||||
È É Ê Ë Ì Í Î Ï
|
||||
Ð Ñ Ò Ó Ô Õ Ö ×
|
||||
Ø Ù Ú Û Ü Ý Þ ß
|
||||
à á â ã ä å æ ç
|
||||
è é ê ë ì í î ï
|
||||
ð ñ ò ó ô õ ö ÷
|
||||
ø ù ú û ü ý þ ÿ
|
||||
*/
|
||||
'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
|
||||
'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
|
||||
'd', 'n', 'o', 'o', 'o', 'o', 'o', 0 ,
|
||||
'o', 'u', 'u', 'u', 'u', 'y', 0 , 0 ,
|
||||
'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
|
||||
'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
|
||||
'd', 'n', 'o', 'o', 'o', 'o', 'o', 0 ,
|
||||
'o', 'u', 'u', 'u', 'u', 'y', 0 , 'y',
|
||||
};
|
||||
|
||||
static Rune *ttabstack[20];
|
||||
static int ntt;
|
||||
|
||||
/*
|
||||
* tab is an array of n Assoc's, sorted by key.
|
||||
* Look for key in tab, and return corresponding val
|
||||
* or -1 if not there
|
||||
*/
|
||||
long
|
||||
lookassoc(Assoc *tab, int n, char *key)
|
||||
{
|
||||
Assoc *q;
|
||||
long i, low, high;
|
||||
int r;
|
||||
|
||||
for(low = -1, high = n; high > low+1; ){
|
||||
i = (high+low)/2;
|
||||
q = &tab[i];
|
||||
if((r=strcmp(key, q->key))<0)
|
||||
high = i;
|
||||
else if(r == 0)
|
||||
return q->val;
|
||||
else
|
||||
low=i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
long
|
||||
looknassoc(Nassoc *tab, int n, long key)
|
||||
{
|
||||
Nassoc *q;
|
||||
long i, low, high;
|
||||
|
||||
for(low = -1, high = n; high > low+1; ){
|
||||
i = (high+low)/2;
|
||||
q = &tab[i];
|
||||
if(key < q->key)
|
||||
high = i;
|
||||
else if(key == q->key)
|
||||
return q->val;
|
||||
else
|
||||
low=i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
void
|
||||
err(char *fmt, ...)
|
||||
{
|
||||
char buf[1000];
|
||||
va_list v;
|
||||
|
||||
va_start(v, fmt);
|
||||
vsnprint(buf, sizeof(buf), fmt, v);
|
||||
va_end(v);
|
||||
fprint(2, "%s: %s\n", argv0, buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* Write the rune r to bout, keeping track of line length
|
||||
* and breaking the lines (at blanks) when they get too long
|
||||
*/
|
||||
void
|
||||
outrune(long r)
|
||||
{
|
||||
if(outinhibit)
|
||||
return;
|
||||
if(++linelen > breaklen && r == 0x20) {
|
||||
Bputc(bout, '\n');
|
||||
linelen = 0;
|
||||
} else
|
||||
Bputrune(bout, r);
|
||||
}
|
||||
|
||||
void
|
||||
outrunes(Rune *rp)
|
||||
{
|
||||
Rune r;
|
||||
|
||||
while((r = *rp++) != 0)
|
||||
outrune(r);
|
||||
}
|
||||
|
||||
/* like outrune, but when arg is know to be a char */
|
||||
void
|
||||
outchar(int c)
|
||||
{
|
||||
if(outinhibit)
|
||||
return;
|
||||
if(++linelen > breaklen && c == ' ') {
|
||||
c ='\n';
|
||||
linelen = 0;
|
||||
}
|
||||
Bputc(bout, c);
|
||||
}
|
||||
|
||||
/*
 * Pass each character of the NUL-terminated string s to outchar.
 */
void
outchars(char *s)
{
	for(; *s != 0; s++)
		outchar(*s);
}
|
||||
|
||||
/*
 * Format a message and send the result through outchars.  The
 * formatted text is limited to the size of the local buffer.
 */
void
outprint(char *fmt, ...)
{
	va_list args;
	char text[1000];

	va_start(args, fmt);
	vsnprint(text, sizeof text, fmt, args);
	va_end(args);
	outchars(text);
}
|
||||
|
||||
/*
 * Output the bytes in [b, e): newlines become blanks and a blank
 * that directly follows another blank is dropped.
 */
void
outpiece(char *b, char *e)
{
	int ch, prev;

	for(prev = 0; b < e; prev = ch){
		ch = *b++;
		if(ch == '\n')
			ch = ' ';
		if(ch == ' ' && prev == ' ')
			continue;
		outchar(ch);
	}
}
|
||||
|
||||
/*
|
||||
* Go to new line if not already there; indent if ind != 0.
|
||||
* If ind > 1, leave a blank line too.
|
||||
* Slight hack: assume if current line is only one or two
|
||||
* characters long, then they were spaces.
|
||||
*/
|
||||
void
|
||||
outnl(int ind)
|
||||
{
|
||||
if(outinhibit)
|
||||
return;
|
||||
if(ind) {
|
||||
if(ind > 1) {
|
||||
if(linelen > 2)
|
||||
Bputc(bout, '\n');
|
||||
Bprint(bout, "\n ");
|
||||
} else if(linelen == 0)
|
||||
Bprint(bout, " ");
|
||||
else if(linelen == 1)
|
||||
Bputc(bout, ' ');
|
||||
else if(linelen != 2)
|
||||
Bprint(bout, "\n ");
|
||||
linelen = 2;
|
||||
} else {
|
||||
if(linelen) {
|
||||
Bputc(bout, '\n');
|
||||
linelen = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Fold the runes in null-terminated rp.
|
||||
* Use the sort(1) definition of folding (uppercase to lowercase,
|
||||
* latin1-accented characters to corresponding unaccented chars)
|
||||
*/
|
||||
void
|
||||
fold(Rune *rp)
|
||||
{
|
||||
Rune r;
|
||||
|
||||
while((r = *rp) != 0) {
|
||||
if (rislatin1(r) && latin_fold_tab[r-0xc0])
|
||||
r = latin_fold_tab[r-0xc0];
|
||||
if(risupper(r))
|
||||
r = rtolower(r);
|
||||
*rp++ = r;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Like fold, but put folded result into new
|
||||
* (assumed to have enough space).
|
||||
* old is a regular expression, but we know that
|
||||
* metacharacters aren't affected
|
||||
*/
|
||||
void
|
||||
foldre(char *new, char *old)
|
||||
{
|
||||
Rune r;
|
||||
|
||||
while(*old) {
|
||||
old += chartorune(&r, old);
|
||||
if (rislatin1(r) && latin_fold_tab[r-0xc0])
|
||||
r = latin_fold_tab[r-0xc0];
|
||||
if(risupper(r))
|
||||
r = rtolower(r);
|
||||
new += runetochar(new, &r);
|
||||
}
|
||||
*new = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* acomp(s, t) returns:
|
||||
* -2 if s strictly precedes t
|
||||
* -1 if s is a prefix of t
|
||||
* 0 if s is the same as t
|
||||
* 1 if t is a prefix of s
|
||||
* 2 if t strictly precedes s
|
||||
*/
|
||||
|
||||
int
|
||||
acomp(Rune *s, Rune *t)
|
||||
{
|
||||
int cs, ct;
|
||||
|
||||
for(;;) {
|
||||
cs = *s;
|
||||
ct = *t;
|
||||
if(cs != ct)
|
||||
break;
|
||||
if(cs == 0)
|
||||
return 0;
|
||||
s++;
|
||||
t++;
|
||||
}
|
||||
if(cs == 0)
|
||||
return -1;
|
||||
if(ct == 0)
|
||||
return 1;
|
||||
if(cs < ct)
|
||||
return -2;
|
||||
return 2;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy null terminated Runes from 'from' to 'to'.
|
||||
*/
|
||||
void
|
||||
runescpy(Rune *to, Rune *from)
|
||||
{
|
||||
while((*to++ = *from++) != 0)
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Conversion of unsigned number to long, no overflow detection
|
||||
*/
|
||||
long
|
||||
runetol(Rune *r)
|
||||
{
|
||||
int c;
|
||||
long n;
|
||||
|
||||
n = 0;
|
||||
for(;; r++){
|
||||
c = *r;
|
||||
if(0x30<=c && c<=0x39)
|
||||
c -= '0';
|
||||
else
|
||||
break;
|
||||
n = n*10 + c;
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
/*
|
||||
* See if there is a rune corresponding to the accented
|
||||
* version of r with accent acc (acc in [LIGS..LIGE-1]),
|
||||
* and return it if so, else return NONE.
|
||||
*/
|
||||
Rune
|
||||
liglookup(Rune acc, Rune r)
|
||||
{
|
||||
Rune *p;
|
||||
|
||||
if(acc < LIGS || acc >= LIGE)
|
||||
return NONE;
|
||||
for(p = ligtab[acc-LIGS].pairs; *p; p += 2)
|
||||
if(*p == r)
|
||||
return *(p+1);
|
||||
return NONE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Maintain a translation table stack (a translation table
|
||||
* is an array of Runes indexed by bytes or 7-bit bytes).
|
||||
* If starting is true, push the curtab onto the stack
|
||||
* and return newtab; else pop the top of the stack and
|
||||
* return it.
|
||||
* If curtab is 0, initialize the stack and return.
|
||||
*/
|
||||
Rune *
|
||||
changett(Rune *curtab, Rune *newtab, int starting)
|
||||
{
|
||||
if(curtab == 0) {
|
||||
ntt = 0;
|
||||
return 0;
|
||||
}
|
||||
if(starting) {
|
||||
if(ntt >= asize(ttabstack)) {
|
||||
if(debug)
|
||||
err("translation stack overflow");
|
||||
return curtab;
|
||||
}
|
||||
ttabstack[ntt++] = curtab;
|
||||
return newtab;
|
||||
} else {
|
||||
if(ntt == 0) {
|
||||
if(debug)
|
||||
err("translation stack underflow");
|
||||
return curtab;
|
||||
}
|
||||
return ttabstack[--ntt];
|
||||
}
|
||||
}
|
184
src/cmd/dict/world.c
Normal file
184
src/cmd/dict/world.c
Normal file
|
@ -0,0 +1,184 @@
|
|||
#include <u.h>
|
||||
#include <libc.h>
|
||||
#include <bio.h>
|
||||
#include "dict.h"
|
||||
#include "kuten.h"
|
||||
|
||||
/*
|
||||
* Routines for handling dictionaries in the "Languages of the World"
|
||||
* format. worldnextoff *must* be called with <address of valid entry>+1.
|
||||
*/
|
||||
|
||||
#define GSHORT(p) (((p)[0]<<8)|(p)[1])
|
||||
|
||||
static void putchar(int, int*);
|
||||
|
||||
#define NONE 0xffff
|
||||
|
||||
/* adapted from jhelling@cs.ruu.nl (Jeroen Hellingman) */
|
||||
|
||||
static Rune chartab[] = {
|
||||
|
||||
/*00*/ NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
|
||||
NONE, NONE, L'\n', 0xe6, 0xf8, 0xe5, 0xe4, 0xf6,
|
||||
/*10*/ NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
|
||||
NONE, NONE, NONE, 0xc6, 0xd8, 0xc5, 0xc4, 0xd6,
|
||||
|
||||
/*20*/ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, '\'',
|
||||
0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
|
||||
/*30*/ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
|
||||
0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
|
||||
/*40*/ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
|
||||
0x48, 0x49, 0x4a, 0x4b, L'L', 0x4d, 0x4e, 0x4f,
|
||||
/*50*/ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
|
||||
0x58, 0x59, 0x5a, 0x5b, L'\\', 0x5d, 0x5e, 0x5f,
|
||||
/*60*/ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
|
||||
0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
|
||||
/*70*/ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
|
||||
0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, NONE,
|
||||
|
||||
/*80*/ 0xc7, 0xfc, 0xe9, 0xe2, 0xe4, 0xe0, 0xe5, 0xe7,
|
||||
0xea, 0xeb, 0xe8, 0xef, 0xee, 0xec, 0xc4, 0xc5,
|
||||
/*90*/ 0xc9, 0xe6, 0xc6, 0xf4, 0xf6, 0xf2, 0xfb, 0xf9,
|
||||
0xff, 0xd6, 0xdc, 0xa2, 0xa3, 0xa5, 0x20a7, 0x283,
|
||||
/*a0*/ 0xe1, 0xed, 0xf3, 0xfa, 0xf1, 0xd1, 0xaa, 0xba,
|
||||
0xbf, 0x2310, 0xac, 0xbd, 0xbc, 0xa1, 0xab, 0xbb,
|
||||
|
||||
/*b0*/ 0x254, 0x259, 0xf0, 0x283, 0x292, 0x14b, 0x251, 0x7a,
|
||||
0x26a, 0xf0, 0x292, 0xe3, 0x153, 0x169, 0x28c, 0x265,
|
||||
/*c0*/ 0x280, 0xeb, 0x6c, 0x28c, 0xf5, 0xf1, 0x152, NONE,
|
||||
NONE, 0x53, 0x73, 0x5a, 0x7a, NONE, NONE, NONE,
|
||||
/*d0*/ 0xdf, NONE, NONE, 0x101, 0x12b, 0x16b, 0x113, 0x14d,
|
||||
NONE, NONE, NONE, 0x20, NONE, NONE, NONE, NONE,
|
||||
|
||||
/*e0*/ 0x3b1, 0x3b2, 0x3b3, 0x3c0, 0x3a3, 0x3c3, 0xb5, 0x3c4,
|
||||
0x3a6, 0x398, 0x3a9, 0x3b4, 0x221e, 0xd8, 0x3b5, 0x2229,
|
||||
/*f0*/ 0x2261, 0xb1, 0x2265, 0x2264, 0x2320, 0x2321, 0xf7, 0x2248,
|
||||
0xb0, 0x2219, 0xb7, NONE, NONE, NONE, NONE, NONE,
|
||||
};
|
||||
|
||||
enum{ Utf, Kanahi, Kanalo=Kanahi+1, GBhi, GBlo=GBhi+1, };
|
||||
|
||||
void
|
||||
worldprintentry(Entry e, int cmd)
|
||||
{
|
||||
int nh, state[3];
|
||||
uchar *p, *pe;
|
||||
|
||||
p = (uchar *)e.start;
|
||||
pe = (uchar *)e.end;
|
||||
nh = GSHORT(p);
|
||||
p += 6;
|
||||
if(cmd == 'h')
|
||||
pe = p+nh;
|
||||
state[0] = Utf;
|
||||
state[1] = 0;
|
||||
state[2] = 0;
|
||||
while(p < pe){
|
||||
if(cmd == 'r')
|
||||
outchar(*p++);
|
||||
else
|
||||
putchar(*p++, state);
|
||||
}
|
||||
outnl(0);
|
||||
}
|
||||
|
||||
long
|
||||
worldnextoff(long fromoff)
|
||||
{
|
||||
int nh, np, nd;
|
||||
uchar buf[6];
|
||||
|
||||
if(Bseek(bdict, fromoff-1, 0) < 0)
|
||||
return -1;
|
||||
if(Bread(bdict, buf, 6) != 6)
|
||||
return -1;
|
||||
nh = GSHORT(buf);
|
||||
np = GSHORT(buf+2);
|
||||
nd = GSHORT(buf+4);
|
||||
return fromoff-1 + 6 + nh + np + nd;
|
||||
}
|
||||
|
||||
static void
|
||||
putchar(int c, int *state)
|
||||
{
|
||||
int xflag = 0;
|
||||
Rune r;
|
||||
int hi, lo;
|
||||
|
||||
switch(state[0]){
|
||||
case Kanahi:
|
||||
case GBhi:
|
||||
if(CANS2JH(c) || c == 0xff){
|
||||
state[0]++;
|
||||
state[1] = c;
|
||||
break;
|
||||
}
|
||||
/* fall through */
|
||||
case Utf:
|
||||
if(c == 0xfe){
|
||||
state[0] = Kanahi;
|
||||
break;
|
||||
}else if(c == 0xff){
|
||||
state[0] = GBhi;
|
||||
break;
|
||||
}
|
||||
r = chartab[c];
|
||||
if(r < 0x80 && state[2] == 0)
|
||||
outchar(r);
|
||||
else if(r == NONE){
|
||||
switch(c){
|
||||
case 0xfb:
|
||||
if(!xflag){
|
||||
state[2] = 1;
|
||||
break;
|
||||
}
|
||||
case 0xfc:
|
||||
if(!xflag){
|
||||
state[2] = 0;
|
||||
break;
|
||||
}
|
||||
case 0x10:
|
||||
case 0xc7: case 0xc8:
|
||||
case 0xd8: case 0xd9: case 0xda:
|
||||
case 0xdc: case 0xdd: case 0xde: case 0xdf:
|
||||
case 0xfd:
|
||||
if(!xflag)
|
||||
break;
|
||||
/* fall through */
|
||||
default:
|
||||
outprint("\\%.2ux", c);
|
||||
}
|
||||
}else if(state[2] == 0)
|
||||
outrune(r);
|
||||
break;
|
||||
case Kanalo:
|
||||
case GBlo:
|
||||
if(state[1] == 0xff && c == 0xff){
|
||||
state[0] = Utf;
|
||||
break;
|
||||
}
|
||||
state[0]--;
|
||||
hi = state[1];
|
||||
lo = c;
|
||||
S2J(hi, lo); /* convert to JIS */
|
||||
r = hi*100 + lo - 3232; /* convert to jis208 */
|
||||
if(state[0] == Kanahi && r < JIS208MAX)
|
||||
r = tabjis208[r];
|
||||
else if(state[0] == GBhi && r < GB2312MAX)
|
||||
r = tabgb2312[r];
|
||||
else
|
||||
r = NONE;
|
||||
if(r == NONE)
|
||||
outprint("\\%.2ux\\%.2ux", state[1], c);
|
||||
else
|
||||
outrune(r);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
worldprintkey(void)
|
||||
{
|
||||
Bprint(bout, "No pronunciation key.\n");
|
||||
}
|
Loading…
Reference in a new issue