plan9port/src/cmd/spell/sprog.c

1385 lines
23 KiB
C
Raw Normal View History

2004-03-09 12:45:12 +00:00
#include <u.h>
#include <libc.h>
#include <bio.h>
#include <ctype.h>
#include "code.h"
/* fig leaves for possibly signed char quantities */
#define ISUPPER(c) isupper((c)&0xff)
#define ISLOWER(c) islower((c)&0xff)
#define ISALPHA(c) isalpha((c)&0xff)
#define ISDIGIT(c) isdigit((c)&0xff)
#define ISVOWEL(c) voweltab[(c)&0xff]
#define Tolower(c) (ISUPPER(c)? (c)-'A'+'a': (c))
#define pair(a,b) (((a)<<8) | (b))
#define DLEV 2
#define DSIZ 40
typedef long Bits;
#define Set(h, f) ((long)(h) & (f))
Bits nop(char*, char*, char*, int, int);
Bits strip(char*, char*, char*, int, int);
Bits ize(char*, char*, char*, int, int);
Bits i_to_y(char*, char*, char*, int, int);
Bits ily(char*, char*, char*, int, int);
Bits subst(char*, char*, char*, int, int);
Bits CCe(char*, char*, char*, int, int);
Bits tion(char*, char*, char*, int, int);
Bits an(char*, char*, char*, int, int);
Bits s(char*, char*, char*, int, int);
Bits es(char*, char*, char*, int, int);
Bits bility(char*, char*, char*, int, int);
Bits y_to_e(char*, char*, char*, int, int);
Bits VCe(char*, char*, char*, int, int);
Bits trypref(char*, char*, int, int);
Bits tryword(char*, char*, int, int);
Bits trysuff(char*, int, int);
Bits dict(char*, char*);
void typeprint(Bits);
void pcomma(char*);
void ise(void);
int ordinal(void);
char* skipv(char*);
int inun(char*, Bits);
char* ztos(char*);
void readdict(char*);
typedef struct Ptab Ptab;
struct Ptab
{
char* s;
int flag;
};
typedef struct Suftab Suftab;
struct Suftab
{
char *suf;
Bits (*p1)(char*, char*, char*, int, int);
int n1;
char *d1;
char *a1;
int flag;
int affixable;
Bits (*p2)(char*, char*, char*, int, int);
int n2;
char *d2;
char *a2;
};
Suftab staba[] = {
{"aibohp",subst,1,"-e+ia","",NOUN, NOUN},
0
};
Suftab stabc[] =
{
{"cai",strip,1,"","+c",N_AFFIX, ADJ|NOUN},
{"citsi",strip,2,"","+ic",N_AFFIX, ADJ | N_AFFIX | NOUN},
{"citi",ize,1,"-e+ic","",N_AFFIX, ADJ },
{"cihparg",i_to_y,1,"-y+ic","",NOUN, ADJ|NOUN },
{"cipocs",ize,1,"-e+ic","",NOUN, ADJ },
{"cirtem",i_to_y,1,"-y+ic","",NOUN, ADJ },
{"cigol",i_to_y,1,"-y+ic","",NOUN, ADJ },
{"cimono",i_to_y,1,"-y+ic","",NOUN, ADJ },
{"cibohp",subst,1,"-e+ic","",NOUN, ADJ },
0
};
Suftab stabd[] =
{
{"de",strip,1,"","+d",ED,ADJ |COMP,i_to_y,2,"-y+ied","+ed"},
{"dooh",ily,4,"-y+ihood","+hood",NOUN | ADV, NOUN},
0
};
Suftab stabe[] =
{
/*
* V_affix for comment ->commence->commentment??
*/
{"ecna",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
{"ecne",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
{"elbaif",i_to_y,4,"-y+iable","",V_IRREG,ADJ},
{"elba",CCe,4,"-e+able","+able",V_AFFIX,ADJ},
{"evi",subst,0,"-ion+ive","",N_AFFIX | V_AFFIX,NOUN | N_AFFIX| ADJ},
{"ezi",CCe,3,"-e+ize","+ize",N_AFFIX|ADJ ,V_AFFIX | VERB |ION | COMP},
{"ekil",strip,4,"","+like",N_AFFIX ,ADJ},
0
};
Suftab stabg[] =
{
{"gniee",strip,3,"","+ing",V_IRREG ,ADJ|NOUN},
{"gnikam",strip,6,"","+making",NOUN,NOUN},
{"gnipeek",strip,7,"","+keeping",NOUN,NOUN},
{"gni",CCe,3,"-e+ing","+ing",V_IRREG ,ADJ|ED|NOUN},
0
};
Suftab stabl[] =
{
{"ladio",strip,2,"","+al",NOUN |ADJ,ADJ},
{"laci",strip,2,"","+al",NOUN |ADJ,ADJ |NOUN|N_AFFIX},
{"latnem",strip,2,"","+al",N_AFFIX,ADJ},
{"lanoi",strip,2,"","+al",N_AFFIX,ADJ|NOUN},
{"luf",ily,3,"-y+iful","+ful",N_AFFIX,ADJ | NOUN},
0
};
Suftab stabm[] =
{
/* congregational + ism */
{"msi",CCe,3,"-e+ism","ism",N_AFFIX|ADJ,NOUN},
{"margo",subst,-1,"-ph+m","",NOUN,NOUN},
0
};
Suftab stabn[] =
{
{"noitacifi",i_to_y,6,"-y+ication","",ION,NOUN | N_AFFIX},
{"noitazi",ize,4,"-e+ation","",ION,NOUN| N_AFFIX},
{"noit",tion,3,"-e+ion","+ion",ION,NOUN| N_AFFIX | V_AFFIX |VERB|ACTOR},
{"naino",an,3,"","+ian",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
{"namow",strip,5,"","+woman",MAN,PROP_COLLECT|N_AFFIX},
{"nam",strip,3,"","+man",MAN,PROP_COLLECT | N_AFFIX | VERB},
{"na",an,1,"","+n",NOUN|PROP_COLLECT,NOUN | N_AFFIX},
{"nemow",strip,5,"","+women",MAN,PROP_COLLECT},
{"nem",strip,3,"","+man",MAN,PROP_COLLECT},
{"nosrep",strip,6,"","+person",MAN,PROP_COLLECT},
0
};
Suftab stabp[] =
{
{"pihs",strip,4,"","+ship",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
0
};
Suftab stabr[] =
{
{"rehparg",subst,1,"-y+er","",ACTOR,NOUN,strip,2,"","+er"},
{"reyhparg",nop,0,"","",0,NOUN},
{"reyl",nop,0,"","",0,NOUN},
{"rekam",strip,5,"","+maker",NOUN,NOUN},
{"repeek",strip,6,"","+keeper",NOUN,NOUN},
{"re",strip,1,"","+r",ACTOR,NOUN | N_AFFIX|VERB|ADJ, i_to_y,2,"-y+ier","+er"},
{"rota",tion,2,"-e+or","",ION,NOUN| N_AFFIX|_Y},
{"rotc",tion,2,"","+or",ION,NOUN| N_AFFIX},
{"rotp",tion,2,"","+or",ION,NOUN| N_AFFIX},
0
};
Suftab stabs[] =
{
{"ssen",ily,4,"-y+iness","+ness",ADJ|ADV,NOUN| N_AFFIX},
{"ssel",ily,4,"-y+iless","+less",NOUN | PROP_COLLECT,ADJ },
{"se",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH , es,2,"-y+ies","+es"},
{"s'",s,2,"","+'s",PROP_COLLECT | NOUN,DONT_TOUCH },
{"s",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH },
0
};
Suftab stabt[] =
{
{"tnem",strip,4,"","+ment",V_AFFIX,NOUN | N_AFFIX | ADJ|VERB},
{"tse",strip,2,"","+st",EST,DONT_TOUCH, i_to_y,3,"-y+iest","+est" },
{"tsigol",i_to_y,2,"-y+ist","",N_AFFIX,NOUN | N_AFFIX},
{"tsi",CCe,3,"-e+ist","+ist",N_AFFIX|ADJ,NOUN | N_AFFIX|COMP},
0
};
Suftab staby[] =
{
{"ycna",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
{"ycne",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
{"ytilib",bility,5,"-le+ility","",ADJ | V_AFFIX,NOUN | N_AFFIX},
{"ytisuo",nop,0,"","",NOUN},
{"ytilb",nop,0,"","",0,NOUN},
{"yti",CCe,3,"-e+ity","+ity",ADJ ,NOUN | N_AFFIX },
{"ylb",y_to_e,1,"-e+y","",ADJ,ADV},
{"ylc",nop,0,"","",0},
{"ylelb",nop,0,"","",0},
{"ylelp",nop,0,"","",0},
{"yl",ily,2,"-y+ily","+ly",ADJ,ADV|COMP},
{"yrtem",subst,0,"-er+ry","",NOUN,NOUN | N_AFFIX},
{"y",CCe,1,"-e+y","+y",_Y,ADJ|COMP},
0
};
Suftab stabz[] =
{
0
};
Suftab* suftab[] =
{
staba,
stabz,
stabc,
stabd,
stabe,
stabz,
stabg,
stabz,
stabz,
stabz,
stabz,
stabl,
stabm,
stabn,
stabz,
stabp,
stabz,
stabr,
stabs,
stabt,
stabz,
stabz,
stabz,
stabz,
staby,
stabz,
};
Ptab ptaba[] =
{
"anti", 0,
"auto", 0,
0
};
Ptab ptabb[] =
{
"bio", 0,
0
};
Ptab ptabc[] =
{
"counter", 0,
0
};
Ptab ptabd[] =
{
"dis", 0,
0
};
Ptab ptabe[] =
{
"electro", 0,
0
};
Ptab ptabf[] =
{
"femto", 0,
0
};
Ptab ptabg[] =
{
"geo", 0,
"giga", 0,
0
};
Ptab ptabh[] =
{
"hyper", 0,
0
};
Ptab ptabi[] =
{
"immuno", 0,
"im", IN,
"intra", 0,
"inter", 0,
"in", IN,
"ir", IN,
"iso", 0,
0
};
Ptab ptabj[] =
{
0
};
Ptab ptabk[] =
{
"kilo", 0,
0
};
Ptab ptabl[] =
{
0
};
Ptab ptabm[] =
{
"magneto", 0,
"mega", 0,
"meta", 0,
"micro", 0,
"mid", 0,
"milli", 0,
"mini", 0,
"mis", 0,
"mono", 0,
"multi", 0,
0
};
Ptab ptabn[] =
{
"nano", 0,
"neuro", 0,
"non", 0,
0
};
Ptab ptabo[] =
{
"out", 0,
"over", 0,
0
};
Ptab ptabp[] =
{
"para", 0,
"photo", 0,
"pico", 0,
"poly", 0,
"pre", 0,
"pseudo", 0,
"psycho", 0,
0
};
Ptab ptabq[] =
{
"quasi", 0,
0
};
Ptab ptabr[] =
{
"radio", 0,
"re", 0,
0
};
Ptab ptabs[] =
{
"semi", 0,
"stereo", 0,
"sub", 0,
"super", 0,
0
};
Ptab ptabt[] =
{
"tele", 0,
"tera", 0,
"thermo", 0,
0
};
Ptab ptabu[] =
{
"ultra", 0,
"under", 0, /*must precede un*/
"un", IN,
0
};
Ptab ptabv[] =
{
0
};
Ptab ptabw[] =
{
0
};
Ptab ptabx[] =
{
0
};
Ptab ptaby[] =
{
0
};
Ptab ptabz[] =
{
0
};
Ptab* preftab[] =
{
ptaba,
ptabb,
ptabc,
ptabd,
ptabe,
ptabf,
ptabg,
ptabh,
ptabi,
ptabj,
ptabk,
ptabl,
ptabm,
ptabn,
ptabo,
ptabp,
ptabq,
ptabr,
ptabs,
ptabt,
ptabu,
ptabv,
ptabw,
ptabx,
ptaby,
ptabz,
};
typedef struct {
char *mesg;
enum { NONE, SUFF, PREF} type;
} Deriv;
int aflag;
int cflag;
int fflag;
int vflag;
int xflag;
int nflag;
char word[500];
char* original;
Deriv emptyderiv;
Deriv deriv[DSIZ+3];
char affix[DSIZ*10]; /* 10 is longest affix message */
int prefcount;
int suffcount;
char* acmeid;
char space[300000]; /* must be as large as "words"+"space" in pcode run */
Bits encode[2048]; /* must be as long as "codes" in pcode run */
int nencode;
char voweltab[256];
char* spacep[128*128+1]; /* pointer to words starting with 'xx' */
Biobuf bin;
Biobuf bout;
char* codefile = "#9/lib/amspell";
char* brfile = "#9/lib/brspell";
char* Usage = "usage";
void
main(int argc, char *argv[])
{
char *ep, *cp;
char *dp;
int j, i, c;
int low;
Bits h;
2004-03-25 23:03:57 +00:00
codefile = unsharp(codefile);
brfile = unsharp(brfile);
2004-03-09 12:45:12 +00:00
Binit(&bin, 0, OREAD);
Binit(&bout, 1, OWRITE);
for(i=0; c = "aeiouyAEIOUY"[i]; i++)
voweltab[c] = 1;
while(argc > 1) {
if(argv[1][0] != '-')
break;
for(i=1; c = argv[1][i]; i++)
switch(c) {
default:
fprint(2, "usage: spell [-bcCvx] [-f file]\n");
exits(Usage);
case 'a':
aflag++;
continue;
case 'b':
ise();
if(!fflag)
codefile = brfile;
continue;
case 'C': /* for "correct" */
vflag++;
case 'c': /* for ocr */
cflag++;
continue;
case 'v':
vflag++;
continue;
case 'x':
xflag++;
continue;
case 'f':
if(argc <= 2) {
fprint(2, "spell: -f requires another argument\n");
exits(Usage);
}
argv++;
argc--;
codefile = argv[1];
fflag++;
goto brk;
}
brk:
argv++;
argc--;
}
readdict(codefile);
if(argc > 1) {
fprint(2, "usage: spell [-bcCvx] [-f file]\n");
exits(Usage);
}
if(aflag)
cflag = vflag = 0;
for(;;) {
affix[0] = 0;
original = Brdline(&bin, '\n');
if(original == 0)
exits(0);
original[Blinelen(&bin)-1] = 0;
low = 0;
if(aflag) {
acmeid = original;
while(*original != ':')
if(*original++ == 0)
exits(0);
while(*++original != ':')
if(*original == 0)
exits(0);
*original++ = 0;
}
for(ep=word,dp=original; j = *dp; ep++,dp++) {
if(ISLOWER(j))
low++;
if(ep >= word+sizeof(word)-1)
break;
*ep = j;
}
*ep = 0;
if(ISDIGIT(word[0]) && ordinal())
continue;
h = 0;
if(!low && !(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH)))
for(cp=original+1,dp=word+1; dp<ep; dp++,cp++)
*dp = Tolower(*cp);
if(!h)
for(;;) { /* at most twice */
if(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH))
break;
if(h = trysuff(ep,0,ALL|STOP|DONT_TOUCH))
break;
if(!ISUPPER(word[0]))
break;
cp = original;
dp = word;
while(*dp = *cp++) {
if(!low)
*dp = Tolower(*dp);
dp++;
}
word[0] = Tolower(word[0]);
}
if(cflag) {
if(!h || Set(h,STOP))
print("-");
else if(!vflag)
print("+");
else
print("%c",'0' + (suffcount>0) +
(prefcount>4? 8: 2*prefcount));
} else if(!h || Set(h,STOP)) {
if(aflag)
Bprint(&bout, "%s:%s\n", acmeid, original);
else
Bprint(&bout, "%s\n", original);
} else if(affix[0] != 0 && affix[0] != '.')
print("%s\t%s\n", affix, original);
}
exits(0);
}
/* strip exactly one suffix and do
* indicated routine(s), which may recursively
* strip suffixes
*/
Bits
trysuff(char* ep, int lev, int flag)
{
Suftab *t;
char *cp, *sp;
Bits h = 0;
int initchar = ep[-1];
flag &= ~MONO;
lev += DLEV;
if(lev < DSIZ) {
deriv[lev] = emptyderiv;
deriv[lev-1] = emptyderiv;
}
if(!ISLOWER(initchar))
return h;
for(t=suftab[initchar-'a']; sp=t->suf; t++) {
cp = ep;
while(*sp)
if(*--cp != *sp++)
goto next;
for(sp=ep-t->n1; --sp >= word && !ISVOWEL(*sp);)
;
if(sp < word)
continue;
if(!(t->affixable & flag))
return 0;
h = (*t->p1)(ep-t->n1, t->d1, t->a1, lev+1, t->flag|STOP);
if(!h && t->p2!=0) {
if(lev < DSIZ) {
deriv[lev] = emptyderiv;
deriv[lev+1] = emptyderiv;
}
h = (*t->p2)(ep-t->n2, t->d2, t->a2, lev, t->flag|STOP);
}
break;
next:;
}
return h;
}
Bits
nop(char* ep, char* d, char* a, int lev, int flag)
{
USED(ep);
USED(d);
USED(a);
USED(lev);
USED(flag);
return 0;
}
Bits
cstrip(char* ep, char* d, char* a, int lev, int flag)
{
int temp = ep[0];
if(ISVOWEL(temp) && ISVOWEL(ep[-1])) {
switch(pair(ep[-1],ep[0])) {
case pair('a', 'a'):
case pair('a', 'e'):
case pair('a', 'i'):
case pair('e', 'a'):
case pair('e', 'e'):
case pair('e', 'i'):
case pair('i', 'i'):
case pair('o', 'a'):
return 0;
}
} else
if(temp==ep[-1]&&temp==ep[-2])
return 0;
return strip(ep,d,a,lev,flag);
}
Bits
strip(char* ep, char* d, char* a, int lev, int flag)
{
Bits h = trypref(ep, a, lev, flag);
USED(d);
if(Set(h,MONO) && ISVOWEL(*ep) && ISVOWEL(ep[-2]))
h = 0;
if(h)
return h;
if(ISVOWEL(*ep) && !ISVOWEL(ep[-1]) && ep[-1]==ep[-2]) {
h = trypref(ep-1,a,lev,flag|MONO);
if(h)
return h;
}
return trysuff(ep,lev,flag);
}
Bits
s(char* ep, char* d, char* a, int lev, int flag)
{
if(lev > DLEV+1)
return 0;
if(*ep=='s') {
switch(ep[-1]) {
case 'y':
if(ISVOWEL(ep[-2])||ISUPPER(*word))
break; /*says Kennedys*/
case 'x':
case 'z':
case 's':
return 0;
case 'h':
switch(ep[-2]) {
case 'c':
case 's':
return 0;
}
}
}
return strip(ep,d,a,lev,flag);
}
Bits
an(char* ep, char* d, char* a, int lev, int flag)
{
USED(d);
if(!ISUPPER(*word)) /*must be proper name*/
return 0;
return trypref(ep,a,lev,flag);
}
Bits
ize(char* ep, char* d, char* a, int lev, int flag)
{
int temp = ep[-1];
Bits h;
USED(a);
ep[-1] = 'e';
h = strip(ep,"",d,lev,flag);
ep[-1] = temp;
return h;
}
Bits
y_to_e(char* ep, char* d, char* a, int lev, int flag)
{
Bits h;
int temp;
USED(a);
switch(ep[-1]) {
case 'a':
case 'e':
case 'i':
return 0;
}
temp = *ep;
*ep++ = 'e';
h = strip(ep,"",d,lev,flag);
ep[-1] = temp;
return h;
}
Bits
ily(char* ep, char* d, char* a, int lev, int flag)
{
int temp = ep[0];
char *cp = ep;
if(temp==ep[-1]&&temp==ep[-2]) /* sillly */
return 0;
if(*--cp=='y' && !ISVOWEL(*--cp)) /* happyly */
while(cp>word)
if(ISVOWEL(*--cp)) /* shyness */
return 0;
if(ep[-1]=='i')
return i_to_y(ep,d,a,lev,flag);
return cstrip(ep,d,a,lev,flag);
}
Bits
bility(char* ep, char* d, char* a, int lev, int flag)
{
*ep++ = 'l';
return y_to_e(ep,d,a,lev,flag);
}
Bits
i_to_y(char* ep, char* d, char* a, int lev, int flag)
{
Bits h;
int temp;
if(ISUPPER(*word))
return 0;
if((temp=ep[-1])=='i' && !ISVOWEL(ep[-2])) {
ep[-1] = 'y';
a = d;
}
h = cstrip(ep,"",a,lev,flag);
ep[-1] = temp;
return h;
}
Bits
es(char* ep, char* d, char* a, int lev, int flag)
{
if(lev>DLEV)
return 0;
switch(ep[-1]) {
default:
return 0;
case 'i':
return i_to_y(ep,d,a,lev,flag);
case 'h':
switch(ep[-2]) {
default:
return 0;
case 'c':
case 's':
break;
}
case 's':
case 'z':
case 'x':
return strip(ep,d,a,lev,flag);
}
}
Bits
subst(char* ep, char* d, char* a, int lev, int flag)
{
char *u,*t;
Bits h;
USED(a);
if(skipv(skipv(ep-1)) < word)
return 0;
for(t=d; *t!='+'; t++)
continue;
for(u=ep; *--t!='-';)
*--u = *t;
h = strip(ep,"",d,lev,flag);
while(*++t != '+')
continue;
while(*++t)
*u++ = *t;
return h;
}
Bits
tion(char* ep, char* d, char* a, int lev, int flag)
{
switch(ep[-2]) {
default:
return trypref(ep,a,lev,flag);
case 'a':
case 'e':
case 'i':
case 'o':
case 'u':
return y_to_e(ep,d,a,lev,flag);
}
}
/*
* possible consonant-consonant-e ending
*/
Bits
CCe(char* ep, char* d, char* a, int lev, int flag)
{
Bits h;
switch(ep[-1]) {
case 'l':
if(ISVOWEL(ep[-2]))
break;
switch(ep[-2]) {
case 'l':
case 'r':
case 'w':
break;
default:
return y_to_e(ep,d,a,lev,flag);
}
break;
case 'c':
case 'g':
if(*ep == 'a') /* prevent -able for -eable */
return 0;
case 's':
case 'v':
case 'z':
if(ep[-2]==ep[-1])
break;
if(ISVOWEL(ep[-2]))
break;
case 'u':
if(h = y_to_e(ep,d,a,lev,flag))
return h;
if(!(ep[-2]=='n' && ep[-1]=='g'))
return 0;
}
return VCe(ep,d,a,lev,flag);
}
/*
* possible consonant-vowel-consonant-e ending
*/
Bits
VCe(char* ep, char* d, char* a, int lev, int flag)
{
int c;
Bits h;
c = ep[-1];
if(c=='e')
return 0;
if(!ISVOWEL(c) && ISVOWEL(ep[-2])) {
c = *ep;
*ep++ = 'e';
h = trypref(ep,d,lev,flag);
if(!h)
h = trysuff(ep,lev,flag);
if(h)
return h;
ep--;
*ep = c;
}
return cstrip(ep,d,a,lev,flag);
}
Ptab*
lookuppref(uchar** wp, char* ep)
{
Ptab *sp;
uchar *bp,*cp;
unsigned int initchar = Tolower(**wp);
if(!ISALPHA(initchar))
return 0;
for(sp=preftab[initchar-'a'];sp->s;sp++) {
bp = *wp;
for(cp= (uchar*)sp->s;*cp; )
if(*bp++!=*cp++)
goto next;
for(cp=bp;cp<(uchar*)ep;cp++)
if(ISVOWEL(*cp)) {
*wp = bp;
return sp;
}
next:;
}
return 0;
}
/* while word is not in dictionary try stripping
* prefixes. Fail if no more prefixes.
*/
Bits
trypref(char* ep, char* a, int lev, int flag)
{
Ptab *tp;
char *bp, *cp;
char *pp;
Bits h;
char space[20];
if(lev<DSIZ) {
deriv[lev].mesg = a;
deriv[lev].type = *a=='.'? NONE: SUFF;
}
if(h = tryword(word,ep,lev,flag)) {
if(Set(h, flag&~MONO) && (flag&MONO) <= Set(h, MONO))
return h;
h = 0;
}
bp = word;
pp = space;
if(lev<DSIZ) {
deriv[lev+1].mesg = pp;
deriv[lev+1].type = 0;
}
while(tp=lookuppref((uchar**)&bp,ep)) {
*pp++ = '+';
cp = tp->s;
while(pp<space+sizeof(space) && (*pp = *cp++))
pp++;
deriv[lev+1].type += PREF;
h = tryword(bp,ep,lev+1,flag);
if(Set(h,NOPREF) ||
((tp->flag&IN) && inun(bp-2,h)==0)) {
h = 0;
break;
}
if(Set(h,flag&~MONO) && (flag&MONO) <= Set(h, MONO))
break;
h = 0;
}
if(lev < DSIZ) {
deriv[lev+1] = emptyderiv;
deriv[lev+2] = emptyderiv;
}
return h;
}
Bits
tryword(char* bp, char* ep, int lev, int flag)
{
int j;
Bits h = 0;
char duple[3];
if(ep-bp <= 1)
return h;
if(flag&MONO) {
if(lev<DSIZ) {
deriv[++lev].mesg = duple;
deriv[lev].type = SUFF;
}
duple[0] = '+';
duple[1] = *ep;
duple[2] = 0;
}
h = dict(bp, ep);
if(vflag==0 || h==0)
return h;
/*
* when derivations are wanted, collect them
* for printing
*/
j = lev;
prefcount = suffcount = 0;
do {
if(j<DSIZ && deriv[j].type) {
strcat(affix, deriv[j].mesg);
if(deriv[j].type == SUFF)
suffcount++;
else if(deriv[j].type != NONE)
prefcount = deriv[j].type/PREF;
}
} while(--j > 0);
return h;
}
int
inun(char* bp, Bits h)
{
if(*bp == 'u')
return Set(h, IN) == 0;
/* *bp == 'i' */
if(Set(h, IN) == 0)
return 0;
switch(bp[2]) {
case 'r':
return bp[1] == 'r';
case 'm':
case 'p':
return bp[1] == 'm';
}
return bp[1] == 'n';
}
char*
skipv(char *s)
{
if(s >= word && ISVOWEL(*s))
s--;
while(s >= word && !ISVOWEL(*s))
s--;
return s;
}
/*
* crummy way to Britishise
*/
void
ise(void)
{
Suftab *p;
int i;
for(i=0; i<26; i++)
for(p = suftab[i]; p->suf; p++) {
p->suf = ztos(p->suf);
p->d1 = ztos(p->d1);
p->a1 = ztos(p->a1);
}
}
char*
ztos(char *as)
{
char *s, *ds;
for(s=as; *s; s++)
if(*s == 'z')
goto copy;
return as;
copy:
ds = strdup(as);
for(s=ds; *s; s++)
if(*s == 'z')
*s = 's';
return ds;
}
Bits
dict(char* bp, char* ep)
{
char *cp, *cp1, *w, *wp, *we;
int n, f;
w = bp;
we = ep;
n = ep-bp;
if(n <= 1)
return NOUN;
f = w[0] & 0x7f;
f *= 128;
f += w[1] & 0x7f;
bp = spacep[f];
ep = spacep[f+1];
loop:
if(bp >= ep) {
if(xflag)
fprint(2, "=%.*s\n", utfnlen(w, n), w);
return 0;
}
/*
* find the beginning of some word in the middle
*/
cp = bp + (ep-bp)/2;
while(cp > bp && !(*cp & 0x80))
cp--;
while(cp > bp && (cp[-1] & 0x80))
cp--;
wp = w + 2; /* skip two letters */
cp1 = cp + 2; /* skip affix code */
for(;;) {
if(wp >= we) {
if(*cp1 & 0x80)
goto found;
else
f = 1;
break;
}
if(*cp1 & 0x80) {
f = -1;
break;
}
f = *cp1++ - *wp++;
if(f != 0)
break;
}
if(f < 0) {
while(!(*cp1 & 0x80))
cp1++;
bp = cp1;
goto loop;
}
ep = cp;
goto loop;
found:
f = ((cp[0] & 0x7) << 8) |
(cp[1] & 0xff);
if(xflag) {
fprint(2, "=%.*s ", utfnlen(w, n), w);
typeprint(encode[f]);
}
return encode[f];
}
void
typeprint(Bits h)
{
pcomma("");
if(h & NOUN)
pcomma("n");
if(h & PROP_COLLECT)
pcomma("pc");
if(h & VERB) {
if((h & VERB) == VERB)
pcomma("v");
else
if((h & VERB) == V_IRREG)
pcomma("vi");
else
if(h & ED)
pcomma("ed");
}
if(h & ADJ)
pcomma("a");
if(h & COMP) {
if((h & COMP) == ACTOR)
pcomma("er");
else
pcomma("comp");
}
if(h & DONT_TOUCH)
pcomma("d");
if(h & N_AFFIX)
pcomma("na");
if(h & ADV)
pcomma("adv");
if(h & ION)
pcomma("ion");
if(h & V_AFFIX)
pcomma("va");
if(h & MAN)
pcomma("man");
if(h & NOPREF)
pcomma("nopref");
if(h & MONO)
pcomma("ms");
if(h & IN)
pcomma("in");
if(h & _Y)
pcomma("y");
if(h & STOP)
pcomma("s");
fprint(2, "\n");
}
void
pcomma(char *s)
{
static int flag;
if(*s == 0) {
flag = 0;
return;
}
if(!flag) {
fprint(2, "%s", s);
flag = 1;
} else
fprint(2, ",%s", s);
}
/*
* is the word on of the following
* 12th teen
* 21st end in 1
* 23rd end in 3
* 77th default
* called knowing word[0] is a digit
*/
int
ordinal(void)
{
char *cp = word;
static char sp[4];
while(ISDIGIT(*cp))
cp++;
strncpy(sp,cp,3);
if(ISUPPER(cp[0]) && ISUPPER(cp[1])) {
sp[0] = Tolower(cp[0]);
sp[1] = Tolower(cp[1]);
}
return 0 == strncmp(sp,
cp[-2]=='1'? "th": /* out of bounds if 1 digit */
*--cp=='1'? "st": /* harmless */
*cp=='2'? "nd":
*cp=='3'? "rd":
"th", 3);
}
/*
* read in the dictionary.
* format is
* {
* short nencode;
* long encode[nencode];
* char space[*];
* };
*
* the encodings are a table all different
* affixes.
* the dictionary proper has 2 bytes
* that demark and then the rest of the
* word. the 2 bytes have the following
* 0x80 0x00 flag
* 0x78 0x00 count of prefix bytes
* common with prev word
* 0x07 0xff affix code
*
* all ints are big endians in the file.
*/
void
readdict(char *file)
{
char *s, *is, *lasts, *ls;
int c, i, sp, p;
int f;
long l;
lasts = 0;
f = open(file, 0);
if(f == -1) {
fprint(2, "cannot open %s\n", file);
exits("open");
}
if(read(f, space, 2) != 2)
goto bad;
nencode = ((space[0]&0xff)<<8) | (space[1]&0xff);
if(read(f, space, 4*nencode) != 4*nencode)
goto bad;
s = space;
for(i=0; i<nencode; i++) {
l = (long)(s[0] & 0xff) << 24;
l |= (s[1] & 0xff) << 16;
l |= (s[2] & 0xff) << 8;
l |= s[3] & 0xff;
encode[i] = (Bits)l;
s += 4;
}
l = read(f, space, sizeof(space));
if(l == sizeof(space))
goto noroom;
is = space + (sizeof(space) - l);
memmove(is, space, l);
s = space;
c = *is++ & 0xff;
sp = -1;
i = 0;
loop:
if(s > is)
goto noroom;
if(c < 0) {
close(f);
while(sp < 128*128)
spacep[++sp] = s;
*s = 0x80; /* fence */
return;
}
p = (c>>3) & 0xf;
*s++ = c;
*s++ = *is++ & 0xff;
if(p <= 0)
i = (*is++ & 0xff)*128;
if(p <= 1) {
if(!(*is & 0x80))
i = i/128*128 + (*is++ & 0xff);
if(i <= sp) {
fprint(2, "the dict isnt sorted or \n");
fprint(2, "memmove didn't work\n");
goto bad;
}
while(sp < i)
spacep[++sp] = s-2;
}
ls = lasts;
lasts = s;
for(p-=2; p>0; p--)
*s++ = *ls++;
for(;;) {
if(is >= space+sizeof(space)) {
c = -1;
break;
}
c = *is++ & 0xff;
if(c & 0x80)
break;
*s++ = c;
}
*s = 0;
goto loop;
bad:
fprint(2, "trouble reading %s\n", file);
exits("read");
noroom:
fprint(2, "not enough space for dictionary\n");
exits("space");
}