plan9port/src/libregexp/regcomp.c
2003-09-30 17:47:41 +00:00

557 lines
9.5 KiB
C

#include "lib9.h"
#include "regexp9.h"
#include "regcomp.h"
#define TRUE 1
#define FALSE 0
/*
* Parser Information
*/
/*
* One entry of the operand stack: a fragment of compiled instructions,
* identified by its first instruction and its last instruction.
* Fragments are joined by storing into last->u2.next (see evaluntil).
*/
typedef
struct Node
{
Reinst* first; /* first instruction of the fragment */
Reinst* last; /* last instruction of the fragment */
}Node;
/* NOTE(review): RePrOg is not referenced by any code visible in this file - confirm it is still needed */
Reprog RePrOg;
/* capacity of the operand stack and of the operator/subid stacks */
#define NSTACK 20
static Node andstack[NSTACK]; /* operand stack */
static Node *andp; /* next free slot in andstack */
static int atorstack[NSTACK]; /* operator stack */
static int* atorp; /* next free slot in atorstack */
static int cursubid; /* id of current subexpression */
static int subidstack[NSTACK]; /* parallel to atorstack */
static int* subidp; /* next free slot in subidstack */
static int lastwasand; /* Last token was operand */
static int nbra; /* count of left parens not yet closed */
static char* exprp; /* pointer to next character in source expression */
static int lexdone; /* set by nextc once the terminating NUL has been read */
static int nclass; /* number of character classes handed out by newclass */
static Reclass*classp; /* class array inside the program being built */
static Reinst* freep; /* next unused instruction slot */
static int errors; /* incremented by rcerror; nonzero makes regcomp1 free the program */
static Rune yyrune; /* last lex'd rune */
static Reclass*yyclassp; /* last lex'd class */
/* forward declarations */
static void operator(int);
static void pushand(Reinst*, Reinst*);
static void pushator(int);
static void evaluntil(int);
static int bldcclass(void);
/* set by setjmp in regcomp1; rcerror longjmps here to abandon a compile */
static jmp_buf regkaboom;
/*
 * Report a compilation error and abandon the compile in progress.
 * Counts the error, passes the message s to regerror(), and then
 * longjmps to regkaboom, so this function never returns to its caller.
 */
static void
rcerror(char *s)
{
	errors += 1;
	regerror(s);
	longjmp(regkaboom, 1);
}
static Reinst*
newinst(int t)
{
freep->type = t;
freep->u2.left = 0;
freep->u1.right = 0;
return freep++;
}
/*
 * Handle an operand token of type t: emit one instruction for it and
 * push it on the operand stack as a single-instruction fragment.
 * Character-class operands pick up the class last built by the lexer
 * (yyclassp); RUNE operands pick up the last lexed rune (yyrune).
 */
static void
operand(int t)
{
	Reinst *ip;
	int isclass;

	/* two operands in a row: catenation is implicit */
	if(lastwasand)
		operator(CAT);

	ip = newinst(t);
	isclass = (t == CCLASS || t == NCCLASS);
	if(isclass)
		ip->u1.cp = yyclassp;
	if(t == RUNE)
		ip->u1.r = yyrune;

	pushand(ip, ip);
	lastwasand = TRUE;
}
/*
 * Handle an operator token t.
 * A right paren must match an open left paren.  A left paren opens a
 * new subexpression (after an implicit CAT if an operand precedes it);
 * every other operator first forces evaluation of stacked operators via
 * evaluntil(t).  Everything except RBRA is then pushed on the operator
 * stack.
 */
static void
operator(int t)
{
	if(t == RBRA){
		nbra--;
		if(nbra < 0)
			rcerror("unmatched right paren");
	}

	if(t != LBRA)
		evaluntil(t);
	else {
		cursubid++;
		if(cursubid >= NSUBEXP)
			rcerror ("too many subexpressions");
		nbra++;
		if(lastwasand)
			operator(CAT);
	}

	if(t != RBRA)
		pushator(t);

	/* postfix operators and a closing paren look like operands to what follows */
	if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
		lastwasand = TRUE;
	else
		lastwasand = FALSE;
}
/*
 * Report an error whose text is the string s followed by the single
 * character c, then abandon the compile through rcerror().
 * The message is built in a fixed-size buffer; s is truncated if it
 * would not leave room for c and the terminating NUL.
 */
static void
regerr2(char *s, int c)
{
	char buf[100];
	char *cp, *ep;

	cp = buf;
	ep = buf + sizeof buf - 2;	/* reserve space for c and the NUL */
	while(*s != '\0' && cp < ep)
		*cp++ = *s++;
	*cp++ = c;
	*cp = '\0';
	rcerror(buf);
}
/*
 * Report an internal "can't happen" error described by s and abandon
 * the compile through rcerror().
 * The message is assembled in a fixed-size buffer; s is truncated
 * rather than allowed to overrun it.
 */
static void
cant(char *s)
{
	char buf[100];

	strcpy(buf, "can't happen: ");
	/* strncat always NUL-terminates; leave one byte for that NUL */
	strncat(buf, s, sizeof buf - strlen(buf) - 1);
	rcerror(buf);
}
/*
 * Push the instruction fragment running from f to l on the operand
 * stack.  A full stack is reported through cant().
 */
static void
pushand(Reinst *f, Reinst *l)
{
	Node *slot;

	if(andp >= andstack + NSTACK)
		cant("operand stack overflow");
	slot = andp++;
	slot->first = f;
	slot->last = l;
}
/*
 * Push operator t on the operator stack and record the current
 * subexpression id in the parallel subid stack.
 * A full stack is reported through cant().
 */
static void
pushator(int t)
{
	if(atorp >= atorstack + NSTACK)
		cant("operator stack overflow");
	*atorp = t;
	atorp++;
	*subidp = cursubid;
	subidp++;
}
/*
 * Pop the top fragment off the operand stack and return a pointer to
 * its slot.  op is the operator character that wanted the operand; it
 * is used only in the error message when the stack is empty.
 * The returned pointer refers to a stack slot that the next pushand()
 * will overwrite.
 */
static Node*
popand(int op)
{
	Reinst *filler;

	if(andp <= andstack){
		/* regerr2 ends in rcerror, which longjmps away */
		regerr2("missing operand for ", op);
		filler = newinst(NOP);
		pushand(filler, filler);
	}
	andp--;
	return andp;
}
/*
 * Pop and return the top operator.  The parallel subid stack is popped
 * too, leaving *subidp as the id that was recorded with that operator.
 * An empty stack is reported through cant().
 */
static int
popator(void)
{
	if(atorp <= atorstack)
		cant("operator stack underflow");
	subidp--;
	atorp--;
	return *atorp;
}
/*
* Pop operators off the operator stack and apply them to the operand
* stack, stopping at the first stacked operator whose value is below pri.
* When pri is RBRA the loop instead keeps going until an LBRA is popped,
* which closes the parenthesised group and returns.
* Each case pops its operand fragment(s), allocates any new instructions
* with newinst(), and pushes the combined fragment back with pushand().
* popand() returns pointers into andstack; every case reads the popped
* fields before (or as arguments to) the pushand() that reuses the slot.
*/
static void
evaluntil(int pri)
{
Node *op1, *op2;
Reinst *inst1, *inst2;
/* atorp[-1] is the operator currently on top of the stack */
while(pri==RBRA || atorp[-1]>=pri){
switch(popator()){
default:
rcerror("unknown operator in evaluntil");
break;
case LBRA: /* must have been RBRA */
/*
* Wrap the operand as LBRA -> operand -> RBRA.  popator() has
* already stepped subidp back, so *subidp is the id that was
* recorded when this LBRA was pushed.
*/
op1 = popand('(');
inst2 = newinst(RBRA);
inst2->u1.subid = *subidp;
op1->last->u2.next = inst2;
inst1 = newinst(LBRA);
inst1->u1.subid = *subidp;
inst1->u2.next = op1->first;
pushand(inst1, inst2);
return;
case OR:
/*
* Both alternatives end at a shared NOP; a new OR instruction
* branches to op1 (right) and op2 (left).
*/
op2 = popand('|');
op1 = popand('|');
inst2 = newinst(NOP);
op2->last->u2.next = inst2;
op1->last->u2.next = inst2;
inst1 = newinst(OR);
inst1->u1.right = op1->first;
inst1->u2.left = op2->first;
pushand(inst1, inst2);
break;
case CAT:
/* link the end of op1 to the start of op2 */
op2 = popand(0);
op1 = popand(0);
op1->last->u2.next = op2->first;
pushand(op1->first, op2->last);
break;
case STAR:
/*
* New OR whose right branch enters the operand; the operand
* loops back to the OR.  The fragment both starts and ends at
* the OR, so zero iterations are possible.
*/
op2 = popand('*');
inst1 = newinst(OR);
op2->last->u2.next = inst1;
inst1->u1.right = op2->first;
pushand(inst1, inst1);
break;
case PLUS:
/*
* Same loop as STAR, but the fragment starts at the operand,
* so it is executed at least once before reaching the OR.
*/
op2 = popand('+');
inst1 = newinst(OR);
op2->last->u2.next = inst1;
inst1->u1.right = op2->first;
pushand(op2->first, inst1);
break;
case QUEST:
/*
* OR whose right branch enters the operand and whose left
* branch skips to a NOP; the operand also ends at that NOP.
*/
op2 = popand('?');
inst1 = newinst(OR);
inst2 = newinst(NOP);
inst1->u2.left = inst2;
inst1->u1.right = op2->first;
op2->last->u2.next = inst2;
pushand(inst1, inst2);
break;
}
}
}
/*
 * Translate a pointer that referred into the program block at oldbase
 * so that it refers to the same offset in the block at newbase.
 * A null pointer stays null.
 */
static void*
relocptr(void *p, char *oldbase, char *newbase)
{
	if(p == 0)
		return 0;
	return newbase + ((char*)p - oldbase);
}

/*
 * Post-process a freshly compiled program:
 *   1. make every instruction before END skip over chains of NOPs;
 *   2. shrink the allocation to the space actually used and, if
 *      realloc moved the block, fix up every internal pointer.
 * Returns the (possibly moved) program.  If realloc fails or does not
 * move the block, pp itself is returned unchanged.
 *
 * The relocation works on full-width pointers, so it stays correct
 * when the old and new blocks are far apart in the address space.
 */
static Reprog*
optimize(Reprog *pp)
{
	Reinst *inst, *target;
	Reprog *npp;
	Reclass *cl;
	char *oldbase, *newbase;

	/*
	 * get rid of NOP chains
	 */
	for(inst=pp->firstinst; inst->type!=END; inst++){
		target = inst->u2.next;
		while(target->type == NOP)
			target = target->u2.next;
		inst->u2.next = target;
	}

	/*
	 * The original allocation is for an area larger than
	 * necessary.  Reallocate to the actual space used
	 * and then relocate the code.
	 */
	oldbase = (char*)pp;	/* remembered only as an address; never dereferenced after realloc */
	npp = realloc(pp, sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst));
	if(npp==0 || (char*)npp==oldbase)
		return pp;
	newbase = (char*)npp;

	freep = relocptr(freep, oldbase, newbase);
	for(inst=npp->firstinst; inst<freep; inst++){
		switch(inst->type){
		case OR:
		case STAR:
		case PLUS:
		case QUEST:
			/* u1 holds a branch target inside the program */
			inst->u1.right = relocptr(inst->u1.right, oldbase, newbase);
			break;
		case CCLASS:
		case NCCLASS:
			/* u1 holds a class inside the program; its end pointer moves too */
			inst->u1.cp = relocptr(inst->u1.cp, oldbase, newbase);
			cl = inst->u1.cp;
			cl->end = relocptr(cl->end, oldbase, newbase);
			break;
		}
		inst->u2.left = relocptr(inst->u2.left, oldbase, newbase);
	}
	npp->startinst = relocptr(npp->startinst, oldbase, newbase);
	return npp;
}
#ifdef DEBUG
static void
dumpstack(void){
Node *stk;
int *ip;
print("operators\n");
for(ip=atorstack; ip<atorp; ip++)
print("0%o\n", *ip);
print("operands\n");
for(stk=andstack; stk<andp; stk++)
print("0%o\t0%o\n", stk->first->type, stk->last->type);
}
/*
 * Debugging aid: print every instruction of program pp as
 *   index, type, index of u2.left, index of u1.right
 * followed by the rune for RUNE instructions or the span list for
 * character classes.
 *
 * Operands are read through the u1 member, matching every other access
 * in this file, and pointer differences are cast to int to match the
 * %d conversions.
 * NOTE(review): the loop stops after printing an instruction whose type
 * is 0 - confirm that this is how the program is terminated.
 */
static void
dump(Reprog *pp)
{
	Reinst *l;
	Rune *p;

	l = pp->firstinst;
	do{
		print("%d:\t0%o\t%d\t%d", (int)(l-pp->firstinst), l->type,
			(int)(l->u2.left-pp->firstinst), (int)(l->u1.right-pp->firstinst));
		if(l->type == RUNE)
			print("\t%C\n", l->u1.r);
		else if(l->type == CCLASS || l->type == NCCLASS){
			print("\t[");
			if(l->type == NCCLASS)
				print("^");
			/* spans are stored as (low, high) pairs */
			for(p = l->u1.cp->spans; p < l->u1.cp->end; p += 2)
				if(p[0] == p[1])
					print("%C", p[0]);
				else
					print("%C-%C", p[0], p[1]);
			print("]\n");
		} else
			print("\n");
	}while(l++->type);
}
#endif
/*
 * Hand out the next unused character class from the program's class
 * array.  Running past NCLASS classes is reported through regerr2().
 * NOTE(review): the limit is appended to the message as the single
 * character NCLASS+'0', which is a digit only when NCLASS is below 10.
 */
static Reclass*
newclass(void)
{
	Reclass *cl;

	if(nclass >= NCLASS)
		regerr2("too many character classes; limit", NCLASS+'0');
	cl = classp + nclass;
	nclass++;
	return cl;
}
/*
 * Fetch the next rune of the source expression into *rp.
 * Returns 1 if the rune is to be taken literally: either it followed a
 * backslash, or the input is already exhausted (in which case *rp is
 * 0).  Returns 0 otherwise.  Reading the terminating NUL sets lexdone.
 */
static int
nextc(Rune *rp)
{
	int n;

	if(lexdone){
		*rp = 0;
		return 1;
	}

	n = chartorune(rp, exprp);
	exprp += n;

	if(*rp == L'\\'){
		/* backslash: the rune after it is returned as quoted */
		n = chartorune(rp, exprp);
		exprp += n;
		return 1;
	}

	if(*rp == 0)
		lexdone = 1;
	return 0;
}
/*
 * Read the next token of the expression and return its type; the rune
 * itself is left in yyrune.
 * If literal is nonzero, or the rune was quoted, the result is RUNE
 * (or END for a zero rune).  Otherwise metacharacters map to their
 * token types, '.' maps to dot_type, and '[' is handed to bldcclass(),
 * whose result is returned.
 */
static int
lex(int literal, int dot_type)
{
	int quoted;

	quoted = nextc(&yyrune);
	if(literal || quoted)
		return yyrune == 0 ? END : RUNE;

	switch(yyrune){
	case 0:
		return END;
	case L'*':
		return STAR;
	case L'?':
		return QUEST;
	case L'+':
		return PLUS;
	case L'|':
		return OR;
	case L'.':
		return dot_type;
	case L'(':
		return LBRA;
	case L')':
		return RBRA;
	case L'^':
		return BOL;
	case L'$':
		return EOL;
	case L'[':
		return bldcclass();
	default:
		return RUNE;
	}
}
/*
 * Parse a bracket expression (the '[' has already been consumed) into
 * a new character class, left in yyclassp.
 * Returns CCLASS, or NCCLASS when the class starts with an unquoted
 * '^'.  Errors are reported through rcerror(), which does not return.
 *
 * The members are first collected as (low, high) pairs in a local
 * buffer, then sorted by low end, then merged into the class so that
 * overlapping spans are combined.
 *
 * A class with more members than the buffer can hold is an error; it
 * is never silently cut short, which would leave the rest of the
 * bracket expression to be lexed as ordinary regexp text.
 *
 * NOTE(review): in a negated class the implicit newline span counts as
 * a preceding member, so "[^-x]" is not rejected the way "[-x]" is;
 * confirm whether that is intended.
 */
static int
bldcclass(void)
{
	int type;
	Rune r[NCCRUNE];
	Rune *p, *ep, *np;
	Rune rune;
	int quoted;

	/* we have already seen the '[' */
	type = CCLASS;
	yyclassp = newclass();

	/* look ahead for negation */
	/* SPECIAL CASE!!! negated classes don't match \n */
	ep = r;
	quoted = nextc(&rune);
	if(!quoted && rune == L'^'){
		type = NCCLASS;
		quoted = nextc(&rune);
		*ep++ = L'\n';
		*ep++ = L'\n';
	}

	/* parse class into a set of spans */
	for(;;){
		if(rune == 0){
			rcerror("malformed '[]'");
			return 0;
		}
		if(!quoted && rune == L']')
			break;
		if(!quoted && rune == L'-'){
			/* a range needs a member before the '-' ... */
			if(ep == r){
				rcerror("malformed '[]'");
				return 0;
			}
			/* ... and a real member after it */
			quoted = nextc(&rune);
			if((!quoted && rune == L']') || rune == 0){
				rcerror("malformed '[]'");
				return 0;
			}
			/* extend the high end of the most recent span */
			*(ep-1) = rune;
		} else {
			/* a new span needs two free slots */
			if(&r[NCCRUNE] - ep < 2){
				rcerror("character class too large");
				return 0;
			}
			*ep++ = rune;
			*ep++ = rune;
		}
		quoted = nextc(&rune);
	}

	/* sort on span start */
	for(p = r; p < ep; p += 2){
		for(np = p; np < ep; np += 2)
			if(*np < *p){
				rune = np[0];
				np[0] = p[0];
				p[0] = rune;
				rune = np[1];
				np[1] = p[1];
				p[1] = rune;
			}
	}

	/* merge spans */
	np = yyclassp->spans;
	p = r;
	if(r == ep)
		yyclassp->end = np;
	else {
		np[0] = *p++;
		np[1] = *p++;
		for(; p < ep; p += 2)
			if(p[0] <= np[1]){
				/* overlaps the span being built: widen it if needed */
				if(p[1] > np[1])
					np[1] = p[1];
			} else {
				np += 2;
				np[0] = p[0];
				np[1] = p[1];
			}
		yyclassp->end = np+2;
	}

	return type;
}
/*
* Compile the regular expression source s into a newly allocated program.
*   literal   passed to lex(); nonzero makes lex() return only RUNE/END
*   dot_type  token type lex() returns for an unquoted '.'
* Returns the program, or 0 if memory could not be allocated or any
* error was reported while compiling (the program is freed in that case).
* All parser state lives in file-level statics that are reset here.
*/
static Reprog*
regcomp1(char *s, int literal, int dot_type)
{
int token;
Reprog *pp;
/* get memory for the program */
/* room for six instructions per byte of source, on top of the Reprog itself */
pp = (Reprog *)malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
if(pp == 0){
regerror("out of memory");
return 0;
}
freep = pp->firstinst;
classp = pp->class;
errors = 0;
/* rcerror() longjmps back here; errors is then nonzero, so "out" frees pp */
if(setjmp(regkaboom))
goto out;
/* go compile the sucker */
lexdone = 0;
exprp = s;
nclass = 0;
nbra = 0;
atorp = atorstack;
andp = andstack;
subidp = subidstack;
lastwasand = FALSE;
cursubid = 0;
/* Start with a low priority operator to prime parser */
pushator(START-1);
/* tokens whose 0300 bits equal OPERATOR are operators; everything else is an operand */
while((token = lex(literal, dot_type)) != END){
if((token&0300) == OPERATOR)
operator(token);
else
operand(token);
}
/* Close with a low priority operator */
evaluntil(START);
/* Force END */
operand(END);
evaluntil(START);
#ifdef DEBUG
dumpstack();
#endif
if(nbra)
rcerror("unmatched left paren");
--andp; /* points to first and only operand */
pp->startinst = andp->first;
#ifdef DEBUG
dump(pp);
#endif
/* optimize() may move the program, so pp is replaced by its result */
pp = optimize(pp);
#ifdef DEBUG
print("start: %d\n", andp->first-pp->firstinst);
dump(pp);
#endif
out:
if(errors){
free(pp);
pp = 0;
}
return pp;
}
extern Reprog*
regcomp(char *s)
{
return regcomp1(s, 0, ANY);
}
extern Reprog*
regcomplit(char *s)
{
return regcomp1(s, 1, ANY);
}
extern Reprog*
regcompnl(char *s)
{
return regcomp1(s, 0, ANYNL);
}