plan9port/src/cmd/acme/regx.c

836 lines
16 KiB
C
Raw Normal View History

#include <u.h>
#include <libc.h>
#include <draw.h>
#include <thread.h>
#include <cursor.h>
#include <mouse.h>
#include <keyboard.h>
#include <frame.h>
#include <fcall.h>
#include <plumb.h>
#include "dat.h"
#include "fns.h"
Rangeset sel;
Rune *lastregexp;
/*
* Machine Information
*/
typedef struct Inst Inst;
struct Inst
{
uint type; /* < 0x10000 ==> literal, otherwise action */
union {
int sid;
int subid;
int class;
Inst *other;
Inst *right;
} u;
union{
Inst *left;
Inst *next;
} u1;
};
#define NPROG 1024
Inst program[NPROG];
Inst *progp;
Inst *startinst; /* First inst. of program; might not be program[0] */
Inst *bstartinst; /* same for backwards machine */
Channel *rechan; /* chan(Inst*) */
typedef struct Ilist Ilist;
struct Ilist
{
Inst *inst; /* Instruction of the thread */
Rangeset se;
uint startp; /* first char of match */
};
#define NLIST 128
Ilist *tl, *nl; /* This list, next list */
Ilist list[2][NLIST];
static Rangeset sempty;
/*
* Actions and Tokens
*
* 0x100xx are operators, value == precedence
* 0x200xx are tokens, i.e. operands for operators
*/
#define OPERATOR 0x10000 /* Bitmask of all operators */
#define START 0x10000 /* Start, used for marker on stack */
#define RBRA 0x10001 /* Right bracket, ) */
#define LBRA 0x10002 /* Left bracket, ( */
#define OR 0x10003 /* Alternation, | */
#define CAT 0x10004 /* Concatentation, implicit operator */
#define STAR 0x10005 /* Closure, * */
#define PLUS 0x10006 /* a+ == aa* */
#define QUEST 0x10007 /* a? == a|nothing, i.e. 0 or 1 a's */
#define ANY 0x20000 /* Any character but newline, . */
#define NOP 0x20001 /* No operation, internal use only */
#define BOL 0x20002 /* Beginning of line, ^ */
#define EOL 0x20003 /* End of line, $ */
#define CCLASS 0x20004 /* Character class, [] */
#define NCCLASS 0x20005 /* Negated character class, [^] */
#define END 0x20077 /* Terminate: match found */
#define ISATOR 0x10000
#define ISAND 0x20000
/*
* Parser Information
*/
typedef struct Node Node;
struct Node
{
Inst *first;
Inst *last;
};
#define NSTACK 20
Node andstack[NSTACK];
Node *andp;
int atorstack[NSTACK];
int *atorp;
int lastwasand; /* Last token was operand */
int cursubid;
int subidstack[NSTACK];
int *subidp;
int backwards;
int nbra;
Rune *exprp; /* pointer to next character in source expression */
#define DCLASS 10 /* allocation increment */
int nclass; /* number active */
int Nclass; /* high water mark */
Rune **class;
int negateclass;
void addinst(Ilist *l, Inst *inst, Rangeset *sep);
void newmatch(Rangeset*);
void bnewmatch(Rangeset*);
void pushand(Inst*, Inst*);
void pushator(int);
Node *popand(int);
int popator(void);
void startlex(Rune*);
int lex(void);
void operator(int);
void operand(int);
void evaluntil(int);
void optimize(Inst*);
void bldcclass(void);
void
rxinit(void)
{
rechan = chancreate(sizeof(Inst*), 0);
lastregexp = runemalloc(1);
}
void
regerror(char *e)
{
lastregexp[0] = 0;
warning(nil, "regexp: %s\n", e);
sendp(rechan, nil);
threadexits(nil);
}
Inst *
newinst(int t)
{
if(progp >= &program[NPROG])
regerror("expression too long");
progp->type = t;
progp->u1.left = nil;
progp->u.right = nil;
return progp++;
}
void
realcompile(void *arg)
{
int token;
Rune *s;
threadsetname("regcomp");
s = arg;
startlex(s);
atorp = atorstack;
andp = andstack;
subidp = subidstack;
cursubid = 0;
lastwasand = FALSE;
/* Start with a low priority operator to prime parser */
pushator(START-1);
while((token=lex()) != END){
if((token&ISATOR) == OPERATOR)
operator(token);
else
operand(token);
}
/* Close with a low priority operator */
evaluntil(START);
/* Force END */
operand(END);
evaluntil(START);
if(nbra)
regerror("unmatched `('");
--andp; /* points to first and only operand */
sendp(rechan, andp->first);
threadexits(nil);
}
/* r is null terminated */
int
rxcompile(Rune *r)
{
int i, nr;
Inst *oprogp;
nr = runestrlen(r)+1;
if(runeeq(lastregexp, runestrlen(lastregexp)+1, r, nr)==TRUE)
return TRUE;
lastregexp[0] = 0;
for(i=0; i<nclass; i++)
free(class[i]);
nclass = 0;
progp = program;
backwards = FALSE;
bstartinst = nil;
threadcreate(realcompile, r, STACK);
startinst = recvp(rechan);
if(startinst == nil)
return FALSE;
optimize(program);
oprogp = progp;
backwards = TRUE;
threadcreate(realcompile, r, STACK);
bstartinst = recvp(rechan);
if(bstartinst == nil)
return FALSE;
optimize(oprogp);
lastregexp = runerealloc(lastregexp, nr);
runemove(lastregexp, r, nr);
return TRUE;
}
void
operand(int t)
{
Inst *i;
if(lastwasand)
operator(CAT); /* catenate is implicit */
i = newinst(t);
if(t == CCLASS){
if(negateclass)
i->type = NCCLASS; /* UGH */
i->u.class = nclass-1; /* UGH */
}
pushand(i, i);
lastwasand = TRUE;
}
void
operator(int t)
{
if(t==RBRA && --nbra<0)
regerror("unmatched `)'");
if(t==LBRA){
cursubid++; /* silently ignored */
nbra++;
if(lastwasand)
operator(CAT);
}else
evaluntil(t);
if(t!=RBRA)
pushator(t);
lastwasand = FALSE;
if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
lastwasand = TRUE; /* these look like operands */
}
void
pushand(Inst *f, Inst *l)
{
if(andp >= &andstack[NSTACK])
error("operand stack overflow");
andp->first = f;
andp->last = l;
andp++;
}
void
pushator(int t)
{
if(atorp >= &atorstack[NSTACK])
error("operator stack overflow");
*atorp++=t;
if(cursubid >= NRange)
*subidp++= -1;
else
*subidp++=cursubid;
}
Node *
popand(int op)
{
char buf[64];
if(andp <= &andstack[0])
if(op){
sprint(buf, "missing operand for %c", op);
regerror(buf);
}else
regerror("malformed regexp");
return --andp;
}
int
popator()
{
if(atorp <= &atorstack[0])
error("operator stack underflow");
--subidp;
return *--atorp;
}
void
evaluntil(int pri)
{
Node *op1, *op2, *t;
Inst *inst1, *inst2;
while(pri==RBRA || atorp[-1]>=pri){
switch(popator()){
case LBRA:
op1 = popand('(');
inst2 = newinst(RBRA);
inst2->u.subid = *subidp;
op1->last->u1.next = inst2;
inst1 = newinst(LBRA);
inst1->u.subid = *subidp;
inst1->u1.next = op1->first;
pushand(inst1, inst2);
return; /* must have been RBRA */
default:
error("unknown regexp operator");
break;
case OR:
op2 = popand('|');
op1 = popand('|');
inst2 = newinst(NOP);
op2->last->u1.next = inst2;
op1->last->u1.next = inst2;
inst1 = newinst(OR);
inst1->u.right = op1->first;
inst1->u1.left = op2->first;
pushand(inst1, inst2);
break;
case CAT:
op2 = popand(0);
op1 = popand(0);
if(backwards && op2->first->type!=END){
t = op1;
op1 = op2;
op2 = t;
}
op1->last->u1.next = op2->first;
pushand(op1->first, op2->last);
break;
case STAR:
op2 = popand('*');
inst1 = newinst(OR);
op2->last->u1.next = inst1;
inst1->u.right = op2->first;
pushand(inst1, inst1);
break;
case PLUS:
op2 = popand('+');
inst1 = newinst(OR);
op2->last->u1.next = inst1;
inst1->u.right = op2->first;
pushand(op2->first, inst1);
break;
case QUEST:
op2 = popand('?');
inst1 = newinst(OR);
inst2 = newinst(NOP);
inst1->u1.left = inst2;
inst1->u.right = op2->first;
op2->last->u1.next = inst2;
pushand(inst1, inst2);
break;
}
}
}
void
optimize(Inst *start)
{
Inst *inst, *target;
for(inst=start; inst->type!=END; inst++){
target = inst->u1.next;
while(target->type == NOP)
target = target->u1.next;
inst->u1.next = target;
}
}
void
startlex(Rune *s)
{
exprp = s;
nbra = 0;
}
int
lex(void){
int c;
c = *exprp++;
switch(c){
case '\\':
if(*exprp)
if((c= *exprp++)=='n')
c='\n';
break;
case 0:
c = END;
--exprp; /* In case we come here again */
break;
case '*':
c = STAR;
break;
case '?':
c = QUEST;
break;
case '+':
c = PLUS;
break;
case '|':
c = OR;
break;
case '.':
c = ANY;
break;
case '(':
c = LBRA;
break;
case ')':
c = RBRA;
break;
case '^':
c = BOL;
break;
case '$':
c = EOL;
break;
case '[':
c = CCLASS;
bldcclass();
break;
}
return c;
}
int
nextrec(void)
{
if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
regerror("malformed `[]'");
if(exprp[0] == '\\'){
exprp++;
if(*exprp=='n'){
exprp++;
return '\n';
}
return *exprp++|0x10000;
}
return *exprp++;
}
void
bldcclass(void)
{
int c1, c2, n, na;
Rune *classp;
classp = runemalloc(DCLASS);
n = 0;
na = DCLASS;
/* we have already seen the '[' */
if(*exprp == '^'){
classp[n++] = '\n'; /* don't match newline in negate case */
negateclass = TRUE;
exprp++;
}else
negateclass = FALSE;
while((c1 = nextrec()) != ']'){
if(c1 == '-'){
Error:
free(classp);
regerror("malformed `[]'");
}
if(n+4 >= na){ /* 3 runes plus NUL */
na += DCLASS;
classp = runerealloc(classp, na);
}
if(*exprp == '-'){
exprp++; /* eat '-' */
if((c2 = nextrec()) == ']')
goto Error;
classp[n+0] = 0xFFFF;
classp[n+1] = c1;
classp[n+2] = c2;
n += 3;
}else
classp[n++] = c1;
}
classp[n] = 0;
if(nclass == Nclass){
Nclass += DCLASS;
class = realloc(class, Nclass*sizeof(Rune*));
}
class[nclass++] = classp;
}
int
classmatch(int classno, int c, int negate)
{
Rune *p;
p = class[classno];
while(*p){
if(*p == 0xFFFF){
if(p[1]<=c && c<=p[2])
return !negate;
p += 3;
}else if(*p++ == c)
return !negate;
}
return negate;
}
/*
* Note optimization in addinst:
* *l must be pending when addinst called; if *l has been looked
* at already, the optimization is a bug.
*/
void
addinst(Ilist *l, Inst *inst, Rangeset *sep)
{
Ilist *p;
for(p = l; p->inst; p++){
if(p->inst==inst){
if((sep)->r[0].q0 < p->se.r[0].q0)
p->se= *sep; /* this would be bug */
return; /* It's already there */
}
}
p->inst = inst;
p->se= *sep;
(p+1)->inst = nil;
}
int
rxnull(void)
{
return startinst==nil || bstartinst==nil;
}
/* either t!=nil or r!=nil, and we match the string in the appropriate place */
int
rxexecute(Text *t, Rune *r, uint startp, uint eof, Rangeset *rp)
{
int flag;
Inst *inst;
Ilist *tlp;
uint p;
int nnl, ntl;
int nc, c;
int wrapped;
int startchar;
flag = 0;
p = startp;
startchar = 0;
wrapped = 0;
nnl = 0;
if(startinst->type<OPERATOR)
startchar = startinst->type;
list[0][0].inst = list[1][0].inst = nil;
sel.r[0].q0 = -1;
if(t != nil)
nc = t->file->b.nc;
else
nc = runestrlen(r);
/* Execute machine once for each character */
for(;;p++){
doloop:
if(p>=eof || p>=nc){
switch(wrapped++){
case 0: /* let loop run one more click */
case 2:
break;
case 1: /* expired; wrap to beginning */
if(sel.r[0].q0>=0 || eof!=Infinity)
goto Return;
list[0][0].inst = list[1][0].inst = nil;
p = 0;
goto doloop;
default:
goto Return;
}
c = 0;
}else{
if(((wrapped && p>=startp) || sel.r[0].q0>0) && nnl==0)
break;
if(t != nil)
c = textreadc(t, p);
else
c = r[p];
}
/* fast check for first char */
if(startchar && nnl==0 && c!=startchar)
continue;
tl = list[flag];
nl = list[flag^=1];
nl->inst = nil;
ntl = nnl;
nnl = 0;
if(sel.r[0].q0<0 && (!wrapped || p<startp || startp==eof)){
/* Add first instruction to this list */
if(++ntl >= NLIST){
Overflow:
warning(nil, "regexp list overflow\n");
sel.r[0].q0 = -1;
goto Return;
}
sempty.r[0].q0 = p;
addinst(tl, startinst, &sempty);
}
/* Execute machine until this list is empty */
for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
Switchstmt:
switch(inst->type){
default: /* regular character */
if(inst->type==c){
Addinst:
if(++nnl >= NLIST)
goto Overflow;
addinst(nl, inst->u1.next, &tlp->se);
}
break;
case LBRA:
if(inst->u.subid>=0)
tlp->se.r[inst->u.subid].q0 = p;
inst = inst->u1.next;
goto Switchstmt;
case RBRA:
if(inst->u.subid>=0)
tlp->se.r[inst->u.subid].q1 = p;
inst = inst->u1.next;
goto Switchstmt;
case ANY:
if(c!='\n')
goto Addinst;
break;
case BOL:
if(p==0 || (t!=nil && textreadc(t, p-1)=='\n') || (r!=nil && r[p-1]=='\n')){
Step:
inst = inst->u1.next;
goto Switchstmt;
}
break;
case EOL:
if(c == '\n')
goto Step;
break;
case CCLASS:
if(c>=0 && classmatch(inst->u.class, c, 0))
goto Addinst;
break;
case NCCLASS:
if(c>=0 && classmatch(inst->u.class, c, 1))
goto Addinst;
break;
case OR:
/* evaluate right choice later */
if(++ntl >= NLIST)
goto Overflow;
addinst(tlp, inst->u.right, &tlp->se);
/* efficiency: advance and re-evaluate */
inst = inst->u1.left;
goto Switchstmt;
case END: /* Match! */
tlp->se.r[0].q1 = p;
newmatch(&tlp->se);
break;
}
}
}
Return:
*rp = sel;
return sel.r[0].q0 >= 0;
}
void
newmatch(Rangeset *sp)
{
if(sel.r[0].q0<0 || sp->r[0].q0<sel.r[0].q0 ||
(sp->r[0].q0==sel.r[0].q0 && sp->r[0].q1>sel.r[0].q1))
sel = *sp;
}
int
rxbexecute(Text *t, uint startp, Rangeset *rp)
{
int flag;
Inst *inst;
Ilist *tlp;
int p;
int nnl, ntl;
int c;
int wrapped;
int startchar;
flag = 0;
nnl = 0;
wrapped = 0;
p = startp;
startchar = 0;
if(bstartinst->type<OPERATOR)
startchar = bstartinst->type;
list[0][0].inst = list[1][0].inst = nil;
sel.r[0].q0= -1;
/* Execute machine once for each character, including terminal NUL */
for(;;--p){
doloop:
if(p <= 0){
switch(wrapped++){
case 0: /* let loop run one more click */
case 2:
break;
case 1: /* expired; wrap to end */
if(sel.r[0].q0>=0)
goto Return;
list[0][0].inst = list[1][0].inst = nil;
p = t->file->b.nc;
goto doloop;
case 3:
default:
goto Return;
}
c = 0;
}else{
if(((wrapped && p<=startp) || sel.r[0].q0>0) && nnl==0)
break;
c = textreadc(t, p-1);
}
/* fast check for first char */
if(startchar && nnl==0 && c!=startchar)
continue;
tl = list[flag];
nl = list[flag^=1];
nl->inst = nil;
ntl = nnl;
nnl = 0;
if(sel.r[0].q0<0 && (!wrapped || p>startp)){
/* Add first instruction to this list */
if(++ntl >= NLIST){
Overflow:
warning(nil, "regexp list overflow\n");
sel.r[0].q0 = -1;
goto Return;
}
/* the minus is so the optimizations in addinst work */
sempty.r[0].q0 = -p;
addinst(tl, bstartinst, &sempty);
}
/* Execute machine until this list is empty */
for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
Switchstmt:
switch(inst->type){
default: /* regular character */
if(inst->type == c){
Addinst:
if(++nnl >= NLIST)
goto Overflow;
addinst(nl, inst->u1.next, &tlp->se);
}
break;
case LBRA:
if(inst->u.subid>=0)
tlp->se.r[inst->u.subid].q0 = p;
inst = inst->u1.next;
goto Switchstmt;
case RBRA:
if(inst->u.subid >= 0)
tlp->se.r[inst->u.subid].q1 = p;
inst = inst->u1.next;
goto Switchstmt;
case ANY:
if(c != '\n')
goto Addinst;
break;
case BOL:
if(c=='\n' || p==0){
Step:
inst = inst->u1.next;
goto Switchstmt;
}
break;
case EOL:
if(p<t->file->b.nc && textreadc(t, p)=='\n')
goto Step;
break;
case CCLASS:
if(c>0 && classmatch(inst->u.class, c, 0))
goto Addinst;
break;
case NCCLASS:
if(c>0 && classmatch(inst->u.class, c, 1))
goto Addinst;
break;
case OR:
/* evaluate right choice later */
if(++ntl >= NLIST)
goto Overflow;
addinst(tlp, inst->u.right, &tlp->se);
/* efficiency: advance and re-evaluate */
inst = inst->u1.left;
goto Switchstmt;
case END: /* Match! */
tlp->se.r[0].q0 = -tlp->se.r[0].q0; /* minus sign */
tlp->se.r[0].q1 = p;
bnewmatch(&tlp->se);
break;
}
}
}
Return:
*rp = sel;
return sel.r[0].q0 >= 0;
}
void
bnewmatch(Rangeset *sp)
{
int i;
if(sel.r[0].q0<0 || sp->r[0].q0>sel.r[0].q1 || (sp->r[0].q0==sel.r[0].q1 && sp->r[0].q1<sel.r[0].q0))
for(i = 0; i<NRange; i++){ /* note the reversal; q0<=q1 */
sel.r[i].q0 = sp->r[i].q1;
sel.r[i].q1 = sp->r[i].q0;
}
}