plan9port/src/libregexp/regcomp.c
Russ Cox a7511dd43d libregexp: fix ambiguous match selection
echo SYSSYSR1 | sed 's/SYS.+/sysr1/'

was producing SYSsysr1 instead of sysr1.
Bug was introduced during overflow cleanup earlier this year.

Also bring regexec.c and rregexec.c into sync again.
Also allocate large enough lists in the regexec2/rregexec2 case.
2007-12-07 15:32:45 -05:00

560 lines
9.5 KiB
C

#include "lib9.h"
#include "regexp9.h"
#include "regcomp.h"
#define TRUE 1
#define FALSE 0
/*
* Parser Information
*/
typedef
struct Node
{
Reinst* first;
Reinst* last;
}Node;
#define NSTACK 20
static Node andstack[NSTACK];
static Node *andp;
static int atorstack[NSTACK];
static int* atorp;
static int cursubid; /* id of current subexpression */
static int subidstack[NSTACK]; /* parallel to atorstack */
static int* subidp;
static int lastwasand; /* Last token was operand */
static int nbra;
static char* exprp; /* pointer to next character in source expression */
static int lexdone;
static int nclass;
static Reclass*classp;
static Reinst* freep;
static int errors;
static Rune yyrune; /* last lex'd rune */
static Reclass*yyclassp; /* last lex'd class */
/* predeclared crap */
static void operator(int);
static void pushand(Reinst*, Reinst*);
static void pushator(int);
static void evaluntil(int);
static int bldcclass(void);
static jmp_buf regkaboom;
static void
rcerror(char *s)
{
errors++;
regerror(s);
longjmp(regkaboom, 1);
}
static Reinst*
newinst(int t)
{
freep->type = t;
freep->u2.left = 0;
freep->u1.right = 0;
return freep++;
}
static void
operand(int t)
{
Reinst *i;
if(lastwasand)
operator(CAT); /* catenate is implicit */
i = newinst(t);
if(t == CCLASS || t == NCCLASS)
i->u1.cp = yyclassp;
if(t == RUNE)
i->u1.r = yyrune;
pushand(i, i);
lastwasand = TRUE;
}
static void
operator(int t)
{
if(t==RBRA && --nbra<0)
rcerror("unmatched right paren");
if(t==LBRA){
if(++cursubid >= NSUBEXP)
rcerror ("too many subexpressions");
nbra++;
if(lastwasand)
operator(CAT);
} else
evaluntil(t);
if(t != RBRA)
pushator(t);
lastwasand = FALSE;
if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
lastwasand = TRUE; /* these look like operands */
}
static void
regerr2(char *s, int c)
{
char buf[100];
char *cp = buf;
while(*s)
*cp++ = *s++;
*cp++ = c;
*cp = '\0';
rcerror(buf);
}
static void
cant(char *s)
{
char buf[100];
strcpy(buf, "can't happen: ");
strcat(buf, s);
rcerror(buf);
}
static void
pushand(Reinst *f, Reinst *l)
{
if(andp >= &andstack[NSTACK])
cant("operand stack overflow");
andp->first = f;
andp->last = l;
andp++;
}
static void
pushator(int t)
{
if(atorp >= &atorstack[NSTACK])
cant("operator stack overflow");
*atorp++ = t;
*subidp++ = cursubid;
}
static Node*
popand(int op)
{
Reinst *inst;
if(andp <= &andstack[0]){
regerr2("missing operand for ", op);
inst = newinst(NOP);
pushand(inst,inst);
}
return --andp;
}
static int
popator(void)
{
if(atorp <= &atorstack[0])
cant("operator stack underflow");
--subidp;
return *--atorp;
}
static void
evaluntil(int pri)
{
Node *op1, *op2;
Reinst *inst1, *inst2;
while(pri==RBRA || atorp[-1]>=pri){
switch(popator()){
default:
rcerror("unknown operator in evaluntil");
break;
case LBRA: /* must have been RBRA */
op1 = popand('(');
inst2 = newinst(RBRA);
inst2->u1.subid = *subidp;
op1->last->u2.next = inst2;
inst1 = newinst(LBRA);
inst1->u1.subid = *subidp;
inst1->u2.next = op1->first;
pushand(inst1, inst2);
return;
case OR:
op2 = popand('|');
op1 = popand('|');
inst2 = newinst(NOP);
op2->last->u2.next = inst2;
op1->last->u2.next = inst2;
inst1 = newinst(OR);
inst1->u1.right = op1->first;
inst1->u2.left = op2->first;
pushand(inst1, inst2);
break;
case CAT:
op2 = popand(0);
op1 = popand(0);
op1->last->u2.next = op2->first;
pushand(op1->first, op2->last);
break;
case STAR:
op2 = popand('*');
inst1 = newinst(OR);
op2->last->u2.next = inst1;
inst1->u1.right = op2->first;
pushand(inst1, inst1);
break;
case PLUS:
op2 = popand('+');
inst1 = newinst(OR);
op2->last->u2.next = inst1;
inst1->u1.right = op2->first;
pushand(op2->first, inst1);
break;
case QUEST:
op2 = popand('?');
inst1 = newinst(OR);
inst2 = newinst(NOP);
inst1->u2.left = inst2;
inst1->u1.right = op2->first;
op2->last->u2.next = inst2;
pushand(inst1, inst2);
break;
}
}
}
static Reprog*
optimize(Reprog *pp)
{
Reinst *inst, *target;
int size;
Reprog *npp;
Reclass *cl;
int diff, proglen;
/*
* get rid of NOOP chains
*/
for(inst=pp->firstinst; inst->type!=END; inst++){
target = inst->u2.next;
while(target->type == NOP)
target = target->u2.next;
inst->u2.next = target;
}
/*
* The original allocation is for an area larger than
* necessary. Reallocate to the actual space used
* and then relocate the code.
*/
proglen = freep - pp->firstinst;
size = sizeof(Reprog) + proglen*sizeof(Reinst);
npp = realloc(pp, size);
if(npp==0 || npp==pp){
pp->proglen = proglen;
return pp;
}
diff = (char *)npp - (char *)pp;
freep = (Reinst *)((char *)freep + diff);
for(inst=npp->firstinst; inst<freep; inst++){
switch(inst->type){
case OR:
case STAR:
case PLUS:
case QUEST:
*(char**)(void*)&inst->u1.right += diff;
break;
case CCLASS:
case NCCLASS:
*(char**)(void*)&inst->u1.right += diff;
cl = inst->u1.cp;
*(char**)(void*)&cl->end += diff;
break;
}
*(char**)(void*)&inst->u2.left += diff;
}
*(char**)(void*)&npp->startinst += diff;
npp->proglen = proglen;
return npp;
}
#ifdef DEBUG
static void
dumpstack(void){
Node *stk;
int *ip;
print("operators\n");
for(ip=atorstack; ip<atorp; ip++)
print("0%o\n", *ip);
print("operands\n");
for(stk=andstack; stk<andp; stk++)
print("0%o\t0%o\n", stk->first->type, stk->last->type);
}
static void
dump(Reprog *pp)
{
Reinst *l;
Rune *p;
l = pp->firstinst;
do{
print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
l->u2.left-pp->firstinst, l->u1.right-pp->firstinst);
if(l->type == RUNE)
print("\t%C\n", l->u1.r);
else if(l->type == CCLASS || l->type == NCCLASS){
print("\t[");
if(l->type == NCCLASS)
print("^");
for(p = l->u1.cp->spans; p < l->u1.cp->end; p += 2)
if(p[0] == p[1])
print("%C", p[0]);
else
print("%C-%C", p[0], p[1]);
print("]\n");
} else
print("\n");
}while(l++->type);
}
#endif
static Reclass*
newclass(void)
{
if(nclass >= NCLASS)
regerr2("too many character classes; limit", NCLASS+'0');
return &(classp[nclass++]);
}
static int
nextc(Rune *rp)
{
if(lexdone){
*rp = 0;
return 1;
}
exprp += chartorune(rp, exprp);
if(*rp == '\\'){
exprp += chartorune(rp, exprp);
return 1;
}
if(*rp == 0)
lexdone = 1;
return 0;
}
static int
lex(int literal, int dot_type)
{
int quoted;
quoted = nextc(&yyrune);
if(literal || quoted){
if(yyrune == 0)
return END;
return RUNE;
}
switch(yyrune){
case 0:
return END;
case '*':
return STAR;
case '?':
return QUEST;
case '+':
return PLUS;
case '|':
return OR;
case '.':
return dot_type;
case '(':
return LBRA;
case ')':
return RBRA;
case '^':
return BOL;
case '$':
return EOL;
case '[':
return bldcclass();
}
return RUNE;
}
static int
bldcclass(void)
{
int type;
Rune r[NCCRUNE];
Rune *p, *ep, *np;
Rune rune;
int quoted;
/* we have already seen the '[' */
type = CCLASS;
yyclassp = newclass();
/* look ahead for negation */
/* SPECIAL CASE!!! negated classes don't match \n */
ep = r;
quoted = nextc(&rune);
if(!quoted && rune == '^'){
type = NCCLASS;
quoted = nextc(&rune);
*ep++ = '\n';
*ep++ = '\n';
}
/* parse class into a set of spans */
for(; ep<&r[NCCRUNE];){
if(rune == 0){
rcerror("malformed '[]'");
return 0;
}
if(!quoted && rune == ']')
break;
if(!quoted && rune == '-'){
if(ep == r){
rcerror("malformed '[]'");
return 0;
}
quoted = nextc(&rune);
if((!quoted && rune == ']') || rune == 0){
rcerror("malformed '[]'");
return 0;
}
*(ep-1) = rune;
} else {
*ep++ = rune;
*ep++ = rune;
}
quoted = nextc(&rune);
}
/* sort on span start */
for(p = r; p < ep; p += 2){
for(np = p; np < ep; np += 2)
if(*np < *p){
rune = np[0];
np[0] = p[0];
p[0] = rune;
rune = np[1];
np[1] = p[1];
p[1] = rune;
}
}
/* merge spans */
np = yyclassp->spans;
p = r;
if(r == ep)
yyclassp->end = np;
else {
np[0] = *p++;
np[1] = *p++;
for(; p < ep; p += 2)
if(p[0] <= np[1]){
if(p[1] > np[1])
np[1] = p[1];
} else {
np += 2;
np[0] = p[0];
np[1] = p[1];
}
yyclassp->end = np+2;
}
return type;
}
static Reprog*
regcomp1(char *s, int literal, int dot_type)
{
int token;
Reprog *volatile pp;
/* get memory for the program */
pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
if(pp == 0){
regerror("out of memory");
return 0;
}
freep = pp->firstinst;
classp = pp->class;
errors = 0;
if(setjmp(regkaboom))
goto out;
/* go compile the sucker */
lexdone = 0;
exprp = s;
nclass = 0;
nbra = 0;
atorp = atorstack;
andp = andstack;
subidp = subidstack;
lastwasand = FALSE;
cursubid = 0;
/* Start with a low priority operator to prime parser */
pushator(START-1);
while((token = lex(literal, dot_type)) != END){
if((token&0300) == OPERATOR)
operator(token);
else
operand(token);
}
/* Close with a low priority operator */
evaluntil(START);
/* Force END */
operand(END);
evaluntil(START);
#ifdef DEBUG
dumpstack();
#endif
if(nbra)
rcerror("unmatched left paren");
--andp; /* points to first and only operand */
pp->startinst = andp->first;
#ifdef DEBUG
dump(pp);
#endif
pp = optimize(pp);
#ifdef DEBUG
print("start: %d\n", andp->first-pp->firstinst);
dump(pp);
#endif
out:
if(errors){
free(pp);
pp = 0;
}
return pp;
}
extern Reprog*
regcomp(char *s)
{
return regcomp1(s, 0, ANY);
}
extern Reprog*
regcomplit(char *s)
{
return regcomp1(s, 1, ANY);
}
extern Reprog*
regcompnl(char *s)
{
return regcomp1(s, 0, ANYNL);
}