echo SYSSYSR1 | sed 's/SYS.+/sysr1/' was producing SYSsysr1 instead of sysr1. Bug was introduced during overflow cleanup earlier this year. Also bring regexec.c and rregexec.c into sync again. Also allocate large enough lists in the regexec2/rregexec2 case.
560 lines
9.5 KiB
C
560 lines
9.5 KiB
C
#include "lib9.h"
|
|
#include "regexp9.h"
|
|
#include "regcomp.h"
|
|
|
|
#define TRUE 1
|
|
#define FALSE 0
|
|
|
|
/*
|
|
* Parser Information
|
|
*/
|
|
typedef
|
|
struct Node
|
|
{
|
|
Reinst* first;
|
|
Reinst* last;
|
|
}Node;
|
|
|
|
#define NSTACK 20
|
|
static Node andstack[NSTACK];
|
|
static Node *andp;
|
|
static int atorstack[NSTACK];
|
|
static int* atorp;
|
|
static int cursubid; /* id of current subexpression */
|
|
static int subidstack[NSTACK]; /* parallel to atorstack */
|
|
static int* subidp;
|
|
static int lastwasand; /* Last token was operand */
|
|
static int nbra;
|
|
static char* exprp; /* pointer to next character in source expression */
|
|
static int lexdone;
|
|
static int nclass;
|
|
static Reclass*classp;
|
|
static Reinst* freep;
|
|
static int errors;
|
|
static Rune yyrune; /* last lex'd rune */
|
|
static Reclass*yyclassp; /* last lex'd class */
|
|
|
|
/* predeclared crap */
|
|
static void operator(int);
|
|
static void pushand(Reinst*, Reinst*);
|
|
static void pushator(int);
|
|
static void evaluntil(int);
|
|
static int bldcclass(void);
|
|
|
|
static jmp_buf regkaboom;
|
|
|
|
static void
|
|
rcerror(char *s)
|
|
{
|
|
errors++;
|
|
regerror(s);
|
|
longjmp(regkaboom, 1);
|
|
}
|
|
|
|
static Reinst*
|
|
newinst(int t)
|
|
{
|
|
freep->type = t;
|
|
freep->u2.left = 0;
|
|
freep->u1.right = 0;
|
|
return freep++;
|
|
}
|
|
|
|
static void
|
|
operand(int t)
|
|
{
|
|
Reinst *i;
|
|
|
|
if(lastwasand)
|
|
operator(CAT); /* catenate is implicit */
|
|
i = newinst(t);
|
|
|
|
if(t == CCLASS || t == NCCLASS)
|
|
i->u1.cp = yyclassp;
|
|
if(t == RUNE)
|
|
i->u1.r = yyrune;
|
|
|
|
pushand(i, i);
|
|
lastwasand = TRUE;
|
|
}
|
|
|
|
static void
|
|
operator(int t)
|
|
{
|
|
if(t==RBRA && --nbra<0)
|
|
rcerror("unmatched right paren");
|
|
if(t==LBRA){
|
|
if(++cursubid >= NSUBEXP)
|
|
rcerror ("too many subexpressions");
|
|
nbra++;
|
|
if(lastwasand)
|
|
operator(CAT);
|
|
} else
|
|
evaluntil(t);
|
|
if(t != RBRA)
|
|
pushator(t);
|
|
lastwasand = FALSE;
|
|
if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
|
|
lastwasand = TRUE; /* these look like operands */
|
|
}
|
|
|
|
/*
 * Report the error message s followed by the single character c.
 * The text is assembled in a fixed-size buffer; s is truncated if
 * necessary so that c and the terminating NUL always fit.
 * Does not return: rcerror() longjmps out of the compilation.
 */
static void
regerr2(char *s, int c)
{
	char buf[100];
	char *cp, *limit;

	cp = buf;
	limit = buf + sizeof buf - 2;	/* leave room for c and the NUL */
	while(*s != '\0' && cp < limit)
		*cp++ = *s++;
	*cp++ = c;
	*cp = '\0';
	rcerror(buf);
}
|
|
|
|
/*
 * Report an internal inconsistency: the message is "can't happen: "
 * followed by s, truncated if necessary to fit the local buffer.
 * Does not return: rcerror() longjmps out of the compilation.
 */
static void
cant(char *s)
{
	char buf[100];

	strcpy(buf, "can't happen: ");
	/* strncat appends at most the given count and always NUL-terminates */
	strncat(buf, s, sizeof buf - strlen(buf) - 1);
	rcerror(buf);
}
|
|
|
|
static void
|
|
pushand(Reinst *f, Reinst *l)
|
|
{
|
|
if(andp >= &andstack[NSTACK])
|
|
cant("operand stack overflow");
|
|
andp->first = f;
|
|
andp->last = l;
|
|
andp++;
|
|
}
|
|
|
|
static void
|
|
pushator(int t)
|
|
{
|
|
if(atorp >= &atorstack[NSTACK])
|
|
cant("operator stack overflow");
|
|
*atorp++ = t;
|
|
*subidp++ = cursubid;
|
|
}
|
|
|
|
static Node*
|
|
popand(int op)
|
|
{
|
|
Reinst *inst;
|
|
|
|
if(andp <= &andstack[0]){
|
|
regerr2("missing operand for ", op);
|
|
inst = newinst(NOP);
|
|
pushand(inst,inst);
|
|
}
|
|
return --andp;
|
|
}
|
|
|
|
static int
|
|
popator(void)
|
|
{
|
|
if(atorp <= &atorstack[0])
|
|
cant("operator stack underflow");
|
|
--subidp;
|
|
return *--atorp;
|
|
}
|
|
|
|
static void
|
|
evaluntil(int pri)
|
|
{
|
|
Node *op1, *op2;
|
|
Reinst *inst1, *inst2;
|
|
|
|
while(pri==RBRA || atorp[-1]>=pri){
|
|
switch(popator()){
|
|
default:
|
|
rcerror("unknown operator in evaluntil");
|
|
break;
|
|
case LBRA: /* must have been RBRA */
|
|
op1 = popand('(');
|
|
inst2 = newinst(RBRA);
|
|
inst2->u1.subid = *subidp;
|
|
op1->last->u2.next = inst2;
|
|
inst1 = newinst(LBRA);
|
|
inst1->u1.subid = *subidp;
|
|
inst1->u2.next = op1->first;
|
|
pushand(inst1, inst2);
|
|
return;
|
|
case OR:
|
|
op2 = popand('|');
|
|
op1 = popand('|');
|
|
inst2 = newinst(NOP);
|
|
op2->last->u2.next = inst2;
|
|
op1->last->u2.next = inst2;
|
|
inst1 = newinst(OR);
|
|
inst1->u1.right = op1->first;
|
|
inst1->u2.left = op2->first;
|
|
pushand(inst1, inst2);
|
|
break;
|
|
case CAT:
|
|
op2 = popand(0);
|
|
op1 = popand(0);
|
|
op1->last->u2.next = op2->first;
|
|
pushand(op1->first, op2->last);
|
|
break;
|
|
case STAR:
|
|
op2 = popand('*');
|
|
inst1 = newinst(OR);
|
|
op2->last->u2.next = inst1;
|
|
inst1->u1.right = op2->first;
|
|
pushand(inst1, inst1);
|
|
break;
|
|
case PLUS:
|
|
op2 = popand('+');
|
|
inst1 = newinst(OR);
|
|
op2->last->u2.next = inst1;
|
|
inst1->u1.right = op2->first;
|
|
pushand(op2->first, inst1);
|
|
break;
|
|
case QUEST:
|
|
op2 = popand('?');
|
|
inst1 = newinst(OR);
|
|
inst2 = newinst(NOP);
|
|
inst1->u2.left = inst2;
|
|
inst1->u1.right = op2->first;
|
|
op2->last->u2.next = inst2;
|
|
pushand(inst1, inst2);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
 * Translate pointer p, which pointed into the block that started at
 * oldbase, to the same offset inside the block starting at newbase.
 * A null pointer stays null.
 */
static void*
relocate(void *p, char *oldbase, char *newbase)
{
	if(p == 0)
		return 0;
	return newbase + ((char*)p - oldbase);
}

/*
 * Post-process a freshly compiled program.
 *
 * First every instruction's u2.next link is made to skip over chains
 * of NOP instructions.  Then the program is shrunk with realloc() to
 * the space actually used; if that moves the block, every pointer
 * stored inside it is relocated.
 *
 * Pointers are relocated by their offset within the block rather than
 * by a stored pointer delta: the delta between two separate
 * allocations need not fit in an int.
 *
 * Returns the (possibly moved) program with proglen filled in.  If
 * realloc() fails or leaves the block in place, pp itself is returned.
 */
static Reprog*
optimize(Reprog *pp)
{
	Reinst *inst, *target;
	size_t size;
	Reprog *npp;
	Reclass *cl;
	char *oldbase, *newbase;
	int proglen;

	/*
	 *  get rid of NOOP chains
	 */
	for(inst=pp->firstinst; inst->type!=END; inst++){
		target = inst->u2.next;
		while(target->type == NOP)
			target = target->u2.next;
		inst->u2.next = target;
	}

	/*
	 *  The original allocation is for an area larger than
	 *  necessary.  Reallocate to the actual space used
	 *  and then relocate the code.
	 */
	proglen = freep - pp->firstinst;
	size = sizeof(Reprog) + proglen*sizeof(Reinst);
	npp = realloc(pp, size);
	if(npp==0 || npp==pp){
		pp->proglen = proglen;
		return pp;
	}

	oldbase = (char*)pp;
	newbase = (char*)npp;
	freep = relocate(freep, oldbase, newbase);
	for(inst=npp->firstinst; inst<freep; inst++){
		switch(inst->type){
		case OR:
		case STAR:
		case PLUS:
		case QUEST:
			inst->u1.right = relocate(inst->u1.right, oldbase, newbase);
			break;
		case CCLASS:
		case NCCLASS:
			/* the class lives inside the program block, and so does its end */
			cl = relocate(inst->u1.cp, oldbase, newbase);
			inst->u1.cp = cl;
			cl->end = relocate(cl->end, oldbase, newbase);
			break;
		default:
			break;
		}
		inst->u2.left = relocate(inst->u2.left, oldbase, newbase);
	}
	npp->startinst = relocate(npp->startinst, oldbase, newbase);
	npp->proglen = proglen;
	return npp;
}
|
|
|
|
#ifdef DEBUG
|
|
static void
|
|
dumpstack(void){
|
|
Node *stk;
|
|
int *ip;
|
|
|
|
print("operators\n");
|
|
for(ip=atorstack; ip<atorp; ip++)
|
|
print("0%o\n", *ip);
|
|
print("operands\n");
|
|
for(stk=andstack; stk<andp; stk++)
|
|
print("0%o\t0%o\n", stk->first->type, stk->last->type);
|
|
}
|
|
|
|
static void
|
|
dump(Reprog *pp)
|
|
{
|
|
Reinst *l;
|
|
Rune *p;
|
|
|
|
l = pp->firstinst;
|
|
do{
|
|
print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
|
|
l->u2.left-pp->firstinst, l->u1.right-pp->firstinst);
|
|
if(l->type == RUNE)
|
|
print("\t%C\n", l->u1.r);
|
|
else if(l->type == CCLASS || l->type == NCCLASS){
|
|
print("\t[");
|
|
if(l->type == NCCLASS)
|
|
print("^");
|
|
for(p = l->u1.cp->spans; p < l->u1.cp->end; p += 2)
|
|
if(p[0] == p[1])
|
|
print("%C", p[0]);
|
|
else
|
|
print("%C-%C", p[0], p[1]);
|
|
print("]\n");
|
|
} else
|
|
print("\n");
|
|
}while(l++->type);
|
|
}
|
|
#endif
|
|
|
|
static Reclass*
|
|
newclass(void)
|
|
{
|
|
if(nclass >= NCLASS)
|
|
regerr2("too many character classes; limit", NCLASS+'0');
|
|
return &(classp[nclass++]);
|
|
}
|
|
|
|
static int
|
|
nextc(Rune *rp)
|
|
{
|
|
if(lexdone){
|
|
*rp = 0;
|
|
return 1;
|
|
}
|
|
exprp += chartorune(rp, exprp);
|
|
if(*rp == '\\'){
|
|
exprp += chartorune(rp, exprp);
|
|
return 1;
|
|
}
|
|
if(*rp == 0)
|
|
lexdone = 1;
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
lex(int literal, int dot_type)
|
|
{
|
|
int quoted;
|
|
|
|
quoted = nextc(&yyrune);
|
|
if(literal || quoted){
|
|
if(yyrune == 0)
|
|
return END;
|
|
return RUNE;
|
|
}
|
|
|
|
switch(yyrune){
|
|
case 0:
|
|
return END;
|
|
case '*':
|
|
return STAR;
|
|
case '?':
|
|
return QUEST;
|
|
case '+':
|
|
return PLUS;
|
|
case '|':
|
|
return OR;
|
|
case '.':
|
|
return dot_type;
|
|
case '(':
|
|
return LBRA;
|
|
case ')':
|
|
return RBRA;
|
|
case '^':
|
|
return BOL;
|
|
case '$':
|
|
return EOL;
|
|
case '[':
|
|
return bldcclass();
|
|
}
|
|
return RUNE;
|
|
}
|
|
|
|
static int
|
|
bldcclass(void)
|
|
{
|
|
int type;
|
|
Rune r[NCCRUNE];
|
|
Rune *p, *ep, *np;
|
|
Rune rune;
|
|
int quoted;
|
|
|
|
/* we have already seen the '[' */
|
|
type = CCLASS;
|
|
yyclassp = newclass();
|
|
|
|
/* look ahead for negation */
|
|
/* SPECIAL CASE!!! negated classes don't match \n */
|
|
ep = r;
|
|
quoted = nextc(&rune);
|
|
if(!quoted && rune == '^'){
|
|
type = NCCLASS;
|
|
quoted = nextc(&rune);
|
|
*ep++ = '\n';
|
|
*ep++ = '\n';
|
|
}
|
|
|
|
/* parse class into a set of spans */
|
|
for(; ep<&r[NCCRUNE];){
|
|
if(rune == 0){
|
|
rcerror("malformed '[]'");
|
|
return 0;
|
|
}
|
|
if(!quoted && rune == ']')
|
|
break;
|
|
if(!quoted && rune == '-'){
|
|
if(ep == r){
|
|
rcerror("malformed '[]'");
|
|
return 0;
|
|
}
|
|
quoted = nextc(&rune);
|
|
if((!quoted && rune == ']') || rune == 0){
|
|
rcerror("malformed '[]'");
|
|
return 0;
|
|
}
|
|
*(ep-1) = rune;
|
|
} else {
|
|
*ep++ = rune;
|
|
*ep++ = rune;
|
|
}
|
|
quoted = nextc(&rune);
|
|
}
|
|
|
|
/* sort on span start */
|
|
for(p = r; p < ep; p += 2){
|
|
for(np = p; np < ep; np += 2)
|
|
if(*np < *p){
|
|
rune = np[0];
|
|
np[0] = p[0];
|
|
p[0] = rune;
|
|
rune = np[1];
|
|
np[1] = p[1];
|
|
p[1] = rune;
|
|
}
|
|
}
|
|
|
|
/* merge spans */
|
|
np = yyclassp->spans;
|
|
p = r;
|
|
if(r == ep)
|
|
yyclassp->end = np;
|
|
else {
|
|
np[0] = *p++;
|
|
np[1] = *p++;
|
|
for(; p < ep; p += 2)
|
|
if(p[0] <= np[1]){
|
|
if(p[1] > np[1])
|
|
np[1] = p[1];
|
|
} else {
|
|
np += 2;
|
|
np[0] = p[0];
|
|
np[1] = p[1];
|
|
}
|
|
yyclassp->end = np+2;
|
|
}
|
|
|
|
return type;
|
|
}
|
|
|
|
static Reprog*
|
|
regcomp1(char *s, int literal, int dot_type)
|
|
{
|
|
int token;
|
|
Reprog *volatile pp;
|
|
|
|
/* get memory for the program */
|
|
pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
|
|
if(pp == 0){
|
|
regerror("out of memory");
|
|
return 0;
|
|
}
|
|
freep = pp->firstinst;
|
|
classp = pp->class;
|
|
errors = 0;
|
|
|
|
if(setjmp(regkaboom))
|
|
goto out;
|
|
|
|
/* go compile the sucker */
|
|
lexdone = 0;
|
|
exprp = s;
|
|
nclass = 0;
|
|
nbra = 0;
|
|
atorp = atorstack;
|
|
andp = andstack;
|
|
subidp = subidstack;
|
|
lastwasand = FALSE;
|
|
cursubid = 0;
|
|
|
|
/* Start with a low priority operator to prime parser */
|
|
pushator(START-1);
|
|
while((token = lex(literal, dot_type)) != END){
|
|
if((token&0300) == OPERATOR)
|
|
operator(token);
|
|
else
|
|
operand(token);
|
|
}
|
|
|
|
/* Close with a low priority operator */
|
|
evaluntil(START);
|
|
|
|
/* Force END */
|
|
operand(END);
|
|
evaluntil(START);
|
|
#ifdef DEBUG
|
|
dumpstack();
|
|
#endif
|
|
if(nbra)
|
|
rcerror("unmatched left paren");
|
|
--andp; /* points to first and only operand */
|
|
pp->startinst = andp->first;
|
|
#ifdef DEBUG
|
|
dump(pp);
|
|
#endif
|
|
pp = optimize(pp);
|
|
#ifdef DEBUG
|
|
print("start: %d\n", andp->first-pp->firstinst);
|
|
dump(pp);
|
|
#endif
|
|
out:
|
|
if(errors){
|
|
free(pp);
|
|
pp = 0;
|
|
}
|
|
return pp;
|
|
}
|
|
|
|
extern Reprog*
|
|
regcomp(char *s)
|
|
{
|
|
return regcomp1(s, 0, ANY);
|
|
}
|
|
|
|
extern Reprog*
|
|
regcomplit(char *s)
|
|
{
|
|
return regcomp1(s, 1, ANY);
|
|
}
|
|
|
|
extern Reprog*
|
|
regcompnl(char *s)
|
|
{
|
|
return regcomp1(s, 0, ANYNL);
|
|
}
|