--- sam/regexp.c 2016-03-01 16:52:35.085509675 +0000 +++ acme/regx.c 2016-03-01 16:52:34.861508409 +0000 @@ -1,47 +1,52 @@ -#include "sam.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dat.h" +#include "fns.h" Rangeset sel; -String lastregexp; +Rune *lastregexp; + /* * Machine Information */ typedef struct Inst Inst; - struct Inst { - long type; /* < 0x10000 ==> literal, otherwise action */ + uint type; /* < 0x10000 ==> literal, otherwise action */ union { - int rsid; - int rsubid; + int sid; + int subid; int class; - struct Inst *rother; - struct Inst *rright; - } r; + Inst *other; + Inst *right; + }; union{ - struct Inst *lleft; - struct Inst *lnext; - } l; + Inst *left; + Inst *next; + }; }; -#define sid r.rsid -#define subid r.rsubid -#define rclass r.class -#define other r.rother -#define right r.rright -#define left l.lleft -#define next l.lnext #define NPROG 1024 Inst program[NPROG]; Inst *progp; Inst *startinst; /* First inst. of program; might not be program[0] */ Inst *bstartinst; /* same for backwards machine */ +Channel *rechan; /* chan(Inst*) */ typedef struct Ilist Ilist; struct Ilist { Inst *inst; /* Instruction of the thread */ Rangeset se; - Posn startp; /* first char of match */ + uint startp; /* first char of match */ }; #define NLIST 127 @@ -120,35 +125,40 @@ void bldcclass(void); void -regerror(Err e) +rxinit(void) { - Strzero(&lastregexp); - error(e); + rechan = chancreate(sizeof(Inst*), 0); + lastregexp = runemalloc(1); } void -regerror_c(Err e, int c) +regerror(char *e) { - Strzero(&lastregexp); - error_c(e, c); + lastregexp[0] = 0; + warning(nil, "regexp: %s\n", e); + sendp(rechan, nil); + threadexits(nil); } Inst * newinst(int t) { if(progp >= &program[NPROG]) - regerror(Etoolong); + regerror("expression too long"); progp->type = t; - progp->left = 0; - progp->right = 0; + progp->left = nil; + progp->right = nil; return progp++; } -Inst * -realcompile(Rune *s) +void +realcompile(void *arg) { int token; + Rune *s; + threadsetname("regcomp"); + s = arg; startlex(s); atorp = atorstack; andp = andstack; @@ -169,31 +179,44 @@ operand(END); evaluntil(START); if(nbra) - regerror(Eleftpar); + regerror("unmatched `('"); --andp; /* points to first and only operand */ - return andp->first; + sendp(rechan, andp->first); + threadexits(nil); } -void -compile(String *s) +/* r is null terminated */ +int +rxcompile(Rune *r) { - int i; + int i, nr; Inst *oprogp; - if(Strcmp(s, &lastregexp)==0) - return; + nr = runestrlen(r)+1; + if(runeeq(lastregexp, runestrlen(lastregexp)+1, r, nr)==TRUE) + return TRUE; + lastregexp[0] = 0; for(i=0; is); + bstartinst = nil; + threadcreate(realcompile, r, STACK); + startinst = recvp(rechan); + if(startinst == nil) + return FALSE; optimize(program); oprogp = progp; backwards = TRUE; - bstartinst = realcompile(s->s); + threadcreate(realcompile, r, STACK); + bstartinst = recvp(rechan); + if(bstartinst == nil) + return FALSE; optimize(oprogp); - Strduplstr(&lastregexp, s); + lastregexp = runerealloc(lastregexp, nr); + runemove(lastregexp, r, nr); + return TRUE; } void @@ -206,7 +229,7 @@ if(t == CCLASS){ if(negateclass) i->type = NCCLASS; /* UGH */ - i->rclass = nclass-1; /* UGH */ + i->class = nclass-1; /* UGH */ } pushand(i, i); lastwasand = TRUE; @@ -216,12 +239,8 @@ operator(int t) { if(t==RBRA && --nbra<0) - regerror(Erightpar); + regerror("unmatched `)'"); if(t==LBRA){ -/* - * if(++cursubid >= NSUBEXP) - * regerror(Esubexp); - */ cursubid++; /* silently ignored */ nbra++; if(lastwasand) @@ -236,19 +255,10 @@ } void -cant(char *s) -{ - char buf[100]; - - sprint(buf, "regexp: can't happen: %s", s); - panic(buf); -} - -void pushand(Inst *f, Inst *l) { if(andp >= &andstack[NSTACK]) - cant("operand stack overflow"); + error("operand stack overflow"); andp->first = f; andp->last = l; andp++; @@ -258,9 +268,9 @@ pushator(int t) { if(atorp >= &atorstack[NSTACK]) - cant("operator stack overflow"); + error("operator stack overflow"); *atorp++=t; - if(cursubid >= NSUBEXP) + if(cursubid >= NRange) *subidp++= -1; else *subidp++=cursubid; @@ -269,19 +279,22 @@ Node * popand(int op) { + char buf[64]; + if(andp <= &andstack[0]) - if(op) - regerror_c(Emissop, op); - else - regerror(Ebadregexp); + if(op){ + sprint(buf, "missing operand for %c", op); + regerror(buf); + }else + regerror("malformed regexp"); return --andp; } int -popator(void) +popator() { if(atorp <= &atorstack[0]) - cant("operator stack underflow"); + error("operator stack underflow"); --subidp; return *--atorp; } @@ -305,7 +318,7 @@ pushand(inst1, inst2); return; /* must have been RBRA */ default: - panic("unknown regexp operator"); + error("unknown regexp operator"); break; case OR: op2 = popand('|'); @@ -321,8 +334,11 @@ case CAT: op2 = popand(0); op1 = popand(0); - if(backwards && op2->first->type!=END) - t = op1, op1 = op2, op2 = t; + if(backwards && op2->first->type!=END){ + t = op1; + op1 = op2; + op2 = t; + } op1->last->next = op2->first; pushand(op1->first, op2->last); break; @@ -367,31 +383,6 @@ } } -#ifdef DEBUG -void -dumpstack(void){ - Node *stk; - int *ip; - - dprint("operators\n"); - for(ip = atorstack; ipfirst->type, stk->last->type); -} -void -dump(void){ - Inst *l; - - l = program; - do{ - dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type, - l->left-program, l->right-program); - }while(l++->type); -} -#endif - void startlex(Rune *s) { @@ -402,8 +393,9 @@ int lex(void){ - int c= *exprp++; + int c; + c = *exprp++; switch(c){ case '\\': if(*exprp) @@ -449,10 +441,11 @@ return c; } -long -nextrec(void){ +int +nextrec(void) +{ if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0)) - regerror(Ebadclass); + regerror("malformed `[]'"); if(exprp[0] == '\\'){ exprp++; if(*exprp=='n'){ @@ -467,10 +460,10 @@ void bldcclass(void) { - long c1, c2, n, na; + int c1, c2, n, na; Rune *classp; - classp = emalloc(DCLASS*RUNESIZE); + classp = runemalloc(DCLASS); n = 0; na = DCLASS; /* we have already seen the '[' */ @@ -484,11 +477,11 @@ if(c1 == '-'){ Error: free(classp); - regerror(Ebadclass); + regerror("malformed `[]'"); } if(n+4 >= na){ /* 3 runes plus NUL */ na += DCLASS; - classp = erealloc(classp, na*RUNESIZE); + classp = runerealloc(classp, na); } if(*exprp == '-'){ exprp++; /* eat '-' */ @@ -504,7 +497,7 @@ classp[n] = 0; if(nclass == Nclass){ Nclass += DCLASS; - class = erealloc(class, Nclass*sizeof(Rune*)); + class = realloc(class, Nclass*sizeof(Rune*)); } class[nclass++] = classp; } @@ -538,66 +531,93 @@ for(p = l; p->inst; p++){ if(p->inst==inst){ - if((sep)->p[0].p1 < p->se.p[0].p1) + if((sep)->r[0].q0 < p->se.r[0].q0) p->se= *sep; /* this would be bug */ return 0; /* It's already there */ } } p->inst = inst; p->se= *sep; - (p+1)->inst = 0; + (p+1)->inst = nil; return 1; } int -execute(File *f, Posn startp, Posn eof) +rxnull(void) +{ + return startinst==nil || bstartinst==nil; +} + +/* either t!=nil or r!=nil, and we match the string in the appropriate place */ +int +rxexecute(Text *t, Rune *r, uint startp, uint eof, Rangeset *rp) { - int flag = 0; + int flag; Inst *inst; Ilist *tlp; - Posn p = startp; - int nnl = 0, ntl; - int c; - int wrapped = 0; - int startchar = startinst->typetype : 0; - - list[0][0].inst = list[1][0].inst = 0; - sel.p[0].p1 = -1; + uint p; + int nnl, ntl; + int nc, c; + int wrapped; + int startchar; + + flag = 0; + p = startp; + startchar = 0; + wrapped = 0; + nnl = 0; + if(startinst->typetype; + list[0][0].inst = list[1][0].inst = nil; + sel.r[0].q0 = -1; + if(t != nil) + nc = t->file->nc; + else + nc = runestrlen(r); /* Execute machine once for each character */ for(;;p++){ doloop: - c = filereadc(f, p); - if(p>=eof || c<0){ + if(p>=eof || p>=nc){ switch(wrapped++){ case 0: /* let loop run one more click */ case 2: break; case 1: /* expired; wrap to beginning */ - if(sel.p[0].p1>=0 || eof!=INFINITY) + if(sel.r[0].q0>=0 || eof!=Infinity) goto Return; - list[0][0].inst = list[1][0].inst = 0; + list[0][0].inst = list[1][0].inst = nil; p = 0; goto doloop; default: goto Return; } - }else if(((wrapped && p>=startp) || sel.p[0].p1>0) && nnl==0) - break; + c = 0; + }else{ + if(((wrapped && p>=startp) || sel.r[0].q0>0) && nnl==0) + break; + if(t != nil) + c = textreadc(t, p); + else + c = r[p]; + } /* fast check for first char */ if(startchar && nnl==0 && c!=startchar) continue; tl = list[flag]; nl = list[flag^=1]; - nl->inst = 0; + nl->inst = nil; ntl = nnl; nnl = 0; - if(sel.p[0].p1<0 && (!wrapped || p= NLIST) + if(++ntl >= NLIST){ Overflow: - error(Eoverflow); + warning(nil, "regexp list overflow\n"); + sel.r[0].q0 = -1; + goto Return; + } } /* Execute machine until this list is empty */ for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */ @@ -613,12 +633,12 @@ break; case LBRA: if(inst->subid>=0) - tlp->se.p[inst->subid].p1 = p; + tlp->se.r[inst->subid].q0 = p; inst = inst->next; goto Switchstmt; case RBRA: if(inst->subid>=0) - tlp->se.p[inst->subid].p2 = p; + tlp->se.r[inst->subid].q1 = p; inst = inst->next; goto Switchstmt; case ANY: @@ -626,7 +646,7 @@ goto Addinst; break; case BOL: - if(p==0 || filereadc(f, p - 1)=='\n'){ + if(p==0 || (t!=nil && textreadc(t, p-1)=='\n') || (r!=nil && r[p-1]=='\n')){ Step: inst = inst->next; goto Switchstmt; @@ -637,11 +657,11 @@ goto Step; break; case CCLASS: - if(c>=0 && classmatch(inst->rclass, c, 0)) + if(c>=0 && classmatch(inst->class, c, 0)) goto Addinst; break; case NCCLASS: - if(c>=0 && classmatch(inst->rclass, c, 1)) + if(c>=0 && classmatch(inst->class, c, 1)) goto Addinst; break; case OR: @@ -653,77 +673,89 @@ inst = inst->left; goto Switchstmt; case END: /* Match! */ - tlp->se.p[0].p2 = p; + tlp->se.r[0].q1 = p; newmatch(&tlp->se); break; } } } Return: - return sel.p[0].p1>=0; + *rp = sel; + return sel.r[0].q0 >= 0; } void newmatch(Rangeset *sp) { - int i; - - if(sel.p[0].p1<0 || sp->p[0].p1p[0].p1==sel.p[0].p1 && sp->p[0].p2>sel.p[0].p2)) - for(i = 0; ip[i]; + if(sel.r[0].q0<0 || sp->r[0].q0r[0].q0==sel.r[0].q0 && sp->r[0].q1>sel.r[0].q1)) + sel = *sp; } int -bexecute(File *f, Posn startp) +rxbexecute(Text *t, uint startp, Rangeset *rp) { - int flag = 0; + int flag; Inst *inst; Ilist *tlp; - Posn p = startp; - int nnl = 0, ntl; + int p; + int nnl, ntl; int c; - int wrapped = 0; - int startchar = bstartinst->typetype : 0; + int wrapped; + int startchar; - list[0][0].inst = list[1][0].inst = 0; - sel.p[0].p1= -1; + flag = 0; + nnl = 0; + wrapped = 0; + p = startp; + startchar = 0; + if(bstartinst->typetype; + list[0][0].inst = list[1][0].inst = nil; + sel.r[0].q0= -1; /* Execute machine once for each character, including terminal NUL */ for(;;--p){ doloop: - if((c = filereadc(f, p - 1))==-1){ + if(p <= 0){ switch(wrapped++){ case 0: /* let loop run one more click */ case 2: break; case 1: /* expired; wrap to end */ - if(sel.p[0].p1>=0) - case 3: + if(sel.r[0].q0>=0) goto Return; - list[0][0].inst = list[1][0].inst = 0; - p = f->nc; + list[0][0].inst = list[1][0].inst = nil; + p = t->file->nc; goto doloop; + case 3: default: goto Return; } - }else if(((wrapped && p<=startp) || sel.p[0].p1>0) && nnl==0) - break; + c = 0; + }else{ + if(((wrapped && p<=startp) || sel.r[0].q0>0) && nnl==0) + break; + c = textreadc(t, p-1); + } /* fast check for first char */ if(startchar && nnl==0 && c!=startchar) continue; tl = list[flag]; nl = list[flag^=1]; - nl->inst = 0; + nl->inst = nil; ntl = nnl; nnl = 0; - if(sel.p[0].p1<0 && (!wrapped || p>startp)){ + if(sel.r[0].q0<0 && (!wrapped || p>startp)){ /* Add first instruction to this list */ /* the minus is so the optimizations in addinst work */ - sempty.p[0].p1 = -p; + sempty.r[0].q0 = -p; if(addinst(tl, bstartinst, &sempty)) - if(++ntl >= NLIST) + if(++ntl >= NLIST){ Overflow: - error(Eoverflow); + warning(nil, "regexp list overflow\n"); + sel.r[0].q0 = -1; + goto Return; + } } /* Execute machine until this list is empty */ for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */ @@ -739,12 +771,12 @@ break; case LBRA: if(inst->subid>=0) - tlp->se.p[inst->subid].p1 = p; + tlp->se.r[inst->subid].q0 = p; inst = inst->next; goto Switchstmt; case RBRA: if(inst->subid >= 0) - tlp->se.p[inst->subid].p2 = p; + tlp->se.r[inst->subid].q1 = p; inst = inst->next; goto Switchstmt; case ANY: @@ -759,15 +791,15 @@ } break; case EOL: - if(p==f->nc || filereadc(f, p)=='\n') + if(pfile->nc && textreadc(t, p)=='\n') goto Step; break; case CCLASS: - if(c>=0 && classmatch(inst->rclass, c, 0)) + if(c>0 && classmatch(inst->class, c, 0)) goto Addinst; break; case NCCLASS: - if(c>=0 && classmatch(inst->rclass, c, 1)) + if(c>0 && classmatch(inst->class, c, 1)) goto Addinst; break; case OR: @@ -779,24 +811,26 @@ inst = inst->left; goto Switchstmt; case END: /* Match! */ - tlp->se.p[0].p1 = -tlp->se.p[0].p1; /* minus sign */ - tlp->se.p[0].p2 = p; + tlp->se.r[0].q0 = -tlp->se.r[0].q0; /* minus sign */ + tlp->se.r[0].q1 = p; bnewmatch(&tlp->se); break; } } } Return: - return sel.p[0].p1>=0; + *rp = sel; + return sel.r[0].q0 >= 0; } void bnewmatch(Rangeset *sp) { int i; - if(sel.p[0].p1<0 || sp->p[0].p1>sel.p[0].p2 || (sp->p[0].p1==sel.p[0].p2 && sp->p[0].p2p[i].p2; - sel.p[i].p2 = sp->p[i].p1; + + if(sel.r[0].q0<0 || sp->r[0].q0>sel.r[0].q1 || (sp->r[0].q0==sel.r[0].q1 && sp->r[0].q1r[i].q1; + sel.r[i].q1 = sp->r[i].q0; } }