#include <pmachine.h>

#ifdef NOREGEX
/*
 * These routines are BSD regex(3)/ed(1) compatible regular-expression
 * routines written by Ozan S. Yigit, Computer Science, York University.
 * Parts of the code that are not needed by Prospero have been removed,
 * but most of the accompanying information has been left intact.
 * This file is to be included on those operating systems that do not
 * support re_comp and re_exec.
 */

/*
 * regex - Regular expression pattern matching
 *         and replacement
 *
 * by:  Ozan S. Yigit (oz@nexus.yorku.ca)
 *	Dept. of Computing Services
 *      York University
 *
 * These routines are the PUBLIC DOMAIN equivalents
 * of regex routines as found in 4.nBSD UN*X, with minor
 * extensions.
 *
 * Modification history:
 *
 * $Log:	regex.c,v $
 * Revision 1.3  89/04/01  14:18:09  oz
 * Change all references to a dfa: this is actually an nfa.
 *
 * Revision 1.2  88/08/28  15:36:04  oz
 * Use a complement bitmap to represent NCL.
 * This removes the need to have separate
 * code in the pmatch case block - it is
 * just CCL code now.
 *
 * Use the actual CCL code in the CLO
 * section of pmatch. No need for a recursive
 * pmatch call.
 *
 * Use a bitmap table to set char bits in an
 * 8-bit chunk.
 *
 * Routines:
 *      re_comp:        compile a regular expression into
 *                      an NFA.
 *
 *			char *re_comp(s)
 *			char *s;
 *
 *      re_exec:        execute the NFA to match a pattern.
 *
 *			int re_exec(s)
 *			char *s;
 *
 * Regular Expressions:
 *
 *      [1]     char    matches itself, unless it is a special
 *                      character (metachar): . \ [ ] * + ^ $
 *
 *      [2]     .       matches any character.
 *
 *      [3]     \       matches the character following it, except
 *			when followed by a left or right round bracket,
 *			a digit 1 to 9 or a left or right angle bracket.
 *			(see [7], [8] and [9])
 *			It is used as an escape character for all
 *			other meta-characters, and itself. When used
 *			in a set ([4]), it is treated as an ordinary
 *			character.
 *
 *      [4]     [set]   matches one of the characters in the set.
 *                      If the first character in the set is "^",
 *                      it matches a character NOT in the set, i.e.
 *			complements the set. A shorthand S-E is
 *			used to specify a set of characters S up to
 *			E, inclusive. The special characters "]" and
 *			"-" have no special meaning if they appear
 *			as the first chars in the set.
 *                      examples:        match:
 *
 *                              [a-z]    any lowercase alpha
 *
 *                              [^]-]    any char except ] and -
 *
 *                              [^A-Z]   any char except uppercase
 *                                       alpha
 *
 *                              [a-zA-Z] any alpha
 *
 *      [5]     *       any regular expression form [1] to [4], followed by
 *                      closure char (*) matches zero or more matches of
 *                      that form.
 *
 *      [6]     +       same as [5], except it matches one or more.
 *
 *      [7]             a regular expression in the form [1] to [10], enclosed
 *                      as \(form\) matches what form matches. The enclosure
 *                      creates a set of tags, used for [8] and for
 *                      pattern substitution. The tagged forms are numbered
 *			starting from 1.
 *
 *      [8]             a \ followed by a digit 1 to 9 matches whatever a
 *                      previously tagged regular expression ([7]) matched.
 *
 *	[9]	\<	a regular expression starting with a \< construct
 *		\>	and/or ending with a \> construct, restricts the
 *			pattern matching to the beginning of a word, and/or
 *			the end of a word. A word is defined to be a character
 *			string beginning and/or ending with the characters
 *			A-Z a-z 0-9 and _. It must also be preceded and/or
 *			followed by any character outside those mentioned.
 *
 *      [10]            a composite regular expression xy where x and y
 *                      are in the form [1] to [10] matches the longest
 *                      match of x followed by a match for y.
 *
 *      [11]	^	a regular expression starting with a ^ character
 *		$	and/or ending with a $ character, restricts the
 *                      pattern matching to the beginning of the line,
 *                      or the end of line. [anchors] Elsewhere in the
 *			pattern, ^ and $ are treated as ordinary characters.
 *
 *
 * Acknowledgements:
 *
 *	HCR's Hugh Redelmeier has been most helpful in various
 *	stages of development. He convinced me to include BOW
 *	and EOW constructs, originally invented by Rob Pike at
 *	the University of Toronto.
 *
 * References:
 *              Software tools			Kernighan & Plauger
 *              Software tools in Pascal        Kernighan & Plauger
 *              Grep [rsx-11 C dist]            David Conroy
 *		ed - text editor		Un*x Programmer's Manual
 *		Advanced editing on Un*x	B. W. Kernighan
 *		regexp routines			Henry Spencer
 *
 * Notes:
 *
 *	This implementation uses a bit-set representation for character
 *	classes for speed and compactness. Each character is represented
 *	by one bit in a 128-bit block.
Thus, CCL always takes a  C  *	constant 16 bytes in the internal nfa, and re_exec does a single 5  *	bit comparison to locate the character in the set.   *  * Examples:  *  *	pattern:	foo*.*5  *	compile:	CHR f CHR o CLO CHR o END CLO ANY END END -  *	matches:	fo foo fooo foobar fobar foxx ...   *  *	pattern:	fo[ob]a[rz]	7  *	compile:	CHR f CHR o CCL bitset CHR a CCL bitset END #  *	matches:	fobar fooar fobaz fooaz   *  *	pattern:	foo\\+5  *	compile:	CHR f CHR o CHR o CHR \ CLO CHR \ END END "  *	matches:	foo\ foo\\ foo\\\  ...  *0  *	pattern:	\(foo\)[1-3]\1	(same as foo[1-3]foo)>  *	compile:	BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END#  *	matches:	foo1foo foo2foo foo3foo   *  *	pattern:	\(fo.*\)-\1 ?  *	compile:	BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END 3  *	matches:	foo-foo fo-fo fob-fob foobar-foobar ...   *    */    #define MAXNFA  1024 #define MAXTAG  10   #define OKP     1  #define NOP     0    #define CHR     1  #define ANY     2  #define CCL     3  #define BOL     4  #define EOL     5  #define BOT     6  #define EOT     7 
 #define BOW	8 
 #define EOW	9  #define REF     10 #define CLO     11   #define END     0    /*&  * The following defines are not meant-  * to be changeable. They are for readability   * only.  *  */  #define MAXCHR	128 #define CHRBIT	8 #define BITBLK	MAXCHR/CHRBIT #define BLKIND	0170  #define BITIND	07    #define ASCIIB	0177    typedef /*unsigned*/ char CHAR;   ? static int  tagstk[MAXTAG];             /* subpat tag stack..*/ 1 static CHAR nfa[MAXNFA];		/* automaton..       */ = static int  sta = NOP;               	/* status of lastpat */   4 static CHAR bittab[BITBLK];		/* bit table for CCL */ 					/* pre-set bits...   */. static CHAR bitarr[] = {1,2,4,8,16,32,64,128};   static int internal_error;   static void  chset(c) register CHAR c; { 5 	bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];  }   ( #define badpat(x)	return (*nfa = END, x) #define store(x)	*mp++ = x    char *       re_comp(pat)
 char *pat; { 8 	register char *p;               /* pattern pointer   */8 	register CHAR *mp = nfa;        /* nfa pointer       */8 	register CHAR *lp;              /* saved pointer..   */8 	register CHAR *sp = nfa;        /* another one..     */  8 	register int tagi = 0;          /* tag stack index   */8 	register int tagc = 1;          /* actual tag count  */   	register int n;- 	register CHAR mask;		/* xor mask -CCL/NCL */  	int c1, c2; 		 	if (!pat || !*pat) 
 		if (sta) 			return 0; 		else, 			badpat("No previous regular expression"); 	sta = NOP;    	for (p = pat; *p; p++) { 
 		lp = mp; 		switch(*p) {  1 		case '.':               /* match any char..  */  			store(ANY);	 			break;h  1 		case '^':               /* match beginning.. */e 			if (p == pat) 				store(BOL);r	 			else {r 				store(CHR);y 				store(*p); 			}	 			break;   1 		case '$':               /* match endofline.. */e 			if (!*(p+1))h 				store(EOL); 	 			else {s 				store(CHR);y 				store(*p); 			}	 			break;a  1 		case '[':               /* match char class..*/n 			store(CCL);   			if (*++p == '^') {p 				mask = 0377;	  				p++; 			} 			elser
 				mask = 0;t  " 			if (*p == '-')		/* real dash */ 				chset(*p++);" 			if (*p == ']')		/* real brac */ 				chset(*p++); 			while (*p && *p != ']') {/ 				if (*p == '-' && *(p+1) && *(p+1) != ']') {I	 					p++;t 					c1 = *(p-2) + 1;s 					c2 = *p++;S 					while (c1 <= c2)e 						chset(c1++); 				}t
 #ifdef EXTEND*$ 				else if (*p == '\\' && *(p+1)) {	 					p++;  					chset(*p++);C 				}l #endif 				else 					chset(*p++);l 			} 			if (!*p)* 				badpat("Missing ]");  2 			for (n = 0; n < BITBLK; bittab[n++] = (char) 0) 				store(mask ^ bittab[n]); 	 	 			break;t  1 		case '*':               /* match 0 or more.. */s1 		case '+':               /* match 1 or more.. */i 			if (p == pat) 				badpat("Empty closure");" 			lp = sp;		/* previous opcode */) 			if (*lp == CLO)		/* equivalence..   */n
 				break; 			switch(*lp) {   			case BOL: 			case BOT: 			case EOT: 			case BOW: 			case EOW: 			case REF: 				badpat("Illegal closure"); 			default:*
 				break; 			}   			if (*p == '+')   				for (sp = mp; lp < sp; lp++) 					store(*lp);   			store(END); 			store(END); 			sp = mp;p 			while (--mp > lp) 				*mp = mp[-1];a 			store(CLO); 			mp = sp;s	 			break;   1 		case '\\':              /* tags, backrefs .. */  			switch(*++p) {2   			case '(': 				if (tagc < MAXTAG) { 					tagstk[++tagi] = tagc;e 					store(BOT); 					store(tagc++);* 				}  				else% 					badpat("Too many \\(\\) pairs");a
 				break; 			case ')': 				if (*sp == BOT) * 					badpat("Null pattern inside \\(\\)"); 				if (tagi > 0) {a 					store(EOT); 					store(tagstk[tagi--]);  				}  				else 					badpat("Unmatched \\)");e
 				break; 			case '<': 				store(BOW); 
 				break; 			case '>': 				if (*sp == BOW)a* 					badpat("Null pattern inside \\<\\>"); 				store(EOW);a
 				break; 			case '1': 			case '2': 			case '3': 			case '4': 			case '5': 			case '6': 			case '7': 			case '8': 			case '9': 				n = *p-'0';o& 				if (tagi > 0 && tagstk[tagi] == n)" 					badpat("Cyclical reference"); 				if (tagc > n) {" 					store(REF); 					store(n); 				}  				else& 					badpat("Undetermined reference");
 				break;
 #ifdef EXTEND  			case 'b': 				store(CHR);  				store('\b');
 				break; 			case 'n': 				store(CHR);  				store('\n');
 				break; 			case 'f': 				store(CHR);  				store('\f');
 				break; 			case 'r': 				store(CHR);  				store('\r');
 				break; 			case 't': 				store(CHR);  				store('\t');
 				break; #endif 			default:  				store(CHR);  				store(*p); 			}	 			break;o  1 		default :               /* an ordinary char  */h 			store(CHR);
 			store(*p);c	 			break;  		} 
 		sp = lp; 	} 	if (tagi > 0) 		badpat("Unmatched \\("); 	store(END); 	sta = OKP;e
 	return 0; }      static char *bol;  static char *bopat[MAXTAG];o static char *eopat[MAXTAG];  char *pmatch();    /*  * re_exec:e   * 	execute nfa to find a match.  *  *	special cases: (nfa[0])	   *		BOLt'  *			Match only once, starting from the   *			beginning.   *		CHRn'  *			First locate the character withouta'  *			calling pmatch, and if found, call %  *			pmatch for the remaining string.s  *		END '  *			re_comp failed, poor luser did notg  *			check for it. Fail fast.t  *5  *	If a match is found, bopat[0] and eopat[0] are set\8  *	to the beginning and the end of the matched fragment,  *	respectively.  *  */m   int  re_exec(lp)i register char *lp; {* 	register char c;d 	register char *ep = 0;  	register CHAR *ap = nfa;e  
 	bol = lp;   	bopat[0] = 0; 	bopat[1] = 0; 	bopat[2] = 0; 	bopat[3] = 0; 	bopat[4] = 0; 	bopat[5] = 0; 	bopat[6] = 0; 	bopat[7] = 0; 	bopat[8] = 0; 	bopat[9] = 0;   	switch(*ap) {  0 	case BOL:			/* anchored: match from BOL only */ 		ep = pmatch(lp,ap);t 		break;0 	case CHR:			/* ordinary char: locate it fast */ 		c = *(ap+1); 		while (*lp && *lp != c)  			lp++;0 		if (!*lp)		/* if EOS, fail, else fall thru. */ 			return 0;/ 	default:			/* regular matching all the way. */h 		while (*lp) {  			if ((ep = pmatch(lp,ap)))
 				break; 			lp++; 		}  		break;0 	case END:			/* munged automaton. fail always */ 		return 0;* 	}	 	if (!ep)n 		return 0;e   	if (internal_error) 		return -1;   	bopat[0] = lp;  	eopat[0] = ep; 
 	return 1; }b   /* t  * pmatch: a%  *	internal routine for the hard partn  *-  * 	This code is mostly snarfed from an earlyi1  * 	grep written by David Conroy. The backref and 0  * 	tag stuff, and various other mods are by oZ.  *$  *	special cases: (nfa[n], nfa[n+1])  *		CLO ANYo%  *			We KNOW ".*" will match ANYTHING &  *			upto the end of line. Thus, go to&  *			the end of line straight, without&  *			calling pmatch recursively. As in+  *			the other closure cases, the remainingn&  *			pattern must be matched by moving)  *			backwards on the string recursively,u+  *			to find a match for xy (x is ".*" and *&  *			y is the remaining pattern) where*  *			the match satisfies the LONGEST match%  *			for x followed by a match for y.s  *		CLO CHRt)  *			We can again scan the string forwarde,  *			for the single char without recursion, -  *			and at the point of failure, we execute *&  *			the remaining nfa recursively, as  *			described above.   *:  *	At the end of a successful match, bopat[n] and eopat[n]:  *	are set to the beginning and end of subpatterns matched'  *	by tagged expressions (n = 1 to 9).	a  *  */o   /*3  * character classification table for word boundary 3  * operators BOW and EOW. the reason for not using \5  * ctype macros is that we can let the user add into o3  * our own table. see re_modw. This table is not in 2  * the bitset form, since we may wish to extend it6  * in the future for other character classifications. 
  *  *	TRUE for 0-9 A-Z a-z _H  */E static char chrtyp[MAXCHR] = { 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, * 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, f 	0, 0, 0, 0, 0, 0, 0, 0, 1, 1,   	1, 1, 1, 1, 1, 1, 1, 1, 0, 0,   	0, 0, 0, 0, 0, 1, 1, 1, 1, 1, n 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, d 	1, 0, 0, 0, 0, 1, 0, 1, 1, 1,   	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, * 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, a 	1, 1, 1, 0, 0, 0, 0, 0T 	};r   #define inascii(x)	(0177&(x)) & #define iswordc(x) 	chrtyp[inascii(x)]A #define isinset(x,y) 	((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])I   /*3  * skip values for CLO XXX to skip past the closure   *  */   / #define ANYSKIP	2 	/* [CLO] ANY END ...	     */a1 #define CHRSKIP	3	/* [CLO] CHR chr END ...     */n2 #define CCLSKIP 18	/* [CLO] CCL 16bytes END ... */  
 static char *p pmatch(lp, ap) register char *lp; register CHAR *ap; {  	register int op, c, n;./ 	register char *e;		/* extra pointer for CLO */10 	register char *bp;		/* beginning of subpat.. */. 	register char *ep;		/* ending of subpat..	 */) 	char *are;			/* to save the line ptr. */    	while ((op = *ap++) != END) 		switch(op) {   		case CHR:* 			if (*lp++ != *ap++)
 				return 0;)	 			break;; 		case ANY:e 			if (!*lp++)
 				return 0;t	 			break;  		case CCL:e
 			c = *lp++;a 			if (!isinset(ap,c))
 				return 0;s 			ap += BITBLK;	 			break;  		case BOL:. 			if (lp != bol)H
 				return 0; 	 			break;t 		case EOL:  			if (*lp)e
 				return 0; 	 			break;  		case BOT:e 			bopat[*ap++] = lp;a	 			break;  		case EOT:l 			eopat[*ap++] = lp;s	 			break;   		case BOW:3 			if (lp!=bol && iswordc(lp[-1]) || !iswordc(*lp))i
 				return 0;t	 			break;t 		case EOW: 3 			if (lp==bol || !iswordc(lp[-1]) || iswordc(*lp))"
 				return 0; 	 			break;  		case REF:+
 			n = *ap++;; 			bp = bopat[n];  			ep = eopat[n];  			while (bp < ep) 				if (*bp++ != *lp++); 					return 0;	 			break;  		case CLO:  			are = lp; 			switch(*ap) {   			case ANY: 				while (*lp)e
 					lp++; 				n = ANYSKIP;
 				break; 			case CHR: 				c = *(ap+1); 				while (*lp && c == *lp)i
 					lp++; 				n = CHRSKIP;
 				break; 			case CCL:( 				while ((c = *lp) && isinset(ap+1,c))
 					lp++; 				n = CCLSKIP;
 				break; 			default:l 				internal_error++;;
 				return 0;  			}   			ap += n;=   			while (lp >= are) { 				if (e = pmatch(lp, ap))	 					return e;	 				--lp;s 			} 			return 0;
 		default: 			internal_error++; 			return 0; 		}+ 	return lp;( }&> #endif /* Need regex libraries? Compile to nothing if not.  */