diff options
Diffstat (limited to 'src/RESearch.cpp')
-rwxr-xr-x | src/RESearch.cpp | 788 |
1 files changed, 788 insertions, 0 deletions
diff --git a/src/RESearch.cpp b/src/RESearch.cpp new file mode 100755 index 0000000..00e1830 --- /dev/null +++ b/src/RESearch.cpp @@ -0,0 +1,788 @@ +// Scintilla source code edit control +/** @file RESearch.cxx + ** Regular expression search library. + **/ + +/* + * regex - Regular expression pattern matching and replacement + * + * By: Ozan S. Yigit (oz) + * Dept. of Computer Science + * York University + * + * Original code available from http://www.cs.yorku.ca/~oz/ + * Translation to C++ by Neil Hodgson [email protected] + * Removed all use of register. + * Converted to modern function prototypes. + * Put all global/static variables into an object so this code can be + * used from multiple threads, etc. + * + * These routines are the PUBLIC DOMAIN equivalents of regex + * routines as found in 4.nBSD UN*X, with minor extensions. + * + * These routines are derived from various implementations found + * in software tools books, and Conroy's grep. They are NOT derived + * from licensed/restricted software. + * For more interesting/academic/complicated implementations, + * see Henry Spencer's regexp routines, or GNU Emacs pattern + * matching module. + * + * Modification history removed. + * + * Interfaces: + * RESearch::Compile: compile a regular expression into a NFA. + * + * const char *RESearch::Compile(const char *pat, int length, + * bool caseSensitive, bool posix) + * + * Returns a short error string if they fail. + * + * RESearch::Execute: execute the NFA to match a pattern. + * + * int RESearch::Execute(characterIndexer &ci, int lp, int endp) + * + * RESearch::Substitute: substitute the matched portions in a new string. + * + * int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) + * + * re_fail: failure routine for RESearch::Execute. (no longer used) + * + * void re_fail(char *msg, char op) + * + * Regular Expressions: + * + * [1] char matches itself, unless it is a special + * character (metachar): . \ [ ] * + ^ $ + * and ( ) if posix option. + * + * [2] . matches any character. + * + * [3] \ matches the character following it, except: + * - \a, \b, \f, \n, \t, \v match the + * corresponding C escape char; + * - if not in posix mode, when followed by a + * left or right round bracket (see [7]); + * - when followed by a digit 1 to 9 (see [8]); + * - when followed by a left or right angle bracket + * (see [9]). + * It is used as an escape character for all + * other meta-characters, and itself. When used + * in a set ([4]), it is treated as an ordinary + * character (except for escape chars). + * + * [4] [set] matches one of the characters in the set. + * If the first character in the set is "^", + * it matches a character NOT in the set, i.e. + * complements the set. A shorthand S-E (start-end) + * is used to specify a set of characters S upto + * E, inclusive. The special characters "]" and + * "-" have no special meaning if they appear + * as the first chars in the set. To include both, + * put - first: [-]A-Z]: + * [-]|] matches these 2 chars, + * []-|] matches from ] to | chars. + * examples: match: + * + * [a-z] any lowercase alpha + * + * [^-]] any char except - and ] + * + * [^A-Z] any char except uppercase + * alpha + * + * [a-zA-Z] any alpha + * + * [5] * any regular expression form [1] to [4], followed by + * closure char (*) matches zero or more matches of + * that form. + * + * [6] + same as [5], except it matches one or more. + * + * [7] a regular expression in the form [1] to [10], enclosed + * as \(form\) (or (form) with posix flag) matches what + * form matches. The enclosure creates a set of tags, + * used for [8] and for pattern substitution. + * The tagged forms are numbered starting from 1. + * + * [8] a \ followed by a digit 1 to 9 matches whatever a + * previously tagged regular expression ([7]) matched. + * + * [9] \< a regular expression starting with a \< construct + * \> and/or ending with a \> construct, restricts the + * pattern matching to the beginning of a word, and/or + * the end of a word. A word is defined to be a character + * string beginning and/or ending with the characters + * A-Z a-z 0-9 and _. It must also be preceded and/or + * followed by any character outside those mentioned. + * + * [10] a composite regular expression xy where x and y + * are in the form [1] to [10] matches the longest + * match of x followed by a match for y. + * + * [11] ^ a regular expression starting with a ^ character + * $ and/or ending with a $ character, restricts the + * pattern matching to the beginning of the line, + * or the end of line. [anchors] Elsewhere in the + * pattern, ^ and $ are treated as ordinary characters. + * + * + * Acknowledgements: + * + * HCR's Hugh Redelmeier has been most helpful in various + * stages of development. He convinced me to include BOW + * and EOW constructs, originally invented by Rob Pike at + * the University of Toronto. + * + * References: + * Software tools Kernighan & Plauger + * Software tools in Pascal Kernighan & Plauger + * Grep [rsx-11 C dist] David Conroy + * ed - text editor Un*x Programmer's Manual + * Advanced editing on Un*x B. W. Kernighan + * RegExp routines Henry Spencer + * + * Notes: + * + * This implementation uses a bit-set representation for character + * classes for speed and compactness. Each character is represented + * by one bit in a 256-bit block. Thus, CCL always takes a + * constant 32 bytes in the internal nfa, and RESearch::Execute does a single + * bit comparison to locate the character in the set. + * + * Examples: + * + * pattern: foo*.* + * compile: CHR f CHR o CLO CHR o END CLO ANY END END + * matches: fo foo fooo foobar fobar foxx ... + * + * pattern: fo[ob]a[rz] + * compile: CHR f CHR o CCL bitset CHR a CCL bitset END + * matches: fobar fooar fobaz fooaz + * + * pattern: foo\\+ + * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END + * matches: foo\ foo\\ foo\\\ ... + * + * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo) + * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END + * matches: foo1foo foo2foo foo3foo + * + * pattern: \(fo.*\)-\1 + * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END + * matches: foo-foo fo-fo fob-fob foobar-foobar ... + */ + +#include "CharClassify.h" +#include "RESearch.h" + +// Shut up annoying Visual C++ warnings: +#ifdef _MSC_VER +#pragma warning(disable: 4514) +#endif + +#define OKP 1 +#define NOP 0 + +#define CHR 1 +#define ANY 2 +#define CCL 3 +#define BOL 4 +#define EOL 5 +#define BOT 6 +#define EOT 7 +#define BOW 8 +#define EOW 9 +#define REF 10 +#define CLO 11 + +#define END 0 + +/* + * The following defines are not meant to be changeable. + * They are for readability only. + */ +#define BLKIND 0370 +#define BITIND 07 + +const char bitarr[] = {1,2,4,8,16,32,64,'\200'}; + +#define badpat(x) (*nfa = END, x) + +/* + * Character classification table for word boundary operators BOW + * and EOW is passed in by the creator of this object (Scintilla + * Document). The Document default state is that word chars are: + * 0-9,a-z, A-Z and _ + */ + +RESearch::RESearch(CharClassify *charClassTable) { + charClass = charClassTable; + Init(); +} + +RESearch::~RESearch() { + Clear(); +} + +void RESearch::Init() { + sta = NOP; /* status of lastpat */ + bol = 0; + for (int i=0; i<MAXTAG; i++) + pat[i] = 0; + for (int j=0; j<BITBLK; j++) + bittab[j] = 0; +} + +void RESearch::Clear() { + for (int i=0; i<MAXTAG; i++) { + delete []pat[i]; + pat[i] = 0; + bopat[i] = NOTFOUND; + eopat[i] = NOTFOUND; + } +} + +bool RESearch::GrabMatches(CharacterIndexer &ci) { + bool success = true; + for (unsigned int i=0; i<MAXTAG; i++) { + if ((bopat[i] != NOTFOUND) && (eopat[i] != NOTFOUND)) { + unsigned int len = eopat[i] - bopat[i]; + pat[i] = new char[len + 1]; + if (pat[i]) { + for (unsigned int j=0; j<len; j++) + pat[i][j] = ci.CharAt(bopat[i] + j); + pat[i][len] = '\0'; + } else { + success = false; + } + } + } + return success; +} + +void RESearch::ChSet(char c) { + bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND]; +} + +void RESearch::ChSetWithCase(char c, bool caseSensitive) { + if (caseSensitive) { + ChSet(c); + } else { + if ((c >= 'a') && (c <= 'z')) { + ChSet(c); + ChSet(static_cast<char>(c - 'a' + 'A')); + } else if ((c >= 'A') && (c <= 'Z')) { + ChSet(c); + ChSet(static_cast<char>(c - 'A' + 'a')); + } else { + ChSet(c); + } + } +} + +const char escapeValue(char ch) { + switch (ch) { + case 'a': return '\a'; + case 'b': return '\b'; + case 'f': return '\f'; + case 'n': return '\n'; + case 'r': return '\r'; + case 't': return '\t'; + case 'v': return '\v'; + } + return 0; +} + +const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, bool posix) { + char *mp=nfa; /* nfa pointer */ + char *lp; /* saved pointer */ + char *sp=nfa; /* another one */ + char *mpMax = mp + MAXNFA - BITBLK - 10; + + int tagi = 0; /* tag stack index */ + int tagc = 1; /* actual tag count */ + + int n; + char mask; /* xor mask -CCL/NCL */ + int c1, c2; + + if (!pat || !length) + if (sta) + return 0; + else + return badpat("No previous regular expression"); + sta = NOP; + + const char *p=pat; /* pattern pointer */ + for (int i=0; i<length; i++, p++) { + if (mp > mpMax) + return badpat("Pattern too long"); + lp = mp; + switch(*p) { + + case '.': /* match any char */ + *mp++ = ANY; + break; + + case '^': /* match beginning */ + if (p == pat) + *mp++ = BOL; + else { + *mp++ = CHR; + *mp++ = *p; + } + break; + + case '$': /* match endofline */ + if (!*(p+1)) + *mp++ = EOL; + else { + *mp++ = CHR; + *mp++ = *p; + } + break; + + case '[': /* match char class */ + *mp++ = CCL; + + i++; + if (*++p == '^') { + mask = '\377'; + i++; + p++; + } else + mask = 0; + + if (*p == '-') { /* real dash */ + i++; + ChSet(*p++); + } + if (*p == ']') { /* real brace */ + i++; + ChSet(*p++); + } + while (*p && *p != ']') { + if (*p == '-' && *(p+1) && *(p+1) != ']') { + i++; + p++; + c1 = *(p-2) + 1; + i++; + c2 = *p++; + while (c1 <= c2) { + ChSetWithCase(static_cast<char>(c1++), caseSensitive); + } + } else if (*p == '\\' && *(p+1)) { + i++; + p++; + char escape = escapeValue(*p); + if (escape) + ChSetWithCase(escape, caseSensitive); + else + ChSetWithCase(*p, caseSensitive); + i++; + p++; + } else { + i++; + ChSetWithCase(*p++, caseSensitive); + } + } + if (!*p) + return badpat("Missing ]"); + + for (n = 0; n < BITBLK; bittab[n++] = (char) 0) + *mp++ = static_cast<char>(mask ^ bittab[n]); + + break; + + case '*': /* match 0 or more... */ + case '+': /* match 1 or more... */ + if (p == pat) + return badpat("Empty closure"); + lp = sp; /* previous opcode */ + if (*lp == CLO) /* equivalence... */ + break; + switch(*lp) { + + case BOL: + case BOT: + case EOT: + case BOW: + case EOW: + case REF: + return badpat("Illegal closure"); + default: + break; + } + + if (*p == '+') + for (sp = mp; lp < sp; lp++) + *mp++ = *lp; + + *mp++ = END; + *mp++ = END; + sp = mp; + while (--mp > lp) + *mp = mp[-1]; + *mp = CLO; + mp = sp; + break; + + case '\\': /* tags, backrefs... */ + i++; + switch(*++p) { + + case '<': + *mp++ = BOW; + break; + case '>': + if (*sp == BOW) + return badpat("Null pattern inside \\<\\>"); + *mp++ = EOW; + break; + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + n = *p-'0'; + if (tagi > 0 && tagstk[tagi] == n) + return badpat("Cyclical reference"); + if (tagc > n) { + *mp++ = static_cast<char>(REF); + *mp++ = static_cast<char>(n); + } + else + return badpat("Undetermined reference"); + break; + case 'a': + case 'b': + case 'n': + case 'f': + case 'r': + case 't': + case 'v': + *mp++ = CHR; + *mp++ = escapeValue(*p); + break; + default: + if (!posix && *p == '(') { + if (tagc < MAXTAG) { + tagstk[++tagi] = tagc; + *mp++ = BOT; + *mp++ = static_cast<char>(tagc++); + } + else + return badpat("Too many \\(\\) pairs"); + } else if (!posix && *p == ')') { + if (*sp == BOT) + return badpat("Null pattern inside \\(\\)"); + if (tagi > 0) { + *mp++ = static_cast<char>(EOT); + *mp++ = static_cast<char>(tagstk[tagi--]); + } + else + return badpat("Unmatched \\)"); + } else { + *mp++ = CHR; + *mp++ = *p; + } + } + break; + + default : /* an ordinary char */ + if (posix && *p == '(') { + if (tagc < MAXTAG) { + tagstk[++tagi] = tagc; + *mp++ = BOT; + *mp++ = static_cast<char>(tagc++); + } + else + return badpat("Too many () pairs"); + } else if (posix && *p == ')') { + if (*sp == BOT) + return badpat("Null pattern inside ()"); + if (tagi > 0) { + *mp++ = static_cast<char>(EOT); + *mp++ = static_cast<char>(tagstk[tagi--]); + } + else + return badpat("Unmatched )"); + } else if (caseSensitive) { + *mp++ = CHR; + *mp++ = *p; + } else { + *mp++ = CCL; + mask = 0; + ChSetWithCase(*p, false); + for (n = 0; n < BITBLK; bittab[n++] = (char) 0) + *mp++ = static_cast<char>(mask ^ bittab[n]); + } + break; + } + sp = lp; + } + if (tagi > 0) + return badpat((posix ? "Unmatched (" : "Unmatched \\(")); + *mp = END; + sta = OKP; + return 0; +} + +/* + * RESearch::Execute: + * execute nfa to find a match. + * + * special cases: (nfa[0]) + * BOL + * Match only once, starting from the + * beginning. + * CHR + * First locate the character without + * calling PMatch, and if found, call + * PMatch for the remaining string. + * END + * RESearch::Compile failed, poor luser did not + * check for it. Fail fast. + * + * If a match is found, bopat[0] and eopat[0] are set + * to the beginning and the end of the matched fragment, + * respectively. + * + */ + +int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) { + char c; + int ep = NOTFOUND; + char *ap = nfa; + + bol = lp; + failure = 0; + + Clear(); + + switch(*ap) { + + case BOL: /* anchored: match from BOL only */ + ep = PMatch(ci, lp, endp, ap); + break; + case EOL: /* just searching for end of line normal path doesn't work */ + if (*(ap+1) == END) { + lp = endp; + ep = lp; + break; + } else { + return 0; + } + case CHR: /* ordinary char: locate it fast */ + c = *(ap+1); + while ((lp < endp) && (ci.CharAt(lp) != c)) + lp++; + if (lp >= endp) /* if EOS, fail, else fall thru. */ + return 0; + default: /* regular matching all the way. */ + while (lp < endp) { + ep = PMatch(ci, lp, endp, ap); + if (ep != NOTFOUND) + break; + lp++; + } + break; + case END: /* munged automaton. fail always */ + return 0; + } + if (ep == NOTFOUND) + return 0; + + bopat[0] = lp; + eopat[0] = ep; + return 1; +} + +/* + * PMatch: internal routine for the hard part + * + * This code is partly snarfed from an early grep written by + * David Conroy. The backref and tag stuff, and various other + * innovations are by oz. + * + * special case optimizations: (nfa[n], nfa[n+1]) + * CLO ANY + * We KNOW .* will match everything upto the + * end of line. Thus, directly go to the end of + * line, without recursive PMatch calls. As in + * the other closure cases, the remaining pattern + * must be matched by moving backwards on the + * string recursively, to find a match for xy + * (x is ".*" and y is the remaining pattern) + * where the match satisfies the LONGEST match for + * x followed by a match for y. + * CLO CHR + * We can again scan the string forward for the + * single char and at the point of failure, we + * execute the remaining nfa recursively, same as + * above. + * + * At the end of a successful match, bopat[n] and eopat[n] + * are set to the beginning and end of subpatterns matched + * by tagged expressions (n = 1 to 9). + */ + +extern void re_fail(char *,char); + +#define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND]) + +/* + * skip values for CLO XXX to skip past the closure + */ + +#define ANYSKIP 2 /* [CLO] ANY END */ +#define CHRSKIP 3 /* [CLO] CHR chr END */ +#define CCLSKIP 34 /* [CLO] CCL 32 bytes END */ + +int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) { + int op, c, n; + int e; /* extra pointer for CLO */ + int bp; /* beginning of subpat... */ + int ep; /* ending of subpat... */ + int are; /* to save the line ptr. */ + + while ((op = *ap++) != END) + switch(op) { + + case CHR: + if (ci.CharAt(lp++) != *ap++) + return NOTFOUND; + break; + case ANY: + if (lp++ >= endp) + return NOTFOUND; + break; + case CCL: + c = ci.CharAt(lp++); + if (!isinset(ap,c)) + return NOTFOUND; + ap += BITBLK; + break; + case BOL: + if (lp != bol) + return NOTFOUND; + break; + case EOL: + if (lp < endp) + return NOTFOUND; + break; + case BOT: + bopat[*ap++] = lp; + break; + case EOT: + eopat[*ap++] = lp; + break; + case BOW: + if (lp!=bol && iswordc(ci.CharAt(lp-1)) || !iswordc(ci.CharAt(lp))) + return NOTFOUND; + break; + case EOW: + if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp))) + return NOTFOUND; + break; + case REF: + n = *ap++; + bp = bopat[n]; + ep = eopat[n]; + while (bp < ep) + if (ci.CharAt(bp++) != ci.CharAt(lp++)) + return NOTFOUND; + break; + case CLO: + are = lp; + switch(*ap) { + + case ANY: + while (lp < endp) + lp++; + n = ANYSKIP; + break; + case CHR: + c = *(ap+1); + while ((lp < endp) && (c == ci.CharAt(lp))) + lp++; + n = CHRSKIP; + break; + case CCL: + while ((lp < endp) && isinset(ap+1,ci.CharAt(lp))) + lp++; + n = CCLSKIP; + break; + default: + failure = true; + //re_fail("closure: bad nfa.", *ap); + return NOTFOUND; + } + + ap += n; + + while (lp >= are) { + if ((e = PMatch(ci, lp, endp, ap)) != NOTFOUND) + return e; + --lp; + } + return NOTFOUND; + default: + //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op)); + return NOTFOUND; + } + return lp; +} + +/* + * RESearch::Substitute: + * substitute the matched portions of the src in dst. + * + * & substitute the entire matched pattern. + * + * \digit substitute a subpattern, with the given tag number. + * Tags are numbered from 1 to 9. If the particular + * tagged subpattern does not exist, null is substituted. + */ +int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) { + char c; + int pin; + int bp; + int ep; + + if (!*src || !bopat[0]) + return 0; + + while ((c = *src++) != 0) { + switch(c) { + + case '&': + pin = 0; + break; + + case '\\': + c = *src++; + if (c >= '0' && c <= '9') { + pin = c - '0'; + break; + } + + default: + *dst++ = c; + continue; + } + + if ((bp = bopat[pin]) != 0 && (ep = eopat[pin]) != 0) { + while (ci.CharAt(bp) && bp < ep) + *dst++ = ci.CharAt(bp++); + if (bp < ep) + return 0; + } + } + *dst = (char) 0; + return 1; +} |