diff options
Diffstat (limited to 'tqtinterface/qt4/src/tools/tqregexp.cpp')
-rw-r--r-- | tqtinterface/qt4/src/tools/tqregexp.cpp | 4050 |
1 files changed, 0 insertions, 4050 deletions
diff --git a/tqtinterface/qt4/src/tools/tqregexp.cpp b/tqtinterface/qt4/src/tools/tqregexp.cpp deleted file mode 100644 index 7a7f1ab..0000000 --- a/tqtinterface/qt4/src/tools/tqregexp.cpp +++ /dev/null @@ -1,4050 +0,0 @@ -/**************************************************************************** -** -** Implementation of TQRegExp class -** -** Created : 950126 -** -** Copyright (C) 2010 Timothy Pearson and (C) 1992-2008 Trolltech ASA. -** -** This file is part of the tools module of the TQt GUI Toolkit. -** -** This file may be used under the terms of the GNU General -** Public License versions 2.0 or 3.0 as published by the Free -** Software Foundation and appearing in the files LICENSE.GPL2 -** and LICENSE.GPL3 included in the packaging of this file. -** Alternatively you may (at your option) use any later version -** of the GNU General Public License if such license has been -** publicly approved by Trolltech ASA (or its successors, if any) -** and the KDE Free TQt Foundation. -** -** Please review the following information to ensure GNU General -** Public Licensing requirements will be met: -** http://trolltech.com/products/qt/licenses/licensing/opensource/. -** If you are unsure which license is appropriate for your use, please -** review the following information: -** http://trolltech.com/products/qt/licenses/licensing/licensingoverview -** or contact the sales department at [email protected]. -** -** This file may be used under the terms of the Q Public License as -** defined by Trolltech ASA and appearing in the file LICENSE.TQPL -** included in the packaging of this file. Licensees holding valid TQt -** Commercial licenses may use this file in accordance with the TQt -** Commercial License Agreement provided with the Software. -** -** This file is provided "AS IS" with NO WARRANTY OF ANY KIND, -** INCLUDING THE WARRANTIES OF DESIGN, MERCHANTABILITY AND FITNESS FOR -** A PARTICULAR PURPOSE. Trolltech reserves all rights not granted -** herein. 
-** -**********************************************************************/ - -#include "tqregexp.h" - -#ifndef TQT_NO_REGEXP - -#include "tqmemarray.h" -#include "tqbitarray.h" -#include "tqcache.h" -#include "tqcleanuphandler.h" -#include "tqintdict.h" -#include "tqmap.h" -#include "tqptrvector.h" -#include "tqstring.h" -#include "tqtl.h" - -#ifdef TQT_THREAD_SUPPORT -#include "tqthreadstorage.h" -#include <private/tqthreadinstance_p.h> -#endif // TQT_THREAD_SUPPORT - -#undef TQT_TRANSLATE_NOOP -#define TQT_TRANSLATE_NOOP( context, sourceText ) sourceText - -#include <limits.h> - -// error strings for the regexp parser -#define RXERR_OK TQT_TRANSLATE_NOOP( "TQRegExp", "no error occurred" ) -#define RXERR_DISABLED TQT_TRANSLATE_NOOP( "TQRegExp", "disabled feature used" ) -#define RXERR_CHARCLASS TQT_TRANSLATE_NOOP( "TQRegExp", "bad char class syntax" ) -#define RXERR_LOOKAHEAD TQT_TRANSLATE_NOOP( "TQRegExp", "bad lookahead syntax" ) -#define RXERR_REPETITION TQT_TRANSLATE_NOOP( "TQRegExp", "bad repetition syntax" ) -#define RXERR_OCTAL TQT_TRANSLATE_NOOP( "TQRegExp", "invalid octal value" ) -#define RXERR_LEFTDELIM TQT_TRANSLATE_NOOP( "TQRegExp", "missing left delim" ) -#define RXERR_END TQT_TRANSLATE_NOOP( "TQRegExp", "unexpected end" ) -#define RXERR_LIMIT TQT_TRANSLATE_NOOP( "TQRegExp", "met internal limit" ) - -#ifdef USE_QT4 - -/*! - QT4 INTEROPERABILITY -*/ -int TQRegExp::search( const TQString& str, int offset ) const -{ - return indexIn( str, offset, CaretAtZero); -} - -/*! - QT4 INTEROPERABILITY -*/ -int TQRegExp::search( const TQString& str, int offset, CaretMode caretMode ) const -{ - return indexIn( str, offset, caretMode); -} - -/*! - QT4 INTEROPERABILITY -*/ -int TQRegExp::searchRev( const TQString& str, int offset ) const -{ - return lastIndexIn( str, offset, CaretAtZero); -} - -/*! 
- QT4 INTEROPERABILITY -*/ -int TQRegExp::searchRev( const TQString& str, int offset, CaretMode caretMode ) const -{ - return lastIndexIn( str, offset, caretMode); -} - -/*! - QT4 INTEROPERABILITY -*/ -bool TQRegExp::caseSensitive() const -{ - return caseSensitivity(); -} - -/*! - QT4 INTEROPERABILITY -*/ -void TQRegExp::setCaseSensitive( bool sensitive ) -{ - setCaseSensitivity( (Qt::CaseSensitivity)sensitive); -} - -#else // USE_QT4 - -/* - WARNING! Be sure to read qregexp.tex before modifying this file. -*/ - -/*! - \class TQRegExp tqregexp.h - \reentrant - \brief The TQRegExp class provides pattern matching using regular expressions. - - \ingroup tools - \ingroup misc - \ingroup shared - \mainclass - \keyword regular expression - - Regular expressions, or "regexps", provide a way to find patterns - within text. This is useful in many contexts, for example: - - \table - \row \i Validation - \i A regexp can be used to check whether a piece of text - meets some criteria, e.g. is an integer or contains no - whitespace. - \row \i Searching - \i Regexps provide a much more powerful means of searching - text than simple string matching does. For example we can - create a regexp which says "find one of the words 'mail', - 'letter' or 'correspondence' but not any of the words - 'email', 'mailman' 'mailer', 'letterbox' etc." - \row \i Search and Replace - \i A regexp can be used to replace a pattern with a piece of - text, for example replace all occurrences of '&' with - '\&' except where the '&' is already followed by 'amp;'. - \row \i String Splitting - \i A regexp can be used to identify where a string should be - split into its component fields, e.g. splitting tab-delimited - strings. - \endtable - - We present a very brief introduction to regexps, a description of - TQt's regexp language, some code examples, and finally the function - documentation itself. TQRegExp is modeled on Perl's regexp - language, and also fully supports Unicode. 
TQRegExp can also be - used in the weaker 'wildcard' (globbing) mode which works in a - similar way to command shells. A good text on regexps is \e - {Mastering Regular Expressions: Powerful Techniques for Perl and - Other Tools} by Jeffrey E. Friedl, ISBN 1565922573. - - Experienced regexp users may prefer to skip the introduction and - go directly to the relevant information. - - In case of multi-threaded programming, note that TQRegExp depends on - TQThreadStorage internally. For that reason, TQRegExp should only be - used with threads started with TQThread, i.e. not with threads - started with platform-specific APIs. - - \tableofcontents - - \section1 Introduction - - Regexps are built up from expressions, quantifiers, and assertions. - The simplest form of expression is simply a character, e.g. - <b>x</b> or <b>5</b>. An expression can also be a set of - characters. For example, <b>[ABCD]</b>, will match an <b>A</b> or - a <b>B</b> or a <b>C</b> or a <b>D</b>. As a shorthand we could - write this as <b>[A-D]</b>. If we want to match any of the - capital letters in the English alphabet we can write - <b>[A-Z]</b>. A quantifier tells the regexp engine how many - occurrences of the expression we want, e.g. <b>x{1,1}</b> means - match an <b>x</b> which occurs at least once and at most once. - We'll look at assertions and more complex expressions later. - - Note that in general regexps cannot be used to check for balanced - brackets or tags. For example if you want to match an opening html - \c <b> and its closing \c </b> you can only use a regexp if you - know that these tags are not nested; the html fragment, \c{<b>bold - <b>bolder</b></b>} will not match as expected. If you know the - maximum level of nesting it is possible to create a regexp that - will match correctly, but for an unknown level of nesting, regexps - will fail. - - We'll start by writing a regexp to match integers in the range 0 - to 99. 
We will require at least one digit so we will start with - <b>[0-9]{1,1}</b> which means match a digit exactly once. This - regexp alone will match integers in the range 0 to 9. To match one - or two digits we can increase the maximum number of occurrences so - the regexp becomes <b>[0-9]{1,2}</b> meaning match a digit at - least once and at most twice. However, this regexp as it stands - will not match correctly. This regexp will match one or two digits - \e within a string. To ensure that we match against the whole - string we must use the anchor assertions. We need <b>^</b> (caret) - which when it is the first character in the regexp means that the - regexp must match from the beginning of the string. And we also - need <b>$</b> (dollar) which when it is the last character in the - regexp means that the regexp must match until the end of the - string. So now our regexp is <b>^[0-9]{1,2}$</b>. Note that - assertions, such as <b>^</b> and <b>$</b>, do not match any - characters. - - If you've seen regexps elsewhere they may have looked different from - the ones above. This is because some sets of characters and some - quantifiers are so common that they have special symbols to - represent them. <b>[0-9]</b> can be replaced with the symbol - <b>\d</b>. The quantifier to match exactly one occurrence, - <b>{1,1}</b>, can be replaced with the expression itself. This means - that <b>x{1,1}</b> is exactly the same as <b>x</b> alone. So our 0 - to 99 matcher could be written <b>^\d{1,2}$</b>. Another way of - writing it would be <b>^\d\d{0,1}$</b>, i.e. from the start of the - string match a digit followed by zero or one digits. In practice - most people would write it <b>^\d\d?$</b>. The <b>?</b> is a - shorthand for the quantifier <b>{0,1}</b>, i.e. a minimum of no - occurrences a maximum of one occurrence. This is used to make an - expression optional. 
The regexp <b>^\d\d?$</b> means "from the - beginning of the string match one digit followed by zero or one - digits and then the end of the string". - - Our second example is matching the words 'mail', 'letter' or - 'correspondence' but without matching 'email', 'mailman', - 'mailer', 'letterbox' etc. We'll start by just matching 'mail'. In - full the regexp is, <b>m{1,1}a{1,1}i{1,1}l{1,1}</b>, but since - each expression itself is automatically quantified by <b>{1,1}</b> - we can simply write this as <b>mail</b>; an 'm' followed by an 'a' - followed by an 'i' followed by an 'l'. The symbol '|' (bar) is - used for \e alternation, so our regexp now becomes - <b>mail|letter|correspondence</b> which means match 'mail' \e or - 'letter' \e or 'correspondence'. Whilst this regexp will find the - words we want it will also find words we don't want such as - 'email'. We will start by putting our regexp in parentheses, - <b>(mail|letter|correspondence)</b>. Parentheses have two effects, - firstly they group expressions together and secondly they identify - parts of the regexp that we wish to \link #capturing-text capture - \endlink. Our regexp still matches any of the three words but now - they are grouped together as a unit. This is useful for building - up more complex regexps. It is also useful because it allows us to - examine which of the words actually matched. We need to use - another assertion, this time <b>\b</b> "word boundary": - <b>\b(mail|letter|correspondence)\b</b>. This regexp means "match - a word boundary followed by the expression in parentheses followed - by another word boundary". The <b>\b</b> assertion matches at a \e - position in the regexp not a \e character in the regexp. A word - boundary is any non-word character such as a space, a newline, or - the beginning or end of the string. - - For our third example we want to replace ampersands with the HTML - entity '\&amp;'. The regexp to match is simple: <b>\&</b>, i.e. - match one ampersand. 
Unfortunately this will mess up our text if - some of the ampersands have already been turned into HTML - entities. So what we really want to say is replace an ampersand - providing it is not followed by 'amp;'. For this we need the - negative lookahead assertion and our regexp becomes: - <b>\&(?!amp;)</b>. The negative lookahead assertion is introduced - with '(?!' and finishes at the ')'. It means that the text it - contains, 'amp;' in our example, must \e not follow the expression - that precedes it. - - Regexps provide a rich language that can be used in a variety of - ways. For example suppose we want to count all the occurrences of - 'Eric' and 'Eirik' in a string. Two valid regexps to match these - are <b>\\b(Eric|Eirik)\\b</b> and <b>\\bEi?ri[ck]\\b</b>. We need - the word boundary '\b' so we don't get 'Ericsson' etc. The second - regexp actually matches more than we want, 'Eric', 'Erik', 'Eiric' - and 'Eirik'. - - We will implement some of the examples above in the - \link #code-examples code examples \endlink section. - - \target characters-and-abbreviations-for-sets-of-characters - \section1 Characters and Abbreviations for Sets of Characters - - \table - \header \i Element \i Meaning - \row \i <b>c</b> - \i Any character represents itself unless it has a special - regexp meaning. Thus <b>c</b> matches the character \e c. - \row \i <b>\\c</b> - \i A character that follows a backslash matches the character - itself except where mentioned below. For example if you - wished to match a literal caret at the beginning of a string - you would write <b>\^</b>. - \row \i <b>\\a</b> - \i This matches the ASCII bell character (BEL, 0x07). - \row \i <b>\\f</b> - \i This matches the ASCII form feed character (FF, 0x0C). - \row \i <b>\\n</b> - \i This matches the ASCII line feed character (LF, 0x0A, Unix newline). - \row \i <b>\\r</b> - \i This matches the ASCII carriage return character (CR, 0x0D). 
- \row \i <b>\\t</b> - \i This matches the ASCII horizontal tab character (HT, 0x09). - \row \i <b>\\v</b> - \i This matches the ASCII vertical tab character (VT, 0x0B). - \row \i <b>\\xhhhh</b> - \i This matches the Unicode character corresponding to the - hexadecimal number hhhh (between 0x0000 and 0xFFFF). \0ooo - (i.e., \zero ooo) matches the ASCII/Latin-1 character - corresponding to the octal number ooo (between 0 and 0377). - \row \i <b>. (dot)</b> - \i This matches any character (including newline). - \row \i <b>\\d</b> - \i This matches a digit (TQChar::isDigit()). - \row \i <b>\\D</b> - \i This matches a non-digit. - \row \i <b>\\s</b> - \i This matches a whitespace (TQChar::isSpace()). - \row \i <b>\\S</b> - \i This matches a non-whitespace. - \row \i <b>\\w</b> - \i This matches a word character (TQChar::isLetterOrNumber() or '_'). - \row \i <b>\\W</b> - \i This matches a non-word character. - \row \i <b>\\n</b> - \i The n-th \link #capturing-text backreference \endlink, - e.g. \1, \2, etc. - \endtable - - \e {Note that the C++ compiler transforms backslashes in strings - so to include a <b>\\</b> in a regexp you will need to enter it - twice, i.e. <b>\\\\</b>.} - - \target sets-of-characters - \section1 Sets of Characters - - Square brackets are used to match any character in the set of - characters contained within the square brackets. All the character - set abbreviations described above can be used within square - brackets. Apart from the character set abbreviations and the - following two exceptions no characters have special meanings in - square brackets. - - \table - \row \i <b>^</b> - \i The caret negates the character set if it occurs as the - first character, i.e. immediately after the opening square - bracket. For example, <b>[abc]</b> matches 'a' or 'b' or 'c', - but <b>[^abc]</b> matches anything \e except 'a' or 'b' or - 'c'. 
- \row \i <b>-</b> - \i The dash is used to indicate a range of characters, for - example <b>[W-Z]</b> matches 'W' or 'X' or 'Y' or 'Z'. - \endtable - - Using the predefined character set abbreviations is more portable - than using character ranges across platforms and languages. For - example, <b>[0-9]</b> matches a digit in Western alphabets but - <b>\d</b> matches a digit in \e any alphabet. - - Note that in most regexp literature sets of characters are called - "character classes". - - \target quantifiers - \section1 Quantifiers - - By default an expression is automatically quantified by - <b>{1,1}</b>, i.e. it should occur exactly once. In the following - list <b>\e {E}</b> stands for any expression. An expression is a - character or an abbreviation for a set of characters or a set of - characters in square brackets or any parenthesised expression. - - \table - \row \i <b>\e {E}?</b> - \i Matches zero or one occurrence of \e E. This quantifier - means "the previous expression is optional" since it will - match whether or not the expression occurs in the string. It - is the same as <b>\e {E}{0,1}</b>. For example <b>dents?</b> - will match 'dent' and 'dents'. - - \row \i <b>\e {E}+</b> - \i Matches one or more occurrences of \e E. This is the same - as <b>\e {E}{1,MAXINT}</b>. For example, <b>0+</b> will match - '0', '00', '000', etc. - - \row \i <b>\e {E}*</b> - \i Matches zero or more occurrences of \e E. This is the same - as <b>\e {E}{0,MAXINT}</b>. The <b>*</b> quantifier is often - used by mistake. Since it matches \e zero or more - occurrences it will match no occurrences at all. For example - if we want to match strings that end in whitespace and use - the regexp <b>\s*$</b> we would get a match on every string. - This is because we have said find zero or more whitespace - followed by the end of string, so even strings that don't end - in whitespace will match. 
The regexp we want in this case is - <b>\s+$</b> to match strings that have at least one - whitespace at the end. - - \row \i <b>\e {E}{n}</b> - \i Matches exactly \e n occurrences of the expression. This - is the same as repeating the expression \e n times. For - example, <b>x{5}</b> is the same as <b>xxxxx</b>. It is also - the same as <b>\e {E}{n,n}</b>, e.g. <b>x{5,5}</b>. - - \row \i <b>\e {E}{n,}</b> - \i Matches at least \e n occurrences of the expression. This - is the same as <b>\e {E}{n,MAXINT}</b>. - - \row \i <b>\e {E}{,m}</b> - \i Matches at most \e m occurrences of the expression. This - is the same as <b>\e {E}{0,m}</b>. - - \row \i <b>\e {E}{n,m}</b> - \i Matches at least \e n occurrences of the expression and at - most \e m occurrences of the expression. - \endtable - - (MAXINT is implementation dependent but will not be smaller than - 1024.) - - If we wish to apply a quantifier to more than just the preceding - character we can use parentheses to group characters together in - an expression. For example, <b>tag+</b> matches a 't' followed by - an 'a' followed by at least one 'g', whereas <b>(tag)+</b> matches - at least one occurrence of 'tag'. - - Note that quantifiers are "greedy". They will match as much text - as they can. For example, <b>0+</b> will match as many zeros as it - can from the first zero it finds, e.g. '2.<u>000</u>5'. - Quantifiers can be made non-greedy, see setMinimal(). - - \target capturing-text - \section1 Capturing Text - - Parentheses allow us to group elements together so that we can - quantify and capture them. For example if we have the expression - <b>mail|letter|correspondence</b> that matches a string we know - that \e one of the words matched but not which one. 
Using - parentheses allows us to "capture" whatever is matched within - their bounds, so if we used <b>(mail|letter|correspondence)</b> - and matched this regexp against the string "I sent you some email" - we can use the cap() or capturedTexts() functions to extract the - matched characters, in this case 'mail'. - - We can use captured text within the regexp itself. To refer to the - captured text we use \e backreferences which are indexed from 1, - the same as for cap(). For example we could search for duplicate - words in a string using <b>\b(\w+)\W+\1\b</b> which means match a - word boundary followed by one or more word characters followed by - one or more non-word characters followed by the same text as the - first parenthesised expression followed by a word boundary. - - If we want to use parentheses purely for grouping and not for - capturing we can use the non-capturing syntax, e.g. - <b>(?:green|blue)</b>. Non-capturing parentheses begin '(?:' and - end ')'. In this example we match either 'green' or 'blue' but we - do not capture the match so we only know whether or not we matched - but not which color we actually found. Using non-capturing - parentheses is more efficient than using capturing parentheses - since the regexp engine has to do less book-keeping. - - Both capturing and non-capturing parentheses may be nested. - - \target assertions - \section1 Assertions - - Assertions make some statement about the text at the point where - they occur in the regexp but they do not match any characters. In - the following list <b>\e {E}</b> stands for any expression. - - \table - \row \i <b>^</b> - \i The caret signifies the beginning of the string. If you - wish to match a literal \c{^} you must escape it by - writing <b>\^</b>. For example, <b>^#include</b> will only - match strings which \e begin with the characters '#include'. 
- (When the caret is the first character of a character set it - has a special meaning, see \link #sets-of-characters Sets of - Characters \endlink.) - - \row \i <b>$</b> - \i The dollar signifies the end of the string. For example - <b>\d\s*$</b> will match strings which end with a digit - optionally followed by whitespace. If you wish to match a - literal \c{$} you must escape it by writing - <b>\$</b>. - - \row \i <b>\\b</b> - \i A word boundary. For example the regexp - <b>\\bOK\\b</b> means match immediately after a word - boundary (e.g. start of string or whitespace) the letter 'O' - then the letter 'K' immediately before another word boundary - (e.g. end of string or whitespace). But note that the - assertion does not actually match any whitespace so if we - write <b>(\\bOK\\b)</b> and we have a match it will only - contain 'OK' even if the string is "Its <u>OK</u> now". - - \row \i <b>\\B</b> - \i A non-word boundary. This assertion is true wherever - <b>\\b</b> is false. For example if we searched for - <b>\\Bon\\B</b> in "Left on" the match would fail (space - and end of string aren't non-word boundaries), but it would - match in "t<u>on</u>ne". - - \row \i <b>(?=\e E)</b> - \i Positive lookahead. This assertion is true if the - expression matches at this point in the regexp. For example, - <b>const(?=\\s+char)</b> matches 'const' whenever it is - followed by 'char', as in 'static <u>const</u> char *'. - (Compare with <b>const\\s+char</b>, which matches 'static - <u>const char</u> *'.) - - \row \i <b>(?!\e E)</b> - \i Negative lookahead. This assertion is true if the - expression does not match at this point in the regexp. For - example, <b>const(?!\\s+char)</b> matches 'const' \e except - when it is followed by 'char'. - \endtable - - \target wildcard-matching - \section1 Wildcard Matching (globbing) - - Most command shells such as \e bash or \e cmd.exe support "file - globbing", the ability to identify a group of files by using - wildcards. 
The setWildcard() function is used to switch between - regexp and wildcard mode. Wildcard matching is much simpler than - full regexps and has only four features: - - \table - \row \i <b>c</b> - \i Any character represents itself apart from those mentioned - below. Thus <b>c</b> matches the character \e c. - \row \i <b>?</b> - \i This matches any single character. It is the same as - <b>.</b> in full regexps. - \row \i <b>*</b> - \i This matches zero or more of any characters. It is the - same as <b>.*</b> in full regexps. - \row \i <b>[...]</b> - \i Sets of characters can be represented in square brackets, - similar to full regexps. Within the character class, like - outside, backslash has no special meaning. - \endtable - - For example if we are in wildcard mode and have strings which - contain filenames we could identify HTML files with <b>*.html</b>. - This will match zero or more characters followed by a dot followed - by 'h', 't', 'm' and 'l'. - - \target perl-users - \section1 Notes for Perl Users - - Most of the character class abbreviations supported by Perl are - supported by TQRegExp, see \link - #characters-and-abbreviations-for-sets-of-characters characters - and abbreviations for sets of characters \endlink. - - In TQRegExp, apart from within character classes, \c{^} always - signifies the start of the string, so carets must always be - escaped unless used for that purpose. In Perl the meaning of caret - varies automagically depending on where it occurs so escaping it - is rarely necessary. The same applies to \c{$} which in - TQRegExp always signifies the end of the string. - - TQRegExp's quantifiers are the same as Perl's greedy quantifiers. - Non-greedy matching cannot be applied to individual quantifiers, - but can be applied to all the quantifiers in the pattern. 
For - example, to match the Perl regexp <b>ro+?m</b> requires: - \code - TQRegExp rx( "ro+m" ); - rx.setMinimal( TRUE ); - \endcode - - The equivalent of Perl's \c{/i} option is - setCaseSensitive(FALSE). - - Perl's \c{/g} option can be emulated using a \link - #cap_in_a_loop loop \endlink. - - In TQRegExp <b>.</b> matches any character, therefore all TQRegExp - regexps have the equivalent of Perl's \c{/s} option. TQRegExp - does not have an equivalent to Perl's \c{/m} option, but this - can be emulated in various ways for example by splitting the input - into lines or by looping with a regexp that searches for newlines. - - Because TQRegExp is string oriented there are no \A, \Z or \z - assertions. The \G assertion is not supported but can be emulated - in a loop. - - Perl's $& is cap(0) or capturedTexts()[0]. There are no TQRegExp - equivalents for $`, $' or $+. Perl's capturing variables, $1, $2, - ... correspond to cap(1) or capturedTexts()[1], cap(2) or - capturedTexts()[2], etc. - - To substitute a pattern use TQString::replace(). - - Perl's extended \c{/x} syntax is not supported, nor are - directives, e.g. (?i), or regexp comments, e.g. (?#comment). On - the other hand, C++'s rules for literal strings can be used to - achieve the same: - \code - TQRegExp mark( "\\b" // word boundary - "[Mm]ark" // the word we want to match - ); - \endcode - - Both zero-width positive and zero-width negative lookahead - assertions (?=pattern) and (?!pattern) are supported with the same - syntax as Perl. Perl's lookbehind assertions, "independent" - subexpressions and conditional expressions are not supported. - - Non-capturing parentheses are also supported, with the same - (?:pattern) syntax. - - See TQStringList::split() and TQStringList::join() for equivalents - to Perl's split and join functions. - - Note: because C++ transforms \\'s they must be written \e twice in - code, e.g. <b>\\b</b> must be written <b>\\\\b</b>. 
- - \target code-examples - \section1 Code Examples - - \code - TQRegExp rx( "^\\d\\d?$" ); // match integers 0 to 99 - rx.search( "123" ); // returns -1 (no match) - rx.search( "-6" ); // returns -1 (no match) - rx.search( "6" ); // returns 0 (matched as position 0) - \endcode - - The third string matches '<u>6</u>'. This is a simple validation - regexp for integers in the range 0 to 99. - - \code - TQRegExp rx( "^\\S+$" ); // match strings without whitespace - rx.search( "Hello world" ); // returns -1 (no match) - rx.search( "This_is-OK" ); // returns 0 (matched at position 0) - \endcode - - The second string matches '<u>This_is-OK</u>'. We've used the - character set abbreviation '\S' (non-whitespace) and the anchors - to match strings which contain no whitespace. - - In the following example we match strings containing 'mail' or - 'letter' or 'correspondence' but only match whole words i.e. not - 'email' - - \code - TQRegExp rx( "\\b(mail|letter|correspondence)\\b" ); - rx.search( "I sent you an email" ); // returns -1 (no match) - rx.search( "Please write the letter" ); // returns 17 - \endcode - - The second string matches "Please write the <u>letter</u>". The - word 'letter' is also captured (because of the parentheses). We - can see what text we've captured like this: - - \code - TQString captured = rx.cap( 1 ); // captured == "letter" - \endcode - - This will capture the text from the first set of capturing - parentheses (counting capturing left parentheses from left to - right). The parentheses are counted from 1 since cap( 0 ) is the - whole matched regexp (equivalent to '&' in most regexp engines). 
- - \code - TQRegExp rx( "&(?!amp;)" ); // match ampersands but not &amp; - TQString line1 = "This & that"; - line1.replace( rx, "&amp;" ); - // line1 == "This &amp; that" - TQString line2 = "His &amp; hers & theirs"; - line2.replace( rx, "&amp;" ); - // line2 == "His &amp; hers &amp; theirs" - \endcode - - Here we've passed the TQRegExp to TQString's replace() function to - replace the matched text with new text. - - \code - TQString str = "One Eric another Eirik, and an Ericsson." - " How many Eiriks, Eric?"; - TQRegExp rx( "\\b(Eric|Eirik)\\b" ); // match Eric or Eirik - int pos = 0; // where we are in the string - int count = 0; // how many Eric and Eirik's we've counted - while ( pos >= 0 ) { - pos = rx.search( str, pos ); - if ( pos >= 0 ) { - pos++; // move along in str - count++; // count our Eric or Eirik - } - } - \endcode - - We've used the search() function to repeatedly match the regexp in - the string. Note that instead of moving forward by one character - at a time \c pos++ we could have written \c {pos += - rx.matchedLength()} to skip over the already matched string. The - count will equal 3, matching 'One <u>Eric</u> another - <u>Eirik</u>, and an Ericsson. How many Eiriks, <u>Eric</u>?'; it - doesn't match 'Ericsson' or 'Eiriks' because they are not bounded - by word boundaries. - - One common use of regexps is to split lines of delimited data into - their component fields. - - \code - str = "Trolltech AS\twww.trolltech.com\tNorway"; - TQString company, web, country; - rx.setPattern( "^([^\t]+)\t([^\t]+)\t([^\t]+)$" ); - if ( rx.search( str ) != -1 ) { - company = rx.cap( 1 ); - web = rx.cap( 2 ); - country = rx.cap( 3 ); - } - \endcode - - In this example our input lines have the format company name, web - address and country. Unfortunately the regexp is rather long and - not very versatile -- the code will break if we add any more - fields. A simpler and better solution is to look for the - separator, '\t' in this case, and take the surrounding text. 
The - TQStringList split() function can take a separator string or regexp - as an argument and split a string accordingly. - - \code - TQStringList field = TQStringList::split( "\t", str ); - \endcode - - Here field[0] is the company, field[1] the web address and so on. - - To imitate the matching of a shell we can use wildcard mode. - - \code - TQRegExp rx( "*.html" ); // invalid regexp: * doesn't quantify anything - rx.setWildcard( TRUE ); // now it's a valid wildcard regexp - rx.exactMatch( "index.html" ); // returns TRUE - rx.exactMatch( "default.htm" ); // returns FALSE - rx.exactMatch( "readme.txt" ); // returns FALSE - \endcode - - Wildcard matching can be convenient because of its simplicity, but - any wildcard regexp can be defined using full regexps, e.g. - <b>.*\.html$</b>. Notice that we can't match both \c .html and \c - .htm files with a wildcard unless we use <b>*.htm*</b> which will - also match 'test.html.bak'. A full regexp gives us the precision - we need, <b>.*\\.html?$</b>. - - TQRegExp can match case insensitively using setCaseSensitive(), and - can use non-greedy matching, see setMinimal(). By default TQRegExp - uses full regexps but this can be changed with setWildcard(). - Searching can be forward with search() or backward with - searchRev(). Captured text can be accessed using capturedTexts() - which returns a string list of all captured strings, or using - cap() which returns the captured string for the given index. The - pos() function takes a match index and returns the position in the - string where the match was made (or -1 if there was no match). 
- - \sa TQRegExpValidator TQString TQStringList - - \target member-function-documentation -*/ - -const int NumBadChars = 64; -#define BadChar( ch ) ( (ch).tqunicode() % NumBadChars ) - -const int NoOccurrence = INT_MAX; -const int EmptyCapture = INT_MAX; -const int InftyLen = INT_MAX; -const int InftyRep = 1025; -const int EOS = -1; - -static bool isWord( TQChar ch ) -{ - return ch.isLetterOrNumber() || ch == TQChar( '_' ); -} - -/* - Merges two TQMemArrays of ints and puts the result into the first - one. -*/ -static void mergeInto( TQMemArray<int> *a, const TQMemArray<int>& b ) -{ - int asize = a->size(); - int bsize = b.size(); - if ( asize == 0 ) { - *a = b.copy(); -#ifndef TQT_NO_REGEXP_OPTIM - } else if ( bsize == 1 && (*a)[asize - 1] < b[0] ) { - a->resize( asize + 1 ); - (*a)[asize] = b[0]; -#endif - } else if ( bsize >= 1 ) { - int csize = asize + bsize; - TQMemArray<int> c( csize ); - int i = 0, j = 0, k = 0; - while ( i < asize ) { - if ( j < bsize ) { - if ( (*a)[i] == b[j] ) { - i++; - csize--; - } else if ( (*a)[i] < b[j] ) { - c[k++] = (*a)[i++]; - } else { - c[k++] = b[j++]; - } - } else { - memcpy( c.data() + k, (*a).data() + i, - (asize - i) * sizeof(int) ); - break; - } - } - c.resize( csize ); - if ( j < bsize ) - memcpy( c.data() + k, b.data() + j, (bsize - j) * sizeof(int) ); - *a = c; - } -} - -/* - Merges two disjoint TQMaps of (int, int) pairs and puts the result - into the first one. -*/ -static void mergeInto( TQMap<int, int> *a, const TQMap<int, int>& b ) -{ - TQMap<int, int>::ConstIterator it; - for ( it = b.begin(); it != b.end(); ++it ) - a->insert( it.key(), *it ); -} - -/* - Returns the value associated to key k in TQMap m of (int, int) - pairs, or 0 if no such value is explicitly present. 
-*/ -static int at( const TQMap<int, int>& m, int k ) -{ - TQMap<int, int>::ConstIterator it = m.find( k ); - if ( it == m.end() ) - return 0; - else - return *it; -} - -#ifndef TQT_NO_REGEXP_WILDCARD -/* - Translates a wildcard pattern to an equivalent regular expression - pattern (e.g., *.cpp to .*\.cpp). -*/ -static TQString wc2rx( const TQString& wc_str ) -{ - int wclen = wc_str.length(); - TQString rx = TQString::tqfromLatin1( "" ); - int i = 0; - const TQChar *wc = wc_str.tqunicode(); - while ( i < wclen ) { - TQChar c = wc[i++]; - switch ( c.tqunicode() ) { - case '*': - rx += TQString::tqfromLatin1( ".*" ); - break; - case '?': - rx += TQChar( '.' ); - break; - case '$': - case '(': - case ')': - case '+': - case '.': - case '\\': - case '^': - case '{': - case '|': - case '}': - rx += TQChar( '\\' ); - rx += c; - break; - case '[': - rx += c; - if ( wc[i] == TQChar('^') ) - rx += wc[i++]; - if ( i < wclen ) { - if ( rx[i] == ']' ) - rx += wc[i++]; - while ( i < wclen && wc[i] != TQChar(']') ) { - if ( wc[i] == '\\' ) - rx += TQChar( '\\' ); - rx += wc[i++]; - } - } - break; - default: - rx += c; - } - } - return rx; -} -#endif - -/* - The class TQRegExpEngine encapsulates a modified nondeterministic - finite automaton (NFA). -*/ -class TQRegExpEngine : public TQShared -{ -public: -#ifndef TQT_NO_REGEXP_CCLASS - /* - The class CharClass represents a set of characters, such as can - be found in regular expressions (e.g., [a-z] denotes the set - {a, b, ..., z}). 
*/
    class CharClass
    {
    public:
        CharClass();
        CharClass( const CharClass& cc ) { operator=( cc ); }

        CharClass& operator=( const CharClass& cc );

        void clear();
        bool negative() const { return n; }
        void setNegative( bool negative );
        void addCategories( int cats );
        void addRange( ushort from, ushort to );
        // a singleton is stored as the one-element range [ch, ch]
        void addSingleton( ushort ch ) { addRange( ch, ch ); }

        bool in( TQChar ch ) const;
#ifndef TQT_NO_REGEXP_OPTIM
        const TQMemArray<int>& firstOccurrence() const { return occ1; }
#endif

#if defined(TQT_DEBUG)
        void dump() const;
#endif

    private:
        /*
          The struct Range represents a range of characters (e.g.,
          [0-9] denotes range 48 to 57).
        */
        struct Range
        {
            ushort from; // 48
            ushort to; // 57
        };

        int c; // character classes
        TQMemArray<Range> r; // character ranges
        bool n; // negative?
#ifndef TQT_NO_REGEXP_OPTIM
        TQMemArray<int> occ1; // first-occurrence array
#endif
    };
#else
    /*
      Stand-in used when character classes are compiled out; it only
      carries the first-occurrence array needed by the optimizer.
    */
    struct CharClass
    {
        int dummy;

#ifndef TQT_NO_REGEXP_OPTIM
        CharClass() { occ1.fill( 0, NumBadChars ); }

        const TQMemArray<int>& firstOccurrence() const { return occ1; }
        TQMemArray<int> occ1;
#endif
    };
#endif

    TQRegExpEngine( bool caseSensitive ) { setup( caseSensitive ); }
    TQRegExpEngine( const TQString& rx, bool caseSensitive );
#ifndef TQT_NO_REGEXP_OPTIM
    ~TQRegExpEngine();
#endif

    bool isValid() const { return valid; }
    bool caseSensitive() const { return cs; }
    const TQString& errorString() const { return yyError; }
    int numCaptures() const { return officialncap; }
    void match( const TQString& str, int pos, bool minimal, bool oneTest,
                int caretIndex, TQMemArray<int>& captured );
    int partialMatchLength() const { return mmOneTestMatchedLen; }

    int createState( TQChar ch );
    int createState( const CharClass& cc );
#ifndef TQT_NO_REGEXP_BACKREF
    int createState( int bref );
#endif

    void addCatTransitions( const TQMemArray<int>& from,
                            const TQMemArray<int>& to );
#ifndef TQT_NO_REGEXP_CAPTURE
    void addPlusTransitions( const TQMemArray<int>& from,
                             const TQMemArray<int>& to, int atom );
#endif

#ifndef TQT_NO_REGEXP_ANCHOR_ALT
    int anchorAlternation( int a, int b );
    int anchorConcatenation( int a, int b );
#else
    // without anchor alternation support, anchors are plain bit sets
    int anchorAlternation( int a, int b ) { return a & b; }
    int anchorConcatenation( int a, int b ) { return a | b; }
#endif
    void addAnchors( int from, int to, int a );

#ifndef TQT_NO_REGEXP_OPTIM
    void heuristicallyChooseHeuristic();
#endif

#if defined(TQT_DEBUG)
    void dump() const;
#endif

private:
    // flag bits stored in State::match to tell what a state matches
    enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };

    /*
      The struct State represents one state in a modified NFA. The
      input characters matched are stored in the state instead of on
      the transitions, something possible for an automaton
      constructed from a regular expression.
    */
    struct State
    {
#ifndef TQT_NO_REGEXP_CAPTURE
        int atom; // which atom does this state belong to?
#endif
        int match; // what does it match? (see CharClassBit and BackRefBit)
        TQMemArray<int> outs; // out-transitions
        TQMap<int, int> *reenter; // atoms reentered when transiting out
        TQMap<int, int> *anchors; // anchors met when transiting out

#ifndef TQT_NO_REGEXP_CAPTURE
        State( int a, int m )
            : atom( a ), match( m ), reenter( 0 ), anchors( 0 ) { }
#else
        State( int m )
            : match( m ), reenter( 0 ), anchors( 0 ) { }
#endif
        // the two maps are allocated on demand and owned by the state
        ~State() { delete reenter; delete anchors; }
    };

#ifndef TQT_NO_REGEXP_LOOKAHEAD
    /*
      The struct Lookahead represents a lookahead a la Perl (e.g.,
      (?=foo) and (?!bar)).
    */
    struct Lookahead
    {
        TQRegExpEngine *eng; // NFA representing the embedded regular expression
        bool neg; // negative lookahead?

        Lookahead( TQRegExpEngine *eng0, bool neg0 )
            : eng( eng0 ), neg( neg0 ) { }
        // the lookahead owns its embedded engine
        ~Lookahead() { delete eng; }
    };
#endif

#ifndef TQT_NO_REGEXP_CAPTURE
    /*
      The struct Atom represents one node in the hierarchy of regular
      expression atoms.
    */
    struct Atom
    {
        int parent; // index of parent in array of atoms
        int capture; // index of capture (from 0), or -1 if not capturing
    };
#endif

#ifndef TQT_NO_REGEXP_ANCHOR_ALT
    /*
      The struct AnchorAlternation represents a pair of anchors with
      OR semantics.
    */
    struct AnchorAlternation
    {
        int a; // this anchor...
        int b; // ...or this one
    };
#endif

    enum { InitialState = 0, FinalState = 1 };
    void setup( bool caseSensitive );
    int setupState( int match );

    /*
      Let's hope that 13 lookaheads and 14 back-references are
      enough.
    */
    enum { MaxLookaheads = 13, MaxBackRefs = 14 };
    enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002,
           Anchor_Word = 0x00000004, Anchor_NonWord = 0x00000008,
           Anchor_FirstLookahead = 0x00000010,
           Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
           Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
           Anchor_Alternation = Anchor_BackRef1Empty << MaxBackRefs,

           Anchor_LookaheadMask = ( Anchor_FirstLookahead - 1 ) ^
                   ( (Anchor_FirstLookahead << MaxLookaheads) - 1 ) };
#ifndef TQT_NO_REGEXP_CAPTURE
    int startAtom( bool capture );
    // leaving an atom makes its parent the current atom again
    void finishAtom( int atom ) { cf = f[atom].parent; }
#endif

#ifndef TQT_NO_REGEXP_LOOKAHEAD
    int addLookahead( TQRegExpEngine *eng, bool negative );
#endif

#ifndef TQT_NO_REGEXP_CAPTURE
    bool isBetterCapture( const int *begin1, const int *end1, const int *begin2,
                          const int *end2 );
#endif
    bool testAnchor( int i, int a, const int *capBegin );

#ifndef TQT_NO_REGEXP_OPTIM
    bool goodStringMatch();
    bool badCharMatch();
#else
    bool bruteMatch();
#endif
    bool matchHere();

    TQPtrVector<State> s; // array of states
    int ns; // number of states
#ifndef TQT_NO_REGEXP_CAPTURE
    TQMemArray<Atom> f; // atom hierarchy
    int nf; // number of atoms
    int cf; // current atom
#endif
    int officialncap; // number of captures, seen from the outside
    int ncap; // number of captures, seen from the inside
#ifndef TQT_NO_REGEXP_CCLASS
    TQPtrVector<CharClass> cl; // array of character classes
#endif
#ifndef TQT_NO_REGEXP_LOOKAHEAD
    TQPtrVector<Lookahead> ahead; // array of lookaheads
#endif
#ifndef TQT_NO_REGEXP_ANCHOR_ALT
    TQMemArray<AnchorAlternation> aa; // array of (a, b) pairs of anchors
#endif
#ifndef TQT_NO_REGEXP_OPTIM
    bool caretAnchored; // does the regexp start with ^?
    bool trivial; // is the good-string all that needs to match?
#endif
    bool valid; // is the regular expression valid?
    bool cs; // case sensitive?
#ifndef TQT_NO_REGEXP_BACKREF
    int nbrefs; // number of back-references
#endif

#ifndef TQT_NO_REGEXP_OPTIM
    bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch

    int goodEarlyStart; // the index where goodStr can first occur in a match
    int goodLateStart; // the index where goodStr can last occur in a match
    TQString goodStr; // the string that any match has to contain

    int minl; // the minimum length of a match
    TQMemArray<int> occ1; // first-occurrence array
#endif

    /*
      The class Box is an abstraction for a regular expression
      fragment. It can also be seen as one node in the syntax tree of
      a regular expression with synthesized attributes.

      Its interface is ugly for performance reasons.
    */
    class Box
    {
    public:
        Box( TQRegExpEngine *engine );
        Box( const Box& b ) { operator=( b ); }

        Box& operator=( const Box& b );

        // resets the box to a freshly constructed one for the same engine
        void clear() { operator=( Box(eng) ); }
        void set( TQChar ch );
        void set( const CharClass& cc );
#ifndef TQT_NO_REGEXP_BACKREF
        void set( int bref );
#endif

        void cat( const Box& b );
        void orx( const Box& b );
        void plus( int atom );
        void opt();
        void catAnchor( int a );
#ifndef TQT_NO_REGEXP_OPTIM
        void setupHeuristics();
#endif

#if defined(TQT_DEBUG)
        void dump() const;
#endif

    private:
        void addAnchorsToEngine( const Box& to ) const;

        TQRegExpEngine *eng; // the automaton under construction
        TQMemArray<int> ls; // the left states (firstpos)
        TQMemArray<int> rs; // the right states (lastpos)
        TQMap<int, int> lanchors; // the left anchors
        TQMap<int, int> ranchors; // the right anchors
        int skipanchors; // the anchors to match if the box is skipped

#ifndef TQT_NO_REGEXP_OPTIM
        int earlyStart; // the index where str can first occur
        int lateStart; // the index where str can last occur
        TQString str; // a string that has to occur in any match
        TQString leftStr; // a string occurring at the left of this box
        TQString rightStr; // a string occurring at the right of this box
        int maxl; // the maximum length of this box (possibly InftyLen)
#endif

        int minl; // the minimum length of this box
#ifndef TQT_NO_REGEXP_OPTIM
        TQMemArray<int> occ1; // first-occurrence array
#endif
    };
    friend class Box;

    /*
      This is the lexical analyzer for regular expressions.
    */
    enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen,
           Tok_PosLookahead, Tok_NegLookahead, Tok_RightParen, Tok_CharClass,
           Tok_Caret, Tok_Quantifier, Tok_Bar, Tok_Word, Tok_NonWord,
           Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
    int getChar();
    int getEscape();
#ifndef TQT_NO_REGEXP_INTERVAL
    int getRep( int def );
#endif
#ifndef TQT_NO_REGEXP_LOOKAHEAD
    void skipChars( int n );
#endif
    void error( const char *msg );
    void startTokenizer( const TQChar *rx, int len );
    int getToken();

    const TQChar *yyIn; // a pointer to the input regular expression pattern
    int yyPos0; // the position of yyTok in the input pattern
    int yyPos; // the position of the next character to read
    int yyLen; // the length of yyIn
    int yyCh; // the last character read
    CharClass *yyCharClass; // attribute for Tok_CharClass tokens
    int yyMinRep; // attribute for Tok_Quantifier
    int yyMaxRep; // ditto
    TQString yyError; // syntax error or overflow during parsing?

    /*
      This is the syntactic analyzer for regular expressions.
    */
    int parse( const TQChar *rx, int len );
    void parseAtom( Box *box );
    void parseFactor( Box *box );
    void parseTerm( Box *box );
    void parseExpression( Box *box );

    int yyTok; // the last token read
    bool yyMayCapture; // set this to FALSE to disable capturing

    /*
      This is the engine state during matching.
    */
    const TQString *mmStr; // a pointer to the input TQString
    const TQChar *mmIn; // a pointer to the input string data
    int mmPos; // the current position in the string
    int mmCaretPos; // the position at which a caret (^) anchor matches
    int mmLen; // the length of the input string
    bool mmMinimal; // minimal matching?
    TQMemArray<int> mmBigArray; // big TQMemArray<int> array
    int *mmInNextStack; // is state in mmNextStack? (its index there, or -1)
    int *mmCurStack; // stack of current states
    int *mmNextStack; // stack of next states
    int *mmCurCapBegin; // start of current states' captures
    int *mmNextCapBegin; // start of next states' captures
    int *mmCurCapEnd; // end of current states' captures
    int *mmNextCapEnd; // end of next states' captures
    int *mmTempCapBegin; // start of temporary captures
    int *mmTempCapEnd; // end of temporary captures
    int *mmCapBegin; // start of captures for a next state
    int *mmCapEnd; // end of captures for a next state
    int *mmSlideTab; // bump-along slide table for bad-character heuristic
    int mmSlideTabSize; // size of slide table
#ifndef TQT_NO_REGEXP_BACKREF
    TQIntDict<int> mmSleeping; // dictionary of back-reference sleepers
#endif
    int mmMatchLen; // length of match
    int mmOneTestMatchedLen; // length of partial match
};

/*
  Builds the engine for the pattern rx. The engine is valid only if
  parse() reports having consumed the whole pattern; otherwise the
  engine is flagged as non-trivial and an error is recorded through
  error().
*/
TQRegExpEngine::TQRegExpEngine( const TQString& rx, bool caseSensitive )
#ifndef TQT_NO_REGEXP_BACKREF
    : mmSleeping( 101 )
#endif
{
    setup( caseSensitive );
    valid = ( parse(rx.tqunicode(), rx.length()) == (int) rx.length() );
    if ( !valid ) {
#ifndef TQT_NO_REGEXP_OPTIM
        trivial = FALSE;
#endif
        error( RXERR_LEFTDELIM );
    }
}

#ifndef TQT_NO_REGEXP_OPTIM
TQRegExpEngine::~TQRegExpEngine()
{
}
#endif

/*
  Tries to match in str and returns an array of (begin, length) pairs
  for captured text. If there is no match, all pairs are (-1, -1).
-*/ -void TQRegExpEngine::match( const TQString& str, int pos, bool minimal, - bool oneTest, int caretIndex, - TQMemArray<int>& captured ) -{ - bool matched = FALSE; - -#ifndef TQT_NO_REGEXP_OPTIM - if ( trivial && !oneTest ) { - mmPos = str.find( goodStr, pos, cs ); - mmMatchLen = goodStr.length(); - matched = ( mmPos != -1 ); - } else -#endif - { - mmStr = &str; - mmIn = str.tqunicode(); - if ( mmIn == 0 ) - mmIn = &TQChar::null; - mmPos = pos; - mmCaretPos = caretIndex; - mmLen = str.length(); - mmMinimal = minimal; - mmMatchLen = 0; - mmOneTestMatchedLen = 0; - - if ( valid && mmPos >= 0 && mmPos <= mmLen ) { -#ifndef TQT_NO_REGEXP_OPTIM - if ( oneTest ) { - matched = matchHere(); - } else { - if ( mmPos <= mmLen - minl ) { - if ( caretAnchored ) { - matched = matchHere(); - } else if ( useGoodStringHeuristic ) { - matched = goodStringMatch(); - } else { - matched = badCharMatch(); - } - } - } -#else - matched = oneTest ? matchHere() : bruteMatch(); -#endif - } - } - - int capturedSize = 2 + 2 * officialncap; - captured.detach(); - captured.resize( capturedSize ); - if ( matched ) { - captured[0] = mmPos; - captured[1] = mmMatchLen; - for ( int j = 0; j < officialncap; j++ ) { - int len = mmCapEnd[j] - mmCapBegin[j]; - captured[2 + 2 * j] = len > 0 ? mmPos + mmCapBegin[j] : 0; - captured[2 + 2 * j + 1] = len; - } - } else { - // we rely on 2's complement here - memset( captured.data(), -1, capturedSize * sizeof(int) ); - } -} - -/* - The three following functions add one state to the automaton and - return the number of the state. 
-*/ - -int TQRegExpEngine::createState( TQChar ch ) -{ - return setupState( ch.tqunicode() ); -} - -int TQRegExpEngine::createState( const CharClass& cc ) -{ -#ifndef TQT_NO_REGEXP_CCLASS - int n = cl.size(); - cl.resize( n + 1 ); - cl.insert( n, new CharClass(cc) ); - return setupState( CharClassBit | n ); -#else - TQ_UNUSED( cc ); - return setupState( CharClassBit ); -#endif -} - -#ifndef TQT_NO_REGEXP_BACKREF -int TQRegExpEngine::createState( int bref ) -{ - if ( bref > nbrefs ) { - nbrefs = bref; - if ( nbrefs > MaxBackRefs ) { - error( RXERR_LIMIT ); - return 0; - } - } - return setupState( BackRefBit | bref ); -} -#endif - -/* - The two following functions add a transition between all pairs of - states (i, j) where i is fond in from, and j is found in to. - - Cat-transitions are distinguished from plus-transitions for - capturing. -*/ - -void TQRegExpEngine::addCatTransitions( const TQMemArray<int>& from, - const TQMemArray<int>& to ) -{ - for ( int i = 0; i < (int) from.size(); i++ ) { - State *st = s[from[i]]; - mergeInto( &st->outs, to ); - } -} - -#ifndef TQT_NO_REGEXP_CAPTURE -void TQRegExpEngine::addPlusTransitions( const TQMemArray<int>& from, - const TQMemArray<int>& to, int atom ) -{ - for ( int i = 0; i < (int) from.size(); i++ ) { - State *st = s[from[i]]; - TQMemArray<int> oldOuts = st->outs.copy(); - mergeInto( &st->outs, to ); - if ( f[atom].capture >= 0 ) { - if ( st->reenter == 0 ) - st->reenter = new TQMap<int, int>; - for ( int j = 0; j < (int) to.size(); j++ ) { - if ( !st->reenter->contains(to[j]) && - oldOuts.bsearch(to[j]) < 0 ) - st->reenter->insert( to[j], atom ); - } - } - } -} -#endif - -#ifndef TQT_NO_REGEXP_ANCHOR_ALT -/* - Returns an anchor that means a OR b. 
-*/ -int TQRegExpEngine::anchorAlternation( int a, int b ) -{ - if ( ((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0 ) - return a & b; - - int n = aa.size(); -#ifndef TQT_NO_REGEXP_OPTIM - if ( n > 0 && aa[n - 1].a == a && aa[n - 1].b == b ) - return Anchor_Alternation | ( n - 1 ); -#endif - - aa.resize( n + 1 ); - aa[n].a = a; - aa[n].b = b; - return Anchor_Alternation | n; -} - -/* - Returns an anchor that means a AND b. -*/ -int TQRegExpEngine::anchorConcatenation( int a, int b ) -{ - if ( ((a | b) & Anchor_Alternation) == 0 ) - return a | b; - if ( (b & Anchor_Alternation) != 0 ) - tqSwap( a, b ); - - int aprime = anchorConcatenation( aa[a ^ Anchor_Alternation].a, b ); - int bprime = anchorConcatenation( aa[a ^ Anchor_Alternation].b, b ); - return anchorAlternation( aprime, bprime ); -} -#endif - -/* - Adds anchor a on a transition caracterised by its from state and - its to state. -*/ -void TQRegExpEngine::addAnchors( int from, int to, int a ) -{ - State *st = s[from]; - if ( st->anchors == 0 ) - st->anchors = new TQMap<int, int>; - if ( st->anchors->contains(to) ) - a = anchorAlternation( (*st->anchors)[to], a ); - st->anchors->insert( to, a ); -} - -#ifndef TQT_NO_REGEXP_OPTIM -/* - This function chooses between the good-string and the bad-character - heuristics. It computes two scores and chooses the heuristic with - the highest score. - - Here are some common-sense constraints on the scores that should be - respected if the formulas are ever modified: (1) If goodStr is - empty, the good-string heuristic scores 0. (2) If the regular - expression is trivial, the good-string heuristic should be used. - (3) If the search is case insensitive, the good-string heuristic - should be used, unless it scores 0. (Case insensitivity turns all - entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is - big, the good-string heuristic should score less. 
-*/ -void TQRegExpEngine::heuristicallyChooseHeuristic() -{ - if ( minl == 0 ) { - useGoodStringHeuristic = FALSE; - } else if ( trivial ) { - useGoodStringHeuristic = TRUE; - } else { - /* - Magic formula: The good string has to constitute a good - proportion of the minimum-length string, and appear at a - more-or-less known index. - */ - int goodStringScore = ( 64 * goodStr.length() / minl ) - - ( goodLateStart - goodEarlyStart ); - /* - Less magic formula: We pick some characters at random, and - check whether they are good or bad. - */ - int badCharScore = 0; - int step = TQMAX( 1, NumBadChars / 32 ); - for ( int i = 1; i < NumBadChars; i += step ) { - if ( occ1[i] == NoOccurrence ) - badCharScore += minl; - else - badCharScore += occ1[i]; - } - badCharScore /= minl; - useGoodStringHeuristic = ( goodStringScore > badCharScore ); - } -} -#endif - -#if defined(TQT_DEBUG) -void TQRegExpEngine::dump() const -{ - int i, j; - qDebug( "Case %ssensitive engine", cs ? "" : "in" ); - qDebug( " States" ); - for ( i = 0; i < ns; i++ ) { - qDebug( " %d%s", i, - i == InitialState ? " (initial)" : - i == FinalState ? 
" (final)" : "" ); -#ifndef TQT_NO_REGEXP_CAPTURE - qDebug( " in atom %d", s[i]->atom ); -#endif - int m = s[i]->match; - if ( (m & CharClassBit) != 0 ) { - qDebug( " match character class %d", m ^ CharClassBit ); -#ifndef TQT_NO_REGEXP_CCLASS - cl[m ^ CharClassBit]->dump(); -#else - qDebug( " negative character class" ); -#endif - } else if ( (m & BackRefBit) != 0 ) { - qDebug( " match back-reference %d", m ^ BackRefBit ); - } else if ( m >= 0x20 && m <= 0x7e ) { - qDebug( " match 0x%.4x (%c)", m, m ); - } else { - qDebug( " match 0x%.4x", m ); - } - for ( j = 0; j < (int) s[i]->outs.size(); j++ ) { - int next = s[i]->outs[j]; - qDebug( " -> %d", next ); - if ( s[i]->reenter != 0 && s[i]->reenter->contains(next) ) - qDebug( " [reenter %d]", (*s[i]->reenter)[next] ); - if ( s[i]->anchors != 0 && at(*s[i]->anchors, next) != 0 ) - qDebug( " [anchors 0x%.8x]", (*s[i]->anchors)[next] ); - } - } -#ifndef TQT_NO_REGEXP_CAPTURE - if ( nf > 0 ) { - qDebug( " Atom Parent Capture" ); - for ( i = 0; i < nf; i++ ) - qDebug( " %6d %6d %6d", i, f[i].parent, f[i].capture ); - } -#endif -#ifndef TQT_NO_REGEXP_ANCHOR_ALT - for ( i = 0; i < (int) aa.size(); i++ ) - qDebug( " Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, - aa[i].b ); -#endif -} -#endif - -void TQRegExpEngine::setup( bool caseSensitive ) -{ - s.setAutoDelete( TRUE ); - s.resize( 32 ); - ns = 0; -#ifndef TQT_NO_REGEXP_CAPTURE - f.resize( 32 ); - nf = 0; - cf = -1; -#endif - officialncap = 0; - ncap = 0; -#ifndef TQT_NO_REGEXP_CCLASS - cl.setAutoDelete( TRUE ); -#endif -#ifndef TQT_NO_REGEXP_LOOKAHEAD - ahead.setAutoDelete( TRUE ); -#endif -#ifndef TQT_NO_REGEXP_OPTIM - caretAnchored = TRUE; - trivial = TRUE; -#endif - valid = FALSE; - cs = caseSensitive; -#ifndef TQT_NO_REGEXP_BACKREF - nbrefs = 0; -#endif -#ifndef TQT_NO_REGEXP_OPTIM - useGoodStringHeuristic = TRUE; - minl = 0; - occ1.fill( 0, NumBadChars ); -#endif -} - -int TQRegExpEngine::setupState( int match ) -{ - if ( (ns & (ns + 1)) == 0 && ns + 1 >= 
(int) s.size() ) - s.resize( (ns + 1) << 1 ); -#ifndef TQT_NO_REGEXP_CAPTURE - s.insert( ns, new State(cf, match) ); -#else - s.insert( ns, new State(match) ); -#endif - return ns++; -} - -#ifndef TQT_NO_REGEXP_CAPTURE -/* - Functions startAtom() and finishAtom() should be called to delimit - atoms. When a state is created, it is assigned to the current atom. - The information is later used for capturing. -*/ -int TQRegExpEngine::startAtom( bool capture ) -{ - if ( (nf & (nf + 1)) == 0 && nf + 1 >= (int) f.size() ) - f.resize( (nf + 1) << 1 ); - f[nf].parent = cf; - cf = nf++; - f[cf].capture = capture ? ncap++ : -1; - return cf; -} -#endif - -#ifndef TQT_NO_REGEXP_LOOKAHEAD -/* - Creates a lookahead anchor. -*/ -int TQRegExpEngine::addLookahead( TQRegExpEngine *eng, bool negative ) -{ - int n = ahead.size(); - if ( n == MaxLookaheads ) { - error( RXERR_LIMIT ); - return 0; - } - ahead.resize( n + 1 ); - ahead.insert( n, new Lookahead(eng, negative) ); - return Anchor_FirstLookahead << n; -} -#endif - -#ifndef TQT_NO_REGEXP_CAPTURE -/* - We want the longest leftmost captures. -*/ -bool TQRegExpEngine::isBetterCapture( const int *begin1, const int *end1, - const int *begin2, const int *end2 ) -{ - for ( int i = 0; i < ncap; i++ ) { - int delta = begin2[i] - begin1[i]; // it has to start early... - if ( delta == 0 ) - delta = end1[i] - end2[i]; // ...and end late (like a party) - - if ( delta != 0 ) - return delta > 0; - } - return FALSE; -} -#endif - -/* - Returns TRUE if anchor a matches at position mmPos + i in the input - string, otherwise FALSE. 
-*/ -bool TQRegExpEngine::testAnchor( int i, int a, const int *capBegin ) -{ - int j; - -#ifndef TQT_NO_REGEXP_ANCHOR_ALT - if ( (a & Anchor_Alternation) != 0 ) { - return testAnchor( i, aa[a ^ Anchor_Alternation].a, capBegin ) || - testAnchor( i, aa[a ^ Anchor_Alternation].b, capBegin ); - } -#endif - - if ( (a & Anchor_Caret) != 0 ) { - if ( mmPos + i != mmCaretPos ) - return FALSE; - } - if ( (a & Anchor_Dollar) != 0 ) { - if ( mmPos + i != mmLen ) - return FALSE; - } -#ifndef TQT_NO_REGEXP_ESCAPE - if ( (a & (Anchor_Word | Anchor_NonWord)) != 0 ) { - bool before = FALSE; - bool after = FALSE; - if ( mmPos + i != 0 ) - before = isWord( mmIn[mmPos + i - 1] ); - if ( mmPos + i != mmLen ) - after = isWord( mmIn[mmPos + i] ); - if ( (a & Anchor_Word) != 0 && (before == after) ) - return FALSE; - if ( (a & Anchor_NonWord) != 0 && (before != after) ) - return FALSE; - } -#endif -#ifndef TQT_NO_REGEXP_LOOKAHEAD - if ( (a & Anchor_LookaheadMask) != 0 ) { - TQConstString cstr = TQConstString( (TQChar *) mmIn + mmPos + i, - mmLen - mmPos - i ); - for ( j = 0; j < (int) ahead.size(); j++ ) { - if ( (a & (Anchor_FirstLookahead << j)) != 0 ) { - TQMemArray<int> captured; - ahead[j]->eng->match( cstr.string(), 0, TRUE, TRUE, - mmCaretPos - mmPos - i, captured ); - if ( (captured[0] == 0) == ahead[j]->neg ) - return FALSE; - } - } - } -#endif -#ifndef TQT_NO_REGEXP_CAPTURE -#ifndef TQT_NO_REGEXP_BACKREF - for ( j = 0; j < nbrefs; j++ ) { - if ( (a & (Anchor_BackRef1Empty << j)) != 0 ) { - if ( capBegin[j] != EmptyCapture ) - return FALSE; - } - } -#endif -#endif - return TRUE; -} - -#ifndef TQT_NO_REGEXP_OPTIM -/* - The three following functions are what Jeffrey Friedl would call - transmissions (or bump-alongs). Using one or the other should make - no difference except in performance. 
-*/ - -bool TQRegExpEngine::goodStringMatch() -{ - int k = mmPos + goodEarlyStart; - while ( (k = mmStr->find(goodStr, k, cs)) != -1 ) { - int from = k - goodLateStart; - int to = k - goodEarlyStart; - if ( from > mmPos ) - mmPos = from; - - while ( mmPos <= to ) { - if ( matchHere() ) - return TRUE; - mmPos++; - } - k++; - } - return FALSE; -} - -bool TQRegExpEngine::badCharMatch() -{ - int slideHead = 0; - int slideNext = 0; - int i; - int lastPos = mmLen - minl; - memset( mmSlideTab, 0, mmSlideTabSize * sizeof(int) ); - - /* - Set up the slide table, used for the bad-character heuristic, - using the table of first occurrence of each character. - */ - for ( i = 0; i < minl; i++ ) { - int sk = occ1[BadChar(mmIn[mmPos + i])]; - if ( sk == NoOccurrence ) - sk = i + 1; - if ( sk > 0 ) { - int k = i + 1 - sk; - if ( k < 0 ) { - sk = i + 1; - k = 0; - } - if ( sk > mmSlideTab[k] ) - mmSlideTab[k] = sk; - } - } - - if ( mmPos > lastPos ) - return FALSE; - - for ( ;; ) { - if ( ++slideNext >= mmSlideTabSize ) - slideNext = 0; - if ( mmSlideTab[slideHead] > 0 ) { - if ( mmSlideTab[slideHead] - 1 > mmSlideTab[slideNext] ) - mmSlideTab[slideNext] = mmSlideTab[slideHead] - 1; - mmSlideTab[slideHead] = 0; - } else { - if ( matchHere() ) - return TRUE; - } - - if ( mmPos == lastPos ) - break; - - /* - Update the slide table. This code has much in common with - the initialization code. - */ - int sk = occ1[BadChar(mmIn[mmPos + minl])]; - if ( sk == NoOccurrence ) { - mmSlideTab[slideNext] = minl; - } else if ( sk > 0 ) { - int k = slideNext + minl - sk; - if ( k >= mmSlideTabSize ) - k -= mmSlideTabSize; - if ( sk > mmSlideTab[k] ) - mmSlideTab[k] = sk; - } - slideHead = slideNext; - mmPos++; - } - return FALSE; -} -#else -bool TQRegExpEngine::bruteMatch() -{ - while ( mmPos <= mmLen ) { - if ( matchHere() ) - return TRUE; - mmPos++; - } - return FALSE; -} -#endif - -/* - Here's the core of the engine. It tries to do a match here and now. 
-*/ -bool TQRegExpEngine::matchHere() -{ - int ncur = 1, nnext = 0; - int i = 0, j, k, m; - bool stop = FALSE; - - mmMatchLen = -1; - mmOneTestMatchedLen = -1; - mmCurStack[0] = InitialState; - -#ifndef TQT_NO_REGEXP_CAPTURE - if ( ncap > 0 ) { - for ( j = 0; j < ncap; j++ ) { - mmCurCapBegin[j] = EmptyCapture; - mmCurCapEnd[j] = EmptyCapture; - } - } -#endif - -#ifndef TQT_NO_REGEXP_BACKREF - int *zzZ = 0; - - while ( (ncur > 0 || !mmSleeping.isEmpty()) && i <= mmLen - mmPos && - !stop ) -#else - while ( ncur > 0 && i <= mmLen - mmPos && !stop ) -#endif - { - int ch = ( i < mmLen - mmPos ) ? mmIn[mmPos + i].tqunicode() : 0; - for ( j = 0; j < ncur; j++ ) { - int cur = mmCurStack[j]; - State *scur = s[cur]; - TQMemArray<int>& outs = scur->outs; - for ( k = 0; k < (int) outs.size(); k++ ) { - int next = outs[k]; - State *snext = s[next]; - bool in = TRUE; -#ifndef TQT_NO_REGEXP_BACKREF - int needSomeSleep = 0; -#endif - - /* - First, check if the anchors are anchored properly. - */ - if ( scur->anchors != 0 ) { - int a = at( *scur->anchors, next ); - if ( a != 0 && !testAnchor(i, a, mmCurCapBegin + j * ncap) ) - in = FALSE; - } - /* - If indeed they are, check if the input character is - correct for this transition. 
- */ - if ( in ) { - m = snext->match; - if ( (m & (CharClassBit | BackRefBit)) == 0 ) { - if ( cs ) - in = ( m == ch ); - else - in = ( TQChar(m).lower() == TQChar(ch).lower() ); - } else if ( next == FinalState ) { - mmMatchLen = i; - stop = mmMinimal; - in = TRUE; - } else if ( (m & CharClassBit) != 0 ) { -#ifndef TQT_NO_REGEXP_CCLASS - const CharClass *cc = cl[m ^ CharClassBit]; - if ( cs ) - in = cc->in( ch ); - else if ( cc->negative() ) - in = cc->in( TQChar(ch).lower() ) && - cc->in( TQChar(ch).upper() ); - else - in = cc->in( TQChar(ch).lower() ) || - cc->in( TQChar(ch).upper() ); -#endif -#ifndef TQT_NO_REGEXP_BACKREF - } else { /* ( (m & BackRefBit) != 0 ) */ - int bref = m ^ BackRefBit; - int ell = j * ncap + ( bref - 1 ); - - in = bref <= ncap && mmCurCapBegin[ell] != EmptyCapture; - if ( in ) { - if ( cs ) - in = ( mmIn[mmPos + mmCurCapBegin[ell]] - == TQChar(ch) ); - else - in = ( mmIn[mmPos + mmCurCapBegin[ell]].lower() - == TQChar(ch).lower() ); - } - - if ( in ) { - int delta; - if ( mmCurCapEnd[ell] == EmptyCapture ) - delta = i - mmCurCapBegin[ell]; - else - delta = mmCurCapEnd[ell] - mmCurCapBegin[ell]; - - in = ( delta <= mmLen - (mmPos + i) ); - if ( in && delta > 1 ) { - int n = 1; - if ( cs ) { - while ( n < delta ) { - if ( mmIn[mmPos + - mmCurCapBegin[ell] + n] != - mmIn[mmPos + i + n] ) - break; - n++; - } - } else { - while ( n < delta ) { - TQChar a = mmIn[mmPos + - mmCurCapBegin[ell] + n]; - TQChar b = mmIn[mmPos + i + n]; - if ( a.lower() != b.lower() ) - break; - n++; - } - } - in = ( n == delta ); - if ( in ) - needSomeSleep = delta - 1; - } - } -#endif - } - } - - /* - We must now update our data structures. - */ - if ( in ) { -#ifndef TQT_NO_REGEXP_CAPTURE - int *capBegin, *capEnd; -#endif - /* - If the next state was not encountered yet, all - is fine. 
- */ - if ( (m = mmInNextStack[next]) == -1 ) { - m = nnext++; - mmNextStack[m] = next; - mmInNextStack[next] = m; -#ifndef TQT_NO_REGEXP_CAPTURE - capBegin = mmNextCapBegin + m * ncap; - capEnd = mmNextCapEnd + m * ncap; - - /* - Otherwise, we'll first maintain captures in - temporary arrays, and decide at the end whether - it's best to keep the previous capture zones or - the new ones. - */ - } else { - capBegin = mmTempCapBegin; - capEnd = mmTempCapEnd; -#endif - } - -#ifndef TQT_NO_REGEXP_CAPTURE - /* - Updating the capture zones is much of a task. - */ - if ( ncap > 0 ) { - memcpy( capBegin, mmCurCapBegin + j * ncap, - ncap * sizeof(int) ); - memcpy( capEnd, mmCurCapEnd + j * ncap, - ncap * sizeof(int) ); - int c = scur->atom, n = snext->atom; - int p = -1, q = -1; - int cap; - - /* - Lemma 1. For any x in the range [0..nf), we - have f[x].parent < x. - - Proof. By looking at startAtom(), it is - clear that cf < nf holds all the time, and - thus that f[nf].parent < nf. - */ - - /* - If we are reentering an atom, we empty all - capture zones inside it. - */ - if ( scur->reenter != 0 && - (q = at(*scur->reenter, next)) != 0 ) { - TQBitArray b; - b.fill( FALSE, nf ); - b.setBit( q, TRUE ); - for ( int ell = q + 1; ell < nf; ell++ ) { - if ( b.testBit(f[ell].parent) ) { - b.setBit( ell, TRUE ); - cap = f[ell].capture; - if ( cap >= 0 ) { - capBegin[cap] = EmptyCapture; - capEnd[cap] = EmptyCapture; - } - } - } - p = f[q].parent; - - /* - Otherwise, close the capture zones we are - leaving. We are leaving f[c].capture, - f[f[c].parent].capture, - f[f[f[c].parent].parent].capture, ..., - until f[x].capture, with x such that - f[x].parent is the youngest common ancestor - for c and n. - - We go up along c's and n's ancestry until - we find x. 
- */ - } else { - p = c; - q = n; - while ( p != q ) { - if ( p > q ) { - cap = f[p].capture; - if ( cap >= 0 ) { - if ( capBegin[cap] == i ) { - capBegin[cap] = EmptyCapture; - capEnd[cap] = EmptyCapture; - } else { - capEnd[cap] = i; - } - } - p = f[p].parent; - } else { - q = f[q].parent; - } - } - } - - /* - In any case, we now open the capture zones - we are entering. We work upwards from n - until we reach p (the parent of the atom we - reenter or the youngest common ancestor). - */ - while ( n > p ) { - cap = f[n].capture; - if ( cap >= 0 ) { - capBegin[cap] = i; - capEnd[cap] = EmptyCapture; - } - n = f[n].parent; - } - /* - If the next state was already in - mmNextStack, we must choose carefully which - capture zones we want to keep. - */ - if ( capBegin == mmTempCapBegin && - isBetterCapture(capBegin, capEnd, - mmNextCapBegin + m * ncap, - mmNextCapEnd + m * ncap) ) { - memcpy( mmNextCapBegin + m * ncap, capBegin, - ncap * sizeof(int) ); - memcpy( mmNextCapEnd + m * ncap, capEnd, - ncap * sizeof(int) ); - } - } -#ifndef TQT_NO_REGEXP_BACKREF - /* - We are done with updating the capture zones. - It's now time to put the next state to sleep, - if it needs to, and to remove it from - mmNextStack. - */ - if ( needSomeSleep > 0 ) { - zzZ = new int[1 + 2 * ncap]; - zzZ[0] = next; - if ( ncap > 0 ) { - memcpy( zzZ + 1, capBegin, ncap * sizeof(int) ); - memcpy( zzZ + 1 + ncap, capEnd, - ncap * sizeof(int) ); - } - mmInNextStack[mmNextStack[--nnext]] = -1; - mmSleeping.insert( i + needSomeSleep, zzZ ); - } -#endif -#endif - } - } - } -#ifndef TQT_NO_REGEXP_CAPTURE - /* - If we reached the final state, hurray! Copy the captured - zone. - */ - if ( ncap > 0 && (m = mmInNextStack[FinalState]) != -1 ) { - memcpy( mmCapBegin, mmNextCapBegin + m * ncap, ncap * sizeof(int) ); - memcpy( mmCapEnd, mmNextCapEnd + m * ncap, ncap * sizeof(int) ); - } -#ifndef TQT_NO_REGEXP_BACKREF - /* - It's time to wake up the sleepers. 
- */ - if ( !mmSleeping.isEmpty() ) { - while ( (zzZ = mmSleeping.take(i)) != 0 ) { - int next = zzZ[0]; - int *capBegin = zzZ + 1; - int *capEnd = zzZ + 1 + ncap; - bool copyOver = TRUE; - - if ( (m = mmInNextStack[zzZ[0]]) == -1 ) { - m = nnext++; - mmNextStack[m] = next; - mmInNextStack[next] = m; - } else { - copyOver = isBetterCapture( mmNextCapBegin + m * ncap, - mmNextCapEnd + m * ncap, - capBegin, capEnd ); - } - if ( copyOver ) { - memcpy( mmNextCapBegin + m * ncap, capBegin, - ncap * sizeof(int) ); - memcpy( mmNextCapEnd + m * ncap, capEnd, - ncap * sizeof(int) ); - } - delete[] zzZ; - } - } -#endif -#endif - for ( j = 0; j < nnext; j++ ) - mmInNextStack[mmNextStack[j]] = -1; - - // avoid needless iteration that confuses mmOneTestMatchedLen - if ( nnext == 1 && mmNextStack[0] == FinalState -#ifndef TQT_NO_REGEXP_BACKREF - && mmSleeping.isEmpty() -#endif - ) - stop = TRUE; - - tqSwap( mmCurStack, mmNextStack ); -#ifndef TQT_NO_REGEXP_CAPTURE - tqSwap( mmCurCapBegin, mmNextCapBegin ); - tqSwap( mmCurCapEnd, mmNextCapEnd ); -#endif - ncur = nnext; - nnext = 0; - i++; - } - -#ifndef TQT_NO_REGEXP_BACKREF - /* - If minimal matching is enabled, we might have some sleepers - left. 
- */ - while ( !mmSleeping.isEmpty() ) { - zzZ = mmSleeping.take( *TQIntDictIterator<int>(mmSleeping) ); - delete[] zzZ; - } -#endif - - mmOneTestMatchedLen = i - 1; - return ( mmMatchLen >= 0 ); -} - -#ifndef TQT_NO_REGEXP_CCLASS - -TQRegExpEngine::CharClass::CharClass() - : c( 0 ), n( FALSE ) -{ -#ifndef TQT_NO_REGEXP_OPTIM - occ1.fill( NoOccurrence, NumBadChars ); -#endif -} - -TQRegExpEngine::CharClass& TQRegExpEngine::CharClass::operator=( - const CharClass& cc ) -{ - c = cc.c; - r = cc.r.copy(); - n = cc.n; -#ifndef TQT_NO_REGEXP_OPTIM - occ1 = cc.occ1; -#endif - return *this; -} - -void TQRegExpEngine::CharClass::clear() -{ - c = 0; - r.resize( 0 ); - n = FALSE; -} - -void TQRegExpEngine::CharClass::setNegative( bool negative ) -{ - n = negative; -#ifndef TQT_NO_REGEXP_OPTIM - occ1.fill( 0, NumBadChars ); -#endif -} - -void TQRegExpEngine::CharClass::addCategories( int cats ) -{ - c |= cats; -#ifndef TQT_NO_REGEXP_OPTIM - occ1.fill( 0, NumBadChars ); -#endif -} - -void TQRegExpEngine::CharClass::addRange( ushort from, ushort to ) -{ - if ( from > to ) - tqSwap( from, to ); - int m = r.size(); - r.resize( m + 1 ); - r[m].from = from; - r[m].to = to; - -#ifndef TQT_NO_REGEXP_OPTIM - int i; - - if ( to - from < NumBadChars ) { - occ1.detach(); - if ( from % NumBadChars <= to % NumBadChars ) { - for ( i = from % NumBadChars; i <= to % NumBadChars; i++ ) - occ1[i] = 0; - } else { - for ( i = 0; i <= to % NumBadChars; i++ ) - occ1[i] = 0; - for ( i = from % NumBadChars; i < NumBadChars; i++ ) - occ1[i] = 0; - } - } else { - occ1.fill( 0, NumBadChars ); - } -#endif -} - -bool TQRegExpEngine::CharClass::in( TQChar ch ) const -{ -#ifndef TQT_NO_REGEXP_OPTIM - if ( occ1[BadChar(ch)] == NoOccurrence ) - return n; -#endif - - if ( c != 0 && (c & (1 << (int) ch.category())) != 0 ) - return !n; - for ( int i = 0; i < (int) r.size(); i++ ) { - if ( ch.tqunicode() >= r[i].from && ch.tqunicode() <= r[i].to ) - return !n; - } - return n; -} - -#if defined(TQT_DEBUG) -void 
TQRegExpEngine::CharClass::dump() const -{ - int i; - qDebug( " %stive character class", n ? "nega" : "posi" ); -#ifndef TQT_NO_REGEXP_CCLASS - if ( c != 0 ) - qDebug( " categories 0x%.8x", c ); -#endif - for ( i = 0; i < (int) r.size(); i++ ) - qDebug( " 0x%.4x through 0x%.4x", r[i].from, r[i].to ); -} -#endif -#endif - -TQRegExpEngine::Box::Box( TQRegExpEngine *engine ) - : eng( engine ), skipanchors( 0 ) -#ifndef TQT_NO_REGEXP_OPTIM - , earlyStart( 0 ), lateStart( 0 ), maxl( 0 ) -#endif -{ -#ifndef TQT_NO_REGEXP_OPTIM - occ1.fill( NoOccurrence, NumBadChars ); -#endif - minl = 0; -} - -TQRegExpEngine::Box& TQRegExpEngine::Box::operator=( const Box& b ) -{ - eng = b.eng; - ls = b.ls; - rs = b.rs; - lanchors = b.lanchors; - ranchors = b.ranchors; - skipanchors = b.skipanchors; -#ifndef TQT_NO_REGEXP_OPTIM - earlyStart = b.earlyStart; - lateStart = b.lateStart; - str = b.str; - leftStr = b.leftStr; - rightStr = b.rightStr; - maxl = b.maxl; - occ1 = b.occ1; -#endif - minl = b.minl; - return *this; -} - -void TQRegExpEngine::Box::set( TQChar ch ) -{ - ls.resize( 1 ); - ls[0] = eng->createState( ch ); - rs = ls; - rs.detach(); -#ifndef TQT_NO_REGEXP_OPTIM - str = ch; - leftStr = ch; - rightStr = ch; - maxl = 1; - occ1.detach(); - occ1[BadChar(ch)] = 0; -#endif - minl = 1; -} - -void TQRegExpEngine::Box::set( const CharClass& cc ) -{ - ls.resize( 1 ); - ls[0] = eng->createState( cc ); - rs = ls; - rs.detach(); -#ifndef TQT_NO_REGEXP_OPTIM - maxl = 1; - occ1 = cc.firstOccurrence(); -#endif - minl = 1; -} - -#ifndef TQT_NO_REGEXP_BACKREF -void TQRegExpEngine::Box::set( int bref ) -{ - ls.resize( 1 ); - ls[0] = eng->createState( bref ); - rs = ls; - rs.detach(); - if ( bref >= 1 && bref <= MaxBackRefs ) - skipanchors = Anchor_BackRef0Empty << bref; -#ifndef TQT_NO_REGEXP_OPTIM - maxl = InftyLen; -#endif - minl = 0; -} -#endif - -void TQRegExpEngine::Box::cat( const Box& b ) -{ - eng->addCatTransitions( rs, b.ls ); - addAnchorsToEngine( b ); - if ( minl == 0 ) { - 
mergeInto( &lanchors, b.lanchors ); - if ( skipanchors != 0 ) { - for ( int i = 0; i < (int) b.ls.size(); i++ ) { - int a = eng->anchorConcatenation( at(lanchors, b.ls[i]), - skipanchors ); - lanchors.insert( b.ls[i], a ); - } - } - mergeInto( &ls, b.ls ); - } - if ( b.minl == 0 ) { - mergeInto( &ranchors, b.ranchors ); - if ( b.skipanchors != 0 ) { - for ( int i = 0; i < (int) rs.size(); i++ ) { - int a = eng->anchorConcatenation( at(ranchors, rs[i]), - b.skipanchors ); - ranchors.insert( rs[i], a ); - } - } - mergeInto( &rs, b.rs ); - } else { - ranchors = b.ranchors; - rs = b.rs; - } - -#ifndef TQT_NO_REGEXP_OPTIM - if ( maxl != InftyLen ) { - if ( rightStr.length() + b.leftStr.length() > - TQMAX(str.length(), b.str.length()) ) { - earlyStart = minl - rightStr.length(); - lateStart = maxl - rightStr.length(); - str = rightStr + b.leftStr; - } else if ( b.str.length() > str.length() ) { - earlyStart = minl + b.earlyStart; - lateStart = maxl + b.lateStart; - str = b.str; - } - } - - if ( (int) leftStr.length() == maxl ) - leftStr += b.leftStr; - - if ( (int) b.rightStr.length() == b.maxl ) { - rightStr += b.rightStr; - } else { - rightStr = b.rightStr; - } - - if ( maxl == InftyLen || b.maxl == InftyLen ) { - maxl = InftyLen; - } else { - maxl += b.maxl; - } - - occ1.detach(); - for ( int i = 0; i < NumBadChars; i++ ) { - if ( b.occ1[i] != NoOccurrence && minl + b.occ1[i] < occ1[i] ) - occ1[i] = minl + b.occ1[i]; - } -#endif - - minl += b.minl; - if ( minl == 0 ) - skipanchors = eng->anchorConcatenation( skipanchors, b.skipanchors ); - else - skipanchors = 0; -} - -void TQRegExpEngine::Box::orx( const Box& b ) -{ - mergeInto( &ls, b.ls ); - mergeInto( &lanchors, b.lanchors ); - mergeInto( &rs, b.rs ); - mergeInto( &ranchors, b.ranchors ); - - if ( b.minl == 0 ) { - if ( minl == 0 ) - skipanchors = eng->anchorAlternation( skipanchors, b.skipanchors ); - else - skipanchors = b.skipanchors; - } - -#ifndef TQT_NO_REGEXP_OPTIM - occ1.detach(); - for ( int i = 0; i < 
NumBadChars; i++ ) { - if ( occ1[i] > b.occ1[i] ) - occ1[i] = b.occ1[i]; - } - earlyStart = 0; - lateStart = 0; - str = TQString(); - leftStr = TQString(); - rightStr = TQString(); - if ( b.maxl > maxl ) - maxl = b.maxl; -#endif - if ( b.minl < minl ) - minl = b.minl; -} - -void TQRegExpEngine::Box::plus( int atom ) -{ -#ifndef TQT_NO_REGEXP_CAPTURE - eng->addPlusTransitions( rs, ls, atom ); -#else - TQ_UNUSED( atom ); - eng->addCatTransitions( rs, ls ); -#endif - addAnchorsToEngine( *this ); -#ifndef TQT_NO_REGEXP_OPTIM - maxl = InftyLen; -#endif -} - -void TQRegExpEngine::Box::opt() -{ -#ifndef TQT_NO_REGEXP_OPTIM - earlyStart = 0; - lateStart = 0; - str = TQString(); - leftStr = TQString(); - rightStr = TQString(); -#endif - skipanchors = 0; - minl = 0; -} - -void TQRegExpEngine::Box::catAnchor( int a ) -{ - if ( a != 0 ) { - for ( int i = 0; i < (int) rs.size(); i++ ) { - a = eng->anchorConcatenation( at(ranchors, rs[i]), a ); - ranchors.insert( rs[i], a ); - } - if ( minl == 0 ) - skipanchors = eng->anchorConcatenation( skipanchors, a ); - } -} - -#ifndef TQT_NO_REGEXP_OPTIM -void TQRegExpEngine::Box::setupHeuristics() -{ - eng->goodEarlyStart = earlyStart; - eng->goodLateStart = lateStart; - eng->goodStr = eng->cs ? str : str.lower(); - - eng->minl = minl; - if ( eng->cs ) { - /* - A regular expression such as 112|1 has occ1['2'] = 2 and minl = - 1 at this point. An entry of occ1 has to be at most minl or - infinity for the rest of the algorithm to go well. - - We waited until here before normalizing these cases (instead of - doing it in Box::orx()) because sometimes things improve by - themselves. Consider for example (112|1)34. 
- */ - for ( int i = 0; i < NumBadChars; i++ ) { - if ( occ1[i] != NoOccurrence && occ1[i] >= minl ) - occ1[i] = minl; - } - eng->occ1 = occ1; - } else { - eng->occ1.fill( 0, NumBadChars ); - } - - eng->heuristicallyChooseHeuristic(); -} -#endif - -#if defined(TQT_DEBUG) -void TQRegExpEngine::Box::dump() const -{ - int i; - qDebug( "Box of at least %d character%s", minl, minl == 1 ? "" : "s" ); - qDebug( " Left states:" ); - for ( i = 0; i < (int) ls.size(); i++ ) { - if ( at(lanchors, ls[i]) == 0 ) - qDebug( " %d", ls[i] ); - else - qDebug( " %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]] ); - } - qDebug( " Right states:" ); - for ( i = 0; i < (int) rs.size(); i++ ) { - if ( at(ranchors, rs[i]) == 0 ) - qDebug( " %d", rs[i] ); - else - qDebug( " %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]] ); - } - qDebug( " Skip anchors: 0x%.8x", skipanchors ); -} -#endif - -void TQRegExpEngine::Box::addAnchorsToEngine( const Box& to ) const -{ - for ( int i = 0; i < (int) to.ls.size(); i++ ) { - for ( int j = 0; j < (int) rs.size(); j++ ) { - int a = eng->anchorConcatenation( at(ranchors, rs[j]), - at(to.lanchors, to.ls[i]) ); - eng->addAnchors( rs[j], to.ls[i], a ); - } - } -} - -int TQRegExpEngine::getChar() -{ - return ( yyPos == yyLen ) ? 
EOS : yyIn[yyPos++].tqunicode(); -} - -int TQRegExpEngine::getEscape() -{ -#ifndef TQT_NO_REGEXP_ESCAPE - const char tab[] = "afnrtv"; // no b, as \b means word boundary - const char backTab[] = "\a\f\n\r\t\v"; - ushort low; - int i; -#endif - ushort val; - int prevCh = yyCh; - - if ( prevCh == EOS ) { - error( RXERR_END ); - return Tok_Char | '\\'; - } - yyCh = getChar(); -#ifndef TQT_NO_REGEXP_ESCAPE - if ( (prevCh & ~0xff) == 0 ) { - const char *p = strchr( tab, prevCh ); - if ( p != 0 ) - return Tok_Char | backTab[p - tab]; - } -#endif - - switch ( prevCh ) { -#ifndef TQT_NO_REGEXP_ESCAPE - case '0': - val = 0; - for ( i = 0; i < 3; i++ ) { - if ( yyCh >= '0' && yyCh <= '7' ) - val = ( val << 3 ) | ( yyCh - '0' ); - else - break; - yyCh = getChar(); - } - if ( (val & ~0377) != 0 ) - error( RXERR_OCTAL ); - return Tok_Char | val; -#endif -#ifndef TQT_NO_REGEXP_ESCAPE - case 'B': - return Tok_NonWord; -#endif -#ifndef TQT_NO_REGEXP_CCLASS - case 'D': - // see TQChar::isDigit() - yyCharClass->addCategories( 0x7fffffef ); - return Tok_CharClass; - case 'S': - // see TQChar::isSpace() - yyCharClass->addCategories( 0x7ffff87f ); - yyCharClass->addRange( 0x0000, 0x0008 ); - yyCharClass->addRange( 0x000e, 0x001f ); - yyCharClass->addRange( 0x007f, 0x009f ); - return Tok_CharClass; - case 'W': - // see TQChar::isLetterOrNumber() - yyCharClass->addCategories( 0x7fe07f8f ); - yyCharClass->addRange( 0x203f, 0x2040 ); - yyCharClass->addSingleton( 0x2040 ); - yyCharClass->addSingleton( 0x30fb ); - yyCharClass->addRange( 0xfe33, 0xfe34 ); - yyCharClass->addRange( 0xfe4d, 0xfe4f ); - yyCharClass->addSingleton( 0xff3f ); - yyCharClass->addSingleton( 0xff65 ); - return Tok_CharClass; -#endif -#ifndef TQT_NO_REGEXP_ESCAPE - case 'b': - return Tok_Word; -#endif -#ifndef TQT_NO_REGEXP_CCLASS - case 'd': - // see TQChar::isDigit() - yyCharClass->addCategories( 0x00000010 ); - return Tok_CharClass; - case 's': - // see TQChar::isSpace() - yyCharClass->addCategories( 0x00000380 ); - 
yyCharClass->addRange( 0x0009, 0x000d ); - return Tok_CharClass; - case 'w': - // see TQChar::isLetterOrNumber() - yyCharClass->addCategories( 0x000f8070 ); - yyCharClass->addSingleton( 0x005f ); // '_' - return Tok_CharClass; -#endif -#ifndef TQT_NO_REGEXP_ESCAPE - case 'x': - val = 0; - for ( i = 0; i < 4; i++ ) { - low = TQChar( yyCh ).lower(); - if ( low >= '0' && low <= '9' ) - val = ( val << 4 ) | ( low - '0' ); - else if ( low >= 'a' && low <= 'f' ) - val = ( val << 4 ) | ( low - 'a' + 10 ); - else - break; - yyCh = getChar(); - } - return Tok_Char | val; -#endif - default: - if ( prevCh >= '1' && prevCh <= '9' ) { -#ifndef TQT_NO_REGEXP_BACKREF - val = prevCh - '0'; - while ( yyCh >= '0' && yyCh <= '9' ) { - val = ( val * 10 ) + ( yyCh - '0' ); - yyCh = getChar(); - } - return Tok_BackRef | val; -#else - error( RXERR_DISABLED ); -#endif - } - return Tok_Char | prevCh; - } -} - -#ifndef TQT_NO_REGEXP_INTERVAL -int TQRegExpEngine::getRep( int def ) -{ - if ( yyCh >= '0' && yyCh <= '9' ) { - int rep = 0; - do { - rep = 10 * rep + yyCh - '0'; - if ( rep >= InftyRep ) { - error( RXERR_REPETITION ); - rep = def; - } - yyCh = getChar(); - } while ( yyCh >= '0' && yyCh <= '9' ); - return rep; - } else { - return def; - } -} -#endif - -#ifndef TQT_NO_REGEXP_LOOKAHEAD -void TQRegExpEngine::skipChars( int n ) -{ - if ( n > 0 ) { - yyPos += n - 1; - yyCh = getChar(); - } -} -#endif - -void TQRegExpEngine::error( const char *msg ) -{ - if ( yyError.isEmpty() ) - yyError = TQString::tqfromLatin1( msg ); -} - -void TQRegExpEngine::startTokenizer( const TQChar *rx, int len ) -{ - yyIn = rx; - yyPos0 = 0; - yyPos = 0; - yyLen = len; - yyCh = getChar(); - yyCharClass = new CharClass; - yyMinRep = 0; - yyMaxRep = 0; - yyError = TQString(); -} - -int TQRegExpEngine::getToken() -{ -#ifndef TQT_NO_REGEXP_CCLASS - ushort pendingCh = 0; - bool charPending; - bool rangePending; - int tok; -#endif - int prevCh = yyCh; - - yyPos0 = yyPos - 1; -#ifndef TQT_NO_REGEXP_CCLASS - 
yyCharClass->clear(); -#endif - yyMinRep = 0; - yyMaxRep = 0; - yyCh = getChar(); - - switch ( prevCh ) { - case EOS: - yyPos0 = yyPos; - return Tok_Eos; - case '$': - return Tok_Dollar; - case '(': - if ( yyCh == '?' ) { - prevCh = getChar(); - yyCh = getChar(); - switch ( prevCh ) { -#ifndef TQT_NO_REGEXP_LOOKAHEAD - case '!': - return Tok_NegLookahead; - case '=': - return Tok_PosLookahead; -#endif - case ':': - return Tok_MagicLeftParen; - default: - error( RXERR_LOOKAHEAD ); - return Tok_MagicLeftParen; - } - } else { - return Tok_LeftParen; - } - case ')': - return Tok_RightParen; - case '*': - yyMinRep = 0; - yyMaxRep = InftyRep; - return Tok_Quantifier; - case '+': - yyMinRep = 1; - yyMaxRep = InftyRep; - return Tok_Quantifier; - case '.': -#ifndef TQT_NO_REGEXP_CCLASS - yyCharClass->setNegative( TRUE ); -#endif - return Tok_CharClass; - case '?': - yyMinRep = 0; - yyMaxRep = 1; - return Tok_Quantifier; - case '[': -#ifndef TQT_NO_REGEXP_CCLASS - if ( yyCh == '^' ) { - yyCharClass->setNegative( TRUE ); - yyCh = getChar(); - } - charPending = FALSE; - rangePending = FALSE; - do { - if ( yyCh == '-' && charPending && !rangePending ) { - rangePending = TRUE; - yyCh = getChar(); - } else { - if ( charPending && !rangePending ) { - yyCharClass->addSingleton( pendingCh ); - charPending = FALSE; - } - if ( yyCh == '\\' ) { - yyCh = getChar(); - tok = getEscape(); - if ( tok == Tok_Word ) - tok = '\b'; - } else { - tok = Tok_Char | yyCh; - yyCh = getChar(); - } - if ( tok == Tok_CharClass ) { - if ( rangePending ) { - yyCharClass->addSingleton( '-' ); - yyCharClass->addSingleton( pendingCh ); - charPending = FALSE; - rangePending = FALSE; - } - } else if ( (tok & Tok_Char) != 0 ) { - if ( rangePending ) { - yyCharClass->addRange( pendingCh, tok ^ Tok_Char ); - charPending = FALSE; - rangePending = FALSE; - } else { - pendingCh = tok ^ Tok_Char; - charPending = TRUE; - } - } else { - error( RXERR_CHARCLASS ); - } - } - } while ( yyCh != ']' && yyCh != EOS ); - if ( 
rangePending ) - yyCharClass->addSingleton( '-' ); - if ( charPending ) - yyCharClass->addSingleton( pendingCh ); - if ( yyCh == EOS ) - error( RXERR_END ); - else - yyCh = getChar(); - return Tok_CharClass; -#else - error( RXERR_END ); - return Tok_Char | '['; -#endif - case '\\': - return getEscape(); - case ']': - error( RXERR_LEFTDELIM ); - return Tok_Char | ']'; - case '^': - return Tok_Caret; - case '{': -#ifndef TQT_NO_REGEXP_INTERVAL - yyMinRep = getRep( 0 ); - yyMaxRep = yyMinRep; - if ( yyCh == ',' ) { - yyCh = getChar(); - yyMaxRep = getRep( InftyRep ); - } - if ( yyMaxRep < yyMinRep ) - tqSwap( yyMinRep, yyMaxRep ); - if ( yyCh != '}' ) - error( RXERR_REPETITION ); - yyCh = getChar(); - return Tok_Quantifier; -#else - error( RXERR_DISABLED ); - return Tok_Char | '{'; -#endif - case '|': - return Tok_Bar; - case '}': - error( RXERR_LEFTDELIM ); - return Tok_Char | '}'; - default: - return Tok_Char | prevCh; - } -} - -int TQRegExpEngine::parse( const TQChar *pattern, int len ) -{ - valid = TRUE; - startTokenizer( pattern, len ); - yyTok = getToken(); -#ifndef TQT_NO_REGEXP_CAPTURE - yyMayCapture = TRUE; -#else - yyMayCapture = FALSE; -#endif - -#ifndef TQT_NO_REGEXP_CAPTURE - int atom = startAtom( FALSE ); -#endif - CharClass anything; - Box box( this ); // create InitialState - box.set( anything ); - Box rightBox( this ); // create FinalState - rightBox.set( anything ); - - Box middleBox( this ); - parseExpression( &middleBox ); -#ifndef TQT_NO_REGEXP_CAPTURE - finishAtom( atom ); -#endif -#ifndef TQT_NO_REGEXP_OPTIM - middleBox.setupHeuristics(); -#endif - box.cat( middleBox ); - box.cat( rightBox ); - delete yyCharClass; - yyCharClass = 0; - - officialncap = ncap; -#ifndef TQT_NO_REGEXP_BACKREF - if ( nbrefs > ncap ) - ncap = nbrefs; -#endif - - /* - We use one TQMemArray<int> for all the big data used a lot in - matchHere() and friends. 
- */ -#ifndef TQT_NO_REGEXP_OPTIM - mmSlideTabSize = TQMAX( minl + 1, 16 ); -#else - mmSlideTabSize = 0; -#endif - mmBigArray.resize( (3 + 4 * ncap) * ns + 4 * ncap + mmSlideTabSize ); - - mmInNextStack = mmBigArray.data(); - memset( mmInNextStack, -1, ns * sizeof(int) ); - mmCurStack = mmInNextStack + ns; - mmNextStack = mmInNextStack + 2 * ns; - - mmCurCapBegin = mmInNextStack + 3 * ns; - mmNextCapBegin = mmCurCapBegin + ncap * ns; - mmCurCapEnd = mmCurCapBegin + 2 * ncap * ns; - mmNextCapEnd = mmCurCapBegin + 3 * ncap * ns; - - mmTempCapBegin = mmCurCapBegin + 4 * ncap * ns; - mmTempCapEnd = mmTempCapBegin + ncap; - mmCapBegin = mmTempCapBegin + 2 * ncap; - mmCapEnd = mmTempCapBegin + 3 * ncap; - - mmSlideTab = mmTempCapBegin + 4 * ncap; - - if ( !yyError.isEmpty() ) - return -1; - -#ifndef TQT_NO_REGEXP_OPTIM - State *sinit = s[InitialState]; - caretAnchored = ( sinit->anchors != 0 ); - if ( caretAnchored ) { - TQMap<int, int>& anchors = *sinit->anchors; - TQMap<int, int>::ConstIterator a; - for ( a = anchors.begin(); a != anchors.end(); ++a ) { - if ( -#ifndef TQT_NO_REGEXP_ANCHOR_ALT - (*a & Anchor_Alternation) != 0 || -#endif - (*a & Anchor_Caret) == 0 ) { - caretAnchored = FALSE; - break; - } - } - } -#endif - return yyPos0; -} - -void TQRegExpEngine::parseAtom( Box *box ) -{ -#ifndef TQT_NO_REGEXP_LOOKAHEAD - TQRegExpEngine *eng = 0; - bool neg; - int len; -#endif - - if ( (yyTok & Tok_Char) != 0 ) { - box->set( TQChar(yyTok ^ Tok_Char) ); - } else { -#ifndef TQT_NO_REGEXP_OPTIM - trivial = FALSE; -#endif - switch ( yyTok ) { - case Tok_Dollar: - box->catAnchor( Anchor_Dollar ); - break; - case Tok_Caret: - box->catAnchor( Anchor_Caret ); - break; -#ifndef TQT_NO_REGEXP_LOOKAHEAD - case Tok_PosLookahead: - case Tok_NegLookahead: - neg = ( yyTok == Tok_NegLookahead ); - eng = new TQRegExpEngine( cs ); - len = eng->parse( yyIn + yyPos - 1, yyLen - yyPos + 1 ); - if ( len >= 0 ) - skipChars( len ); - else - error( RXERR_LOOKAHEAD ); - box->catAnchor( 
addLookahead(eng, neg) ); - yyTok = getToken(); - if ( yyTok != Tok_RightParen ) - error( RXERR_LOOKAHEAD ); - break; -#endif -#ifndef TQT_NO_REGEXP_ESCAPE - case Tok_Word: - box->catAnchor( Anchor_Word ); - break; - case Tok_NonWord: - box->catAnchor( Anchor_NonWord ); - break; -#endif - case Tok_LeftParen: - case Tok_MagicLeftParen: - yyTok = getToken(); - parseExpression( box ); - if ( yyTok != Tok_RightParen ) - error( RXERR_END ); - break; - case Tok_CharClass: - box->set( *yyCharClass ); - break; - case Tok_Quantifier: - error( RXERR_REPETITION ); - break; - default: -#ifndef TQT_NO_REGEXP_BACKREF - if ( (yyTok & Tok_BackRef) != 0 ) - box->set( yyTok ^ Tok_BackRef ); - else -#endif - error( RXERR_DISABLED ); - } - } - yyTok = getToken(); -} - -void TQRegExpEngine::parseFactor( Box *box ) -{ -#ifndef TQT_NO_REGEXP_CAPTURE - int atom = startAtom( yyMayCapture && yyTok == Tok_LeftParen ); -#else - static const int atom = 0; -#endif - -#ifndef TQT_NO_REGEXP_INTERVAL -#define YYREDO() \ - yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \ - *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok - - const TQChar *in = yyIn; - int pos0 = yyPos0; - int pos = yyPos; - int len = yyLen; - int ch = yyCh; - CharClass charClass; - if ( yyTok == Tok_CharClass ) - charClass = *yyCharClass; - int tok = yyTok; - bool mayCapture = yyMayCapture; -#endif - - parseAtom( box ); -#ifndef TQT_NO_REGEXP_CAPTURE - finishAtom( atom ); -#endif - - if ( yyTok == Tok_Quantifier ) { -#ifndef TQT_NO_REGEXP_OPTIM - trivial = FALSE; -#endif - if ( yyMaxRep == InftyRep ) { - box->plus( atom ); -#ifndef TQT_NO_REGEXP_INTERVAL - } else if ( yyMaxRep == 0 ) { - box->clear(); -#endif - } - if ( yyMinRep == 0 ) - box->opt(); - -#ifndef TQT_NO_REGEXP_INTERVAL - yyMayCapture = FALSE; - int alpha = ( yyMinRep == 0 ) ? 0 : yyMinRep - 1; - int beta = ( yyMaxRep == InftyRep ) ? 
0 : yyMaxRep - ( alpha + 1 ); - - Box rightBox( this ); - int i; - - for ( i = 0; i < beta; i++ ) { - YYREDO(); - Box leftBox( this ); - parseAtom( &leftBox ); - leftBox.cat( rightBox ); - leftBox.opt(); - rightBox = leftBox; - } - for ( i = 0; i < alpha; i++ ) { - YYREDO(); - Box leftBox( this ); - parseAtom( &leftBox ); - leftBox.cat( rightBox ); - rightBox = leftBox; - } - rightBox.cat( *box ); - *box = rightBox; -#endif - yyTok = getToken(); -#ifndef TQT_NO_REGEXP_INTERVAL - yyMayCapture = mayCapture; -#endif - } -#undef YYREDO -} - -void TQRegExpEngine::parseTerm( Box *box ) -{ -#ifndef TQT_NO_REGEXP_OPTIM - if ( yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar ) - parseFactor( box ); -#endif - while ( yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar ) { - Box rightBox( this ); - parseFactor( &rightBox ); - box->cat( rightBox ); - } -} - -void TQRegExpEngine::parseExpression( Box *box ) -{ - parseTerm( box ); - while ( yyTok == Tok_Bar ) { -#ifndef TQT_NO_REGEXP_OPTIM - trivial = FALSE; -#endif - Box rightBox( this ); - yyTok = getToken(); - parseTerm( &rightBox ); - box->orx( rightBox ); - } -} - -/* - The struct TQRegExpPrivate contains the private data of a regular - expression other than the automaton. It makes it possible for many - TQRegExp objects to use the same TQRegExpEngine object with different - TQRegExpPrivate objects. -*/ -struct TQRegExpPrivate -{ - TQString pattern; // regular-expression or wildcard pattern - TQString rxpattern; // regular-expression pattern -#ifndef TQT_NO_REGEXP_WILDCARD - bool wc : 1; // wildcard mode? -#endif - bool min : 1; // minimal matching? (instead of maximal) - bool cs : 1; // case sensitive? 
-#ifndef TQT_NO_REGEXP_CAPTURE - TQString t; // last string passed to TQRegExp::search() or searchRev() - TQStringList capturedCache; // what TQRegExp::capturedTexts() returned last -#endif - TQMemArray<int> captured; // what TQRegExpEngine::search() returned last - - TQRegExpPrivate() { captured.fill( -1, 2 ); } -}; - -#ifndef TQT_NO_REGEXP_OPTIM -static TQSingleCleanupHandler<TQCache<TQRegExpEngine> > cleanup_cache; -# ifndef TQT_THREAD_SUPPORT -static TQCache<TQRegExpEngine> *engineCache = 0; -# endif // TQT_THREAD_SUPPORT -#endif // TQT_NO_REGEXP_OPTIM - -static void regexpEngine( TQRegExpEngine *&eng, const TQString &pattern, - bool caseSensitive, bool deref ) -{ -# ifdef TQT_THREAD_SUPPORT - static TQThreadStorage<TQCache<TQRegExpEngine> *> engineCaches; - TQCache<TQRegExpEngine> *engineCache = 0; - TQThreadInstance *currentThread = TQThreadInstance::current(); - if (currentThread) - engineCache = engineCaches.localData(); -#endif // TQT_THREAD_SUPPORT - - if ( !deref ) { -#ifndef TQT_NO_REGEXP_OPTIM -# ifdef TQT_THREAD_SUPPORT - if ( currentThread ) -# endif - { - if ( engineCache != 0 ) { - eng = engineCache->take( pattern ); - if ( eng == 0 || eng->caseSensitive() != caseSensitive ) { - delete eng; - } else { - eng->ref(); - return; - } - } - } -#endif // TQT_NO_REGEXP_OPTIM - eng = new TQRegExpEngine( pattern, caseSensitive ); - return; - } - - if ( eng->deref() ) { -#ifndef TQT_NO_REGEXP_OPTIM -# ifdef TQT_THREAD_SUPPORT - if ( currentThread ) -# endif - { - if ( engineCache == 0 ) { - engineCache = new TQCache<TQRegExpEngine>; - engineCache->setAutoDelete( TRUE ); -# ifdef TQT_THREAD_SUPPORT - engineCaches.setLocalData(engineCache); -# else - cleanup_cache.set( &engineCache ); -# endif // !TQT_THREAD_SUPPORT - } - if ( !pattern.isNull() && - engineCache->insert(pattern, eng, 4 + pattern.length() / 4) ) - return; - } -#else - TQ_UNUSED( pattern ); -#endif // TQT_NO_REGEXP_OPTIM - delete eng; - eng = 0; - } -} - -/*! 
- \enum TQRegExp::CaretMode - - The CaretMode enum defines the different meanings of the caret - (<b>^</b>) in a regular expression. The possible values are: - - \value CaretAtZero - The caret corresponds to index 0 in the searched string. - - \value CaretAtOffset - The caret corresponds to the start offset of the search. - - \value CaretWontMatch - The caret never matches. -*/ - -/*! - Constructs an empty regexp. - - \sa isValid() errorString() -*/ -TQRegExp::TQRegExp() - : eng( 0 ) -{ - priv = new TQRegExpPrivate; -#ifndef TQT_NO_REGEXP_WILDCARD - priv->wc = FALSE; -#endif - priv->min = FALSE; - priv->cs = TRUE; -} - -/*! - Constructs a regular expression object for the given \a pattern - string. The pattern must be given using wildcard notation if \a - wildcard is TRUE (default is FALSE). The pattern is case - sensitive, unless \a caseSensitive is FALSE. Matching is greedy - (maximal), but can be changed by calling setMinimal(). - - \sa setPattern() setCaseSensitive() setWildcard() setMinimal() -*/ -TQRegExp::TQRegExp( const TQString& pattern, bool caseSensitive, bool wildcard ) - : eng( 0 ) -{ - priv = new TQRegExpPrivate; - priv->pattern = pattern; -#ifndef TQT_NO_REGEXP_WILDCARD - priv->wc = wildcard; -#endif - priv->min = FALSE; - priv->cs = caseSensitive; -} - -/*! - Constructs a regular expression as a copy of \a rx. - - \sa operator=() -*/ -TQRegExp::TQRegExp( const TQRegExp& rx ) - : eng( 0 ) -{ - priv = new TQRegExpPrivate; - operator=( rx ); -} - -/*! - Destroys the regular expression and cleans up its internal data. -*/ -TQRegExp::~TQRegExp() -{ - invalidateEngine(); - delete priv; -} - -/*! - Copies the regular expression \a rx and returns a reference to the - copy. The case sensitivity, wildcard and minimal matching options - are also copied. 
-*/ -TQRegExp& TQRegExp::operator=( const TQRegExp& rx ) -{ - TQRegExpEngine *otherEng = rx.eng; - if ( otherEng != 0 ) - otherEng->ref(); - invalidateEngine(); - eng = otherEng; - priv->pattern = rx.priv->pattern; - priv->rxpattern = rx.priv->rxpattern; -#ifndef TQT_NO_REGEXP_WILDCARD - priv->wc = rx.priv->wc; -#endif - priv->min = rx.priv->min; - priv->cs = rx.priv->cs; -#ifndef TQT_NO_REGEXP_CAPTURE - priv->t = rx.priv->t; - priv->capturedCache = rx.priv->capturedCache; -#endif - priv->captured = rx.priv->captured; - return *this; -} - -/*! - Returns TRUE if this regular expression is equal to \a rx; - otherwise returns FALSE. - - Two TQRegExp objects are equal if they have the same pattern - strings and the same settings for case sensitivity, wildcard and - minimal matching. -*/ -bool TQRegExp::operator==( const TQRegExp& rx ) const -{ - return priv->pattern == rx.priv->pattern && -#ifndef TQT_NO_REGEXP_WILDCARD - priv->wc == rx.priv->wc && -#endif - priv->min == rx.priv->min && - priv->cs == rx.priv->cs; -} - -/*! - \fn bool TQRegExp::operator!=( const TQRegExp& rx ) const - - Returns TRUE if this regular expression is not equal to \a rx; - otherwise returns FALSE. - - \sa operator==() -*/ - -/*! - Returns TRUE if the pattern string is empty; otherwise returns - FALSE. - - If you call exactMatch() with an empty pattern on an empty string - it will return TRUE; otherwise it returns FALSE since it operates - over the whole string. If you call search() with an empty pattern - on \e any string it will return the start offset (0 by default) - because the empty pattern matches the 'emptiness' at the start of - the string. In this case the length of the match returned by - matchedLength() will be 0. - - See TQString::isEmpty(). -*/ - -bool TQRegExp::isEmpty() const -{ - return priv->pattern.isEmpty(); -} - -/*! - Returns TRUE if the regular expression is valid; otherwise returns - FALSE. An invalid regular expression never matches. 
- - The pattern <b>[a-z</b> is an example of an invalid pattern, since - it lacks a closing square bracket. - - Note that the validity of a regexp may also depend on the setting - of the wildcard flag, for example <b>*.html</b> is a valid - wildcard regexp but an invalid full regexp. - - \sa errorString() -*/ -bool TQRegExp::isValid() const -{ - if ( priv->pattern.isEmpty() ) { - return TRUE; - } else { - prepareEngine(); - return eng->isValid(); - } -} - -/*! - Returns the pattern string of the regular expression. The pattern - has either regular expression syntax or wildcard syntax, depending - on wildcard(). - - \sa setPattern() -*/ -TQString TQRegExp::pattern() const -{ - return priv->pattern; -} - -/*! - Sets the pattern string to \a pattern. The case sensitivity, - wildcard and minimal matching options are not changed. - - \sa pattern() -*/ -void TQRegExp::setPattern( const TQString& pattern ) -{ - if ( priv->pattern != pattern ) { - priv->pattern = pattern; - invalidateEngine(); - } -} - -/*! - Returns TRUE if case sensitivity is enabled; otherwise returns - FALSE. The default is TRUE. - - \sa setCaseSensitive() -*/ -bool TQRegExp::caseSensitive() const -{ - return priv->cs; -} - -/*! - Sets case sensitive matching to \a sensitive. - - If \a sensitive is TRUE, <b>\\.txt$</b> matches \c{readme.txt} but - not \c{README.TXT}. - - \sa caseSensitive() -*/ -void TQRegExp::setCaseSensitive( bool sensitive ) -{ - if ( sensitive != priv->cs ) { - priv->cs = sensitive; - invalidateEngine(); - } -} - -#ifndef TQT_NO_REGEXP_WILDCARD -/*! - Returns TRUE if wildcard mode is enabled; otherwise returns FALSE. - The default is FALSE. - - \sa setWildcard() -*/ -bool TQRegExp::wildcard() const -{ - return priv->wc; -} - -/*! - Sets the wildcard mode for the regular expression. The default is - FALSE. - - Setting \a wildcard to TRUE enables simple shell-like wildcard - matching. (See \link #wildcard-matching wildcard matching - (globbing) \endlink.) 
- - For example, <b>r*.txt</b> matches the string \c{readme.txt} in - wildcard mode, but does not match \c{readme}. - - \sa wildcard() -*/ -void TQRegExp::setWildcard( bool wildcard ) -{ - if ( wildcard != priv->wc ) { - priv->wc = wildcard; - invalidateEngine(); - } -} -#endif - -/*! - Returns TRUE if minimal (non-greedy) matching is enabled; - otherwise returns FALSE. - - \sa setMinimal() -*/ -bool TQRegExp::minimal() const -{ - return priv->min; -} - -/*! - Enables or disables minimal matching. If \a minimal is FALSE, - matching is greedy (maximal) which is the default. - - For example, suppose we have the input string "We must be - \<b>bold\</b>, very \<b>bold\</b>!" and the pattern - <b>\<b>.*\</b></b>. With the default greedy (maximal) matching, - the match is "We must be <u>\<b>bold\</b>, very - \<b>bold\</b></u>!". But with minimal (non-greedy) matching the - first match is: "We must be <u>\<b>bold\</b></u>, very - \<b>bold\</b>!" and the second match is "We must be \<b>bold\</b>, - very <u>\<b>bold\</b></u>!". In practice we might use the pattern - <b>\<b>[^\<]+\</b></b> instead, although this will still fail for - nested tags. - - \sa minimal() -*/ -void TQRegExp::setMinimal( bool minimal ) -{ - priv->min = minimal; -} - -/*! - Returns TRUE if \a str is matched exactly by this regular - expression; otherwise returns FALSE. You can determine how much of - the string was matched by calling matchedLength(). - - For a given regexp string, R, exactMatch("R") is the equivalent of - search("^R$") since exactMatch() effectively encloses the regexp - in the start of string and end of string anchors, except that it - sets matchedLength() differently. - - For example, if the regular expression is <b>blue</b>, then - exactMatch() returns TRUE only for input \c blue. For inputs \c - bluebell, \c blutak and \c lightblue, exactMatch() returns FALSE - and matchedLength() will return 4, 3 and 0 respectively. 
- - Although const, this function sets matchedLength(), - capturedTexts() and pos(). - - \sa search() searchRev() TQRegExpValidator -*/ -bool TQRegExp::exactMatch( const TQString& str ) const -{ - prepareEngineForMatch( str ); - eng->match( str, 0, priv->min, TRUE, 0, priv->captured ); - if ( priv->captured[1] == (int) str.length() ) { - return TRUE; - } else { - priv->captured[0] = 0; - priv->captured[1] = eng->partialMatchLength(); - return FALSE; - } -} - -#ifndef TQT_NO_COMPAT -/*! \obsolete - - Attempts to match in \a str, starting from position \a index. - Returns the position of the match, or -1 if there was no match. - - The length of the match is stored in \a *len, unless \a len is a - null pointer. - - If \a indexIsStart is TRUE (the default), the position \a index in - the string will match the start of string anchor, <b>^</b>, in the - regexp, if present. Otherwise, position 0 in \a str will match. - - Use search() and matchedLength() instead of this function. - - \sa TQString::mid() TQConstString -*/ -int TQRegExp::match( const TQString& str, int index, int *len, - bool indexIsStart ) const -{ - int pos = search( str, index, indexIsStart ? CaretAtOffset : CaretAtZero ); - if ( len != 0 ) - *len = matchedLength(); - return pos; -} -#endif // TQT_NO_COMPAT - -int TQRegExp::search( const TQString& str, int offset ) const -{ - return search( str, offset, CaretAtZero ); -} - -/*! - Attempts to find a match in \a str from position \a offset (0 by - default). If \a offset is -1, the search starts at the last - character; if -2, at the next to last character; etc. - - Returns the position of the first match, or -1 if there was no - match. - - The \a caretMode parameter can be used to instruct whether <b>^</b> - should match at index 0 or at \a offset. - - You might prefer to use TQString::find(), TQString::contains() or - even TQStringList::grep(). To replace matches use - TQString::replace(). 
- - Example: - \code - TQString str = "offsets: 1.23 .50 71.00 6.00"; - TQRegExp rx( "\\d*\\.\\d+" ); // primitive floating point matching - int count = 0; - int pos = 0; - while ( (pos = rx.search(str, pos)) != -1 ) { - count++; - pos += rx.matchedLength(); - } - // pos will be 9, 14, 18 and finally 24; count will end up as 4 - \endcode - - Although const, this function sets matchedLength(), - capturedTexts() and pos(). - - \sa searchRev() exactMatch() -*/ - -int TQRegExp::search( const TQString& str, int offset, CaretMode caretMode ) const -{ - prepareEngineForMatch( str ); - if ( offset < 0 ) - offset += str.length(); - eng->match( str, offset, priv->min, FALSE, caretIndex(offset, caretMode), - priv->captured ); - return priv->captured[0]; -} - - -int TQRegExp::searchRev( const TQString& str, int offset ) const -{ - return searchRev( str, offset, CaretAtZero ); -} - -/*! - Attempts to find a match backwards in \a str from position \a - offset. If \a offset is -1 (the default), the search starts at the - last character; if -2, at the next to last character; etc. - - Returns the position of the first match, or -1 if there was no - match. - - The \a caretMode parameter can be used to instruct whether <b>^</b> - should match at index 0 or at \a offset. - - Although const, this function sets matchedLength(), - capturedTexts() and pos(). - - \warning Searching backwards is much slower than searching - forwards. - - \sa search() exactMatch() -*/ - -int TQRegExp::searchRev( const TQString& str, int offset, - CaretMode caretMode ) const -{ - prepareEngineForMatch( str ); - if ( offset < 0 ) - offset += str.length(); - if ( offset < 0 || offset > (int) str.length() ) { - priv->captured.detach(); - priv->captured.fill( -1 ); - return -1; - } - - while ( offset >= 0 ) { - eng->match( str, offset, priv->min, TRUE, caretIndex(offset, caretMode), - priv->captured ); - if ( priv->captured[0] == offset ) - return offset; - offset--; - } - return -1; -} - -/*! 
- Returns the length of the last matched string, or -1 if there was - no match. - - \sa exactMatch() search() searchRev() -*/ -int TQRegExp::matchedLength() const -{ - return priv->captured[1]; -} - -#ifndef TQT_NO_REGEXP_CAPTURE -/*! - Returns the number of captures contained in the regular expression. - */ -int TQRegExp::numCaptures() const -{ - prepareEngine(); - return eng->numCaptures(); -} - -/*! - Returns a list of the captured text strings. - - The first string in the list is the entire matched string. Each - subsequent list element contains a string that matched a - (capturing) subexpression of the regexp. - - For example: - \code - TQRegExp rx( "(\\d+)(\\s*)(cm|inch(es)?)" ); - int pos = rx.search( "Length: 36 inches" ); - TQStringList list = rx.capturedTexts(); - // list is now ( "36 inches", "36", " ", "inches", "es" ) - \endcode - - The above example also captures elements that may be present but - which we have no interest in. This problem can be solved by using - non-capturing parentheses: - - \code - TQRegExp rx( "(\\d+)(?:\\s*)(cm|inch(?:es)?)" ); - int pos = rx.search( "Length: 36 inches" ); - TQStringList list = rx.capturedTexts(); - // list is now ( "36 inches", "36", "inches" ) - \endcode - - Note that if you want to iterate over the list, you should iterate - over a copy, e.g. - \code - TQStringList list = rx.capturedTexts(); - TQStringList::Iterator it = list.begin(); - while( it != list.end() ) { - myProcessing( *it ); - ++it; - } - \endcode - - Some regexps can match an indeterminate number of times. For - example if the input string is "Offsets: 12 14 99 231 7" and the - regexp, \c{rx}, is <b>(\\d+)+</b>, we would hope to get a list of - all the numbers matched. However, after calling - \c{rx.search(str)}, capturedTexts() will return the list ( "12", - "12" ), i.e. the entire match was "12" and the first subexpression - matched was "12". The correct approach is to use cap() in a \link - #cap_in_a_loop loop \endlink. 
- - The order of elements in the string list is as follows. The first - element is the entire matching string. Each subsequent element - corresponds to the next capturing open left parentheses. Thus - capturedTexts()[1] is the text of the first capturing parentheses, - capturedTexts()[2] is the text of the second and so on - (corresponding to $1, $2, etc., in some other regexp languages). - - \sa cap() pos() exactMatch() search() searchRev() -*/ -TQStringList TQRegExp::capturedTexts() -{ - if ( priv->capturedCache.isEmpty() ) { - for ( int i = 0; i < (int) priv->captured.size(); i += 2 ) { - TQString m; - if ( priv->captured[i + 1] == 0 ) - m = TQString::tqfromLatin1( "" ); - else if ( priv->captured[i] >= 0 ) - m = priv->t.mid( priv->captured[i], - priv->captured[i + 1] ); - priv->capturedCache.append( m ); - } - priv->t = TQString::null; - } - return priv->capturedCache; -} - -/*! - Returns the text captured by the \a nth subexpression. The entire - match has index 0 and the parenthesized subexpressions have - indices starting from 1 (excluding non-capturing parentheses). - - \code - TQRegExp rxlen( "(\\d+)(?:\\s*)(cm|inch)" ); - int pos = rxlen.search( "Length: 189cm" ); - if ( pos > -1 ) { - TQString value = rxlen.cap( 1 ); // "189" - TQString unit = rxlen.cap( 2 ); // "cm" - // ... - } - \endcode - - The order of elements matched by cap() is as follows. The first - element, cap(0), is the entire matching string. Each subsequent - element corresponds to the next capturing open left parentheses. - Thus cap(1) is the text of the first capturing parentheses, cap(2) - is the text of the second, and so on. 
- - \target cap_in_a_loop - Some patterns may lead to a number of matches which cannot be - determined in advance, for example: - - \code - TQRegExp rx( "(\\d+)" ); - str = "Offsets: 12 14 99 231 7"; - TQStringList list; - pos = 0; - while ( pos >= 0 ) { - pos = rx.search( str, pos ); - if ( pos > -1 ) { - list += rx.cap( 1 ); - pos += rx.matchedLength(); - } - } - // list contains "12", "14", "99", "231", "7" - \endcode - - \sa capturedTexts() pos() exactMatch() search() searchRev() -*/ -TQString TQRegExp::cap( int nth ) -{ - if ( nth < 0 || nth >= (int) priv->captured.size() / 2 ) { - return TQString::null; - } else { - return capturedTexts()[nth]; - } -} - -/*! - Returns the position of the \a nth captured text in the searched - string. If \a nth is 0 (the default), pos() returns the position - of the whole match. - - Example: - \code - TQRegExp rx( "/([a-z]+)/([a-z]+)" ); - rx.search( "Output /dev/null" ); // returns 7 (position of /dev/null) - rx.pos( 0 ); // returns 7 (position of /dev/null) - rx.pos( 1 ); // returns 8 (position of dev) - rx.pos( 2 ); // returns 12 (position of null) - \endcode - - For zero-length matches, pos() always returns -1. (For example, if - cap(4) would return an empty string, pos(4) returns -1.) This is - due to an implementation tradeoff. - - \sa capturedTexts() exactMatch() search() searchRev() -*/ -int TQRegExp::pos( int nth ) -{ - if ( nth < 0 || nth >= (int) priv->captured.size() / 2 ) - return -1; - else - return priv->captured[2 * nth]; -} - -/*! - Returns a text string that explains why a regexp pattern is - invalid the case being; otherwise returns "no error occurred". - - \sa isValid() -*/ -TQString TQRegExp::errorString() -{ - if ( isValid() ) { - return TQString( RXERR_OK ); - } else { - return eng->errorString(); - } -} -#endif - -/*! - Returns the string \a str with every regexp special character - escaped with a backslash. The special characters are $, (, ), *, +, - ., ?, [, \, ], ^, {, | and }. 
- - Example: - \code - s1 = TQRegExp::escape( "bingo" ); // s1 == "bingo" - s2 = TQRegExp::escape( "f(x)" ); // s2 == "f\\(x\\)" - \endcode - - This function is useful to construct regexp patterns dynamically: - - \code - TQRegExp rx( "(" + TQRegExp::escape(name) + - "|" + TQRegExp::escape(alias) + ")" ); - \endcode -*/ -TQString TQRegExp::escape( const TQString& str ) -{ - static const char meta[] = "$()*+.?[\\]^{|}"; - TQString quoted = str; - int i = 0; - - while ( i < (int) quoted.length() ) { - if ( strchr(meta, quoted[i].latin1()) != 0 ) - quoted.insert( i++, "\\" ); - i++; - } - return quoted; -} - -void TQRegExp::prepareEngine() const -{ - if ( eng == 0 ) { -#ifndef TQT_NO_REGEXP_WILDCARD - if ( priv->wc ) - priv->rxpattern = wc2rx( priv->pattern ); - else -#endif - priv->rxpattern = priv->pattern.isNull() ? TQString::tqfromLatin1( "" ) - : priv->pattern; - TQRegExp *that = (TQRegExp *) this; - // that->eng = newEngine( priv->rxpattern, priv->cs ); - regexpEngine( that->eng, priv->rxpattern, priv->cs, FALSE ); - priv->captured.detach(); - priv->captured.fill( -1, 2 + 2 * eng->numCaptures() ); - } -} - -void TQRegExp::prepareEngineForMatch( const TQString& str ) const -{ - prepareEngine(); -#ifndef TQT_NO_REGEXP_CAPTURE - priv->t = str; - priv->capturedCache.clear(); -#else - TQ_UNUSED( str ); -#endif -} - -void TQRegExp::invalidateEngine() -{ - if ( eng != 0 ) { - regexpEngine( eng, priv->rxpattern, priv->cs, TRUE ); - priv->rxpattern = TQString(); - eng = 0; - } -} - -int TQRegExp::caretIndex( int offset, CaretMode caretMode ) -{ - if ( caretMode == CaretAtZero ) { - return 0; - } else if ( caretMode == CaretAtOffset ) { - return offset; - } else { // CaretWontMatch - return -1; - } -} - -#endif // USE_QT4 - -#endif // TQT_NO_REGEXP |