summaryrefslogtreecommitdiffstats
path: root/tqtinterface/qt4/src/tools/tqregexp.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'tqtinterface/qt4/src/tools/tqregexp.cpp')
-rw-r--r--tqtinterface/qt4/src/tools/tqregexp.cpp4050
1 files changed, 0 insertions, 4050 deletions
diff --git a/tqtinterface/qt4/src/tools/tqregexp.cpp b/tqtinterface/qt4/src/tools/tqregexp.cpp
deleted file mode 100644
index 7a7f1ab..0000000
--- a/tqtinterface/qt4/src/tools/tqregexp.cpp
+++ /dev/null
@@ -1,4050 +0,0 @@
-/****************************************************************************
-**
-** Implementation of TQRegExp class
-**
-** Created : 950126
-**
-** Copyright (C) 2010 Timothy Pearson and (C) 1992-2008 Trolltech ASA.
-**
-** This file is part of the tools module of the TQt GUI Toolkit.
-**
-** This file may be used under the terms of the GNU General
-** Public License versions 2.0 or 3.0 as published by the Free
-** Software Foundation and appearing in the files LICENSE.GPL2
-** and LICENSE.GPL3 included in the packaging of this file.
-** Alternatively you may (at your option) use any later version
-** of the GNU General Public License if such license has been
-** publicly approved by Trolltech ASA (or its successors, if any)
-** and the KDE Free TQt Foundation.
-**
-** Please review the following information to ensure GNU General
-** Public Licensing requirements will be met:
-** http://trolltech.com/products/qt/licenses/licensing/opensource/.
-** If you are unsure which license is appropriate for your use, please
-** review the following information:
-** http://trolltech.com/products/qt/licenses/licensing/licensingoverview
-** or contact the sales department at [email protected].
-**
-** This file may be used under the terms of the Q Public License as
-** defined by Trolltech ASA and appearing in the file LICENSE.TQPL
-** included in the packaging of this file. Licensees holding valid TQt
-** Commercial licenses may use this file in accordance with the TQt
-** Commercial License Agreement provided with the Software.
-**
-** This file is provided "AS IS" with NO WARRANTY OF ANY KIND,
-** INCLUDING THE WARRANTIES OF DESIGN, MERCHANTABILITY AND FITNESS FOR
-** A PARTICULAR PURPOSE. Trolltech reserves all rights not granted
-** herein.
-**
-**********************************************************************/
-
-#include "tqregexp.h"
-
-#ifndef TQT_NO_REGEXP
-
-#include "tqmemarray.h"
-#include "tqbitarray.h"
-#include "tqcache.h"
-#include "tqcleanuphandler.h"
-#include "tqintdict.h"
-#include "tqmap.h"
-#include "tqptrvector.h"
-#include "tqstring.h"
-#include "tqtl.h"
-
-#ifdef TQT_THREAD_SUPPORT
-#include "tqthreadstorage.h"
-#include <private/tqthreadinstance_p.h>
-#endif // TQT_THREAD_SUPPORT
-
-#undef TQT_TRANSLATE_NOOP
-#define TQT_TRANSLATE_NOOP( context, sourceText ) sourceText
-
-#include <limits.h>
-
-// error strings for the regexp parser
-#define RXERR_OK TQT_TRANSLATE_NOOP( "TQRegExp", "no error occurred" )
-#define RXERR_DISABLED TQT_TRANSLATE_NOOP( "TQRegExp", "disabled feature used" )
-#define RXERR_CHARCLASS TQT_TRANSLATE_NOOP( "TQRegExp", "bad char class syntax" )
-#define RXERR_LOOKAHEAD TQT_TRANSLATE_NOOP( "TQRegExp", "bad lookahead syntax" )
-#define RXERR_REPETITION TQT_TRANSLATE_NOOP( "TQRegExp", "bad repetition syntax" )
-#define RXERR_OCTAL TQT_TRANSLATE_NOOP( "TQRegExp", "invalid octal value" )
-#define RXERR_LEFTDELIM TQT_TRANSLATE_NOOP( "TQRegExp", "missing left delim" )
-#define RXERR_END TQT_TRANSLATE_NOOP( "TQRegExp", "unexpected end" )
-#define RXERR_LIMIT TQT_TRANSLATE_NOOP( "TQRegExp", "met internal limit" )
-
-#ifdef USE_QT4
-
-/*!
- QT4 INTEROPERABILITY
-*/
-int TQRegExp::search( const TQString& str, int offset ) const
-{
- return indexIn( str, offset, CaretAtZero);
-}
-
-/*!
- QT4 INTEROPERABILITY
-*/
-int TQRegExp::search( const TQString& str, int offset, CaretMode caretMode ) const
-{
- return indexIn( str, offset, caretMode);
-}
-
-/*!
- QT4 INTEROPERABILITY
-*/
-int TQRegExp::searchRev( const TQString& str, int offset ) const
-{
- return lastIndexIn( str, offset, CaretAtZero);
-}
-
-/*!
- QT4 INTEROPERABILITY
-*/
-int TQRegExp::searchRev( const TQString& str, int offset, CaretMode caretMode ) const
-{
- return lastIndexIn( str, offset, caretMode);
-}
-
-/*!
- QT4 INTEROPERABILITY
-*/
-bool TQRegExp::caseSensitive() const
-{
- return caseSensitivity();
-}
-
-/*!
- QT4 INTEROPERABILITY
-*/
-void TQRegExp::setCaseSensitive( bool sensitive )
-{
- setCaseSensitivity( (Qt::CaseSensitivity)sensitive);
-}
-
-#else // USE_QT4
-
-/*
- WARNING! Be sure to read qregexp.tex before modifying this file.
-*/
-
-/*!
- \class TQRegExp tqregexp.h
- \reentrant
- \brief The TQRegExp class provides pattern matching using regular expressions.
-
- \ingroup tools
- \ingroup misc
- \ingroup shared
- \mainclass
- \keyword regular expression
-
- Regular expressions, or "regexps", provide a way to find patterns
- within text. This is useful in many contexts, for example:
-
- \table
- \row \i Validation
- \i A regexp can be used to check whether a piece of text
- meets some criteria, e.g. is an integer or contains no
- whitespace.
- \row \i Searching
- \i Regexps provide a much more powerful means of searching
- text than simple string matching does. For example we can
- create a regexp which says "find one of the words 'mail',
- 'letter' or 'correspondence' but not any of the words
- 'email', 'mailman' 'mailer', 'letterbox' etc."
- \row \i Search and Replace
- \i A regexp can be used to replace a pattern with a piece of
- text, for example replace all occurrences of '&' with
- '\&amp;' except where the '&' is already followed by 'amp;'.
- \row \i String Splitting
- \i A regexp can be used to identify where a string should be
- split into its component fields, e.g. splitting tab-delimited
- strings.
- \endtable
-
- We present a very brief introduction to regexps, a description of
- TQt's regexp language, some code examples, and finally the function
- documentation itself. TQRegExp is modeled on Perl's regexp
- language, and also fully supports Unicode. TQRegExp can also be
- used in the weaker 'wildcard' (globbing) mode which works in a
- similar way to command shells. A good text on regexps is \e
- {Mastering Regular Expressions: Powerful Techniques for Perl and
- Other Tools} by Jeffrey E. Friedl, ISBN 1565922573.
-
- Experienced regexp users may prefer to skip the introduction and
- go directly to the relevant information.
-
- In case of multi-threaded programming, note that TQRegExp depends on
- TQThreadStorage internally. For that reason, TQRegExp should only be
- used with threads started with TQThread, i.e. not with threads
- started with platform-specific APIs.
-
- \tableofcontents
-
- \section1 Introduction
-
- Regexps are built up from expressions, quantifiers, and assertions.
- The simplest form of expression is simply a character, e.g.
- <b>x</b> or <b>5</b>. An expression can also be a set of
- characters. For example, <b>[ABCD]</b>, will match an <b>A</b> or
- a <b>B</b> or a <b>C</b> or a <b>D</b>. As a shorthand we could
- write this as <b>[A-D]</b>. If we want to match any of the
- captital letters in the English alphabet we can write
- <b>[A-Z]</b>. A quantifier tells the regexp engine how many
- occurrences of the expression we want, e.g. <b>x{1,1}</b> means
- match an <b>x</b> which occurs at least once and at most once.
- We'll look at assertions and more complex expressions later.
-
- Note that in general regexps cannot be used to check for balanced
- brackets or tags. For example if you want to match an opening html
- \c <b> and its closing \c </b> you can only use a regexp if you
- know that these tags are not nested; the html fragment, \c{<b>bold
- <b>bolder</b></b>} will not match as expected. If you know the
- maximum level of nesting it is possible to create a regexp that
- will match correctly, but for an unknown level of nesting, regexps
- will fail.
-
- We'll start by writing a regexp to match integers in the range 0
- to 99. We will require at least one digit so we will start with
- <b>[0-9]{1,1}</b> which means match a digit exactly once. This
- regexp alone will match integers in the range 0 to 9. To match one
- or two digits we can increase the maximum number of occurrences so
- the regexp becomes <b>[0-9]{1,2}</b> meaning match a digit at
- least once and at most twice. However, this regexp as it stands
- will not match correctly. This regexp will match one or two digits
- \e within a string. To ensure that we match against the whole
- string we must use the anchor assertions. We need <b>^</b> (caret)
- which when it is the first character in the regexp means that the
- regexp must match from the beginning of the string. And we also
- need <b>$</b> (dollar) which when it is the last character in the
- regexp means that the regexp must match until the end of the
- string. So now our regexp is <b>^[0-9]{1,2}$</b>. Note that
- assertions, such as <b>^</b> and <b>$</b>, do not match any
- characters.
-
- If you've seen regexps elsewhere they may have looked different from
- the ones above. This is because some sets of characters and some
- quantifiers are so common that they have special symbols to
- represent them. <b>[0-9]</b> can be replaced with the symbol
- <b>\d</b>. The quantifier to match exactly one occurrence,
- <b>{1,1}</b>, can be replaced with the expression itself. This means
- that <b>x{1,1}</b> is exactly the same as <b>x</b> alone. So our 0
- to 99 matcher could be written <b>^\d{1,2}$</b>. Another way of
- writing it would be <b>^\d\d{0,1}$</b>, i.e. from the start of the
- string match a digit followed by zero or one digits. In practice
- most people would write it <b>^\d\d?$</b>. The <b>?</b> is a
- shorthand for the quantifier <b>{0,1}</b>, i.e. a minimum of no
- occurrences a maximum of one occurrence. This is used to make an
- expression optional. The regexp <b>^\d\d?$</b> means "from the
- beginning of the string match one digit followed by zero or one
- digits and then the end of the string".
-
- Our second example is matching the words 'mail', 'letter' or
- 'correspondence' but without matching 'email', 'mailman',
- 'mailer', 'letterbox' etc. We'll start by just matching 'mail'. In
- full the regexp is, <b>m{1,1}a{1,1}i{1,1}l{1,1}</b>, but since
- each expression itself is automatically quantified by <b>{1,1}</b>
- we can simply write this as <b>mail</b>; an 'm' followed by an 'a'
- followed by an 'i' followed by an 'l'. The symbol '|' (bar) is
- used for \e alternation, so our regexp now becomes
- <b>mail|letter|correspondence</b> which means match 'mail' \e or
- 'letter' \e or 'correspondence'. Whilst this regexp will find the
- words we want it will also find words we don't want such as
- 'email'. We will start by putting our regexp in parentheses,
- <b>(mail|letter|correspondence)</b>. Parentheses have two effects,
- firstly they group expressions together and secondly they identify
- parts of the regexp that we wish to \link #capturing-text capture
- \endlink. Our regexp still matches any of the three words but now
- they are grouped together as a unit. This is useful for building
- up more complex regexps. It is also useful because it allows us to
- examine which of the words actually matched. We need to use
- another assertion, this time <b>\b</b> "word boundary":
- <b>\b(mail|letter|correspondence)\b</b>. This regexp means "match
- a word boundary followed by the expression in parentheses followed
- by another word boundary". The <b>\b</b> assertion matches at a \e
- position in the regexp not a \e character in the regexp. A word
- boundary is any non-word character such as a space a newline or
- the beginning or end of the string.
-
- For our third example we want to replace ampersands with the HTML
- entity '\&amp;'. The regexp to match is simple: <b>\&</b>, i.e.
- match one ampersand. Unfortunately this will mess up our text if
- some of the ampersands have already been turned into HTML
- entities. So what we really want to say is replace an ampersand
- providing it is not followed by 'amp;'. For this we need the
- negative lookahead assertion and our regexp becomes:
- <b>\&(?!amp;)</b>. The negative lookahead assertion is introduced
- with '(?!' and finishes at the ')'. It means that the text it
- contains, 'amp;' in our example, must \e not follow the expression
- that preceeds it.
-
- Regexps provide a rich language that can be used in a variety of
- ways. For example suppose we want to count all the occurrences of
- 'Eric' and 'Eirik' in a string. Two valid regexps to match these
- are <b>\\b(Eric|Eirik)\\b</b> and <b>\\bEi?ri[ck]\\b</b>. We need
- the word boundary '\b' so we don't get 'Ericsson' etc. The second
- regexp actually matches more than we want, 'Eric', 'Erik', 'Eiric'
- and 'Eirik'.
-
- We will implement some the examples above in the
- \link #code-examples code examples \endlink section.
-
- \target characters-and-abbreviations-for-sets-of-characters
- \section1 Characters and Abbreviations for Sets of Characters
-
- \table
- \header \i Element \i Meaning
- \row \i <b>c</b>
- \i Any character represents itself unless it has a special
- regexp meaning. Thus <b>c</b> matches the character \e c.
- \row \i <b>\\c</b>
- \i A character that follows a backslash matches the character
- itself except where mentioned below. For example if you
- wished to match a literal caret at the beginning of a string
- you would write <b>\^</b>.
- \row \i <b>\\a</b>
- \i This matches the ASCII bell character (BEL, 0x07).
- \row \i <b>\\f</b>
- \i This matches the ASCII form feed character (FF, 0x0C).
- \row \i <b>\\n</b>
- \i This matches the ASCII line feed character (LF, 0x0A, Unix newline).
- \row \i <b>\\r</b>
- \i This matches the ASCII carriage return character (CR, 0x0D).
- \row \i <b>\\t</b>
- \i This matches the ASCII horizontal tab character (HT, 0x09).
- \row \i <b>\\v</b>
- \i This matches the ASCII vertical tab character (VT, 0x0B).
- \row \i <b>\\xhhhh</b>
- \i This matches the Unicode character corresponding to the
- hexadecimal number hhhh (between 0x0000 and 0xFFFF). \0ooo
- (i.e., \zero ooo) matches the ASCII/Latin-1 character
- corresponding to the octal number ooo (between 0 and 0377).
- \row \i <b>. (dot)</b>
- \i This matches any character (including newline).
- \row \i <b>\\d</b>
- \i This matches a digit (TQChar::isDigit()).
- \row \i <b>\\D</b>
- \i This matches a non-digit.
- \row \i <b>\\s</b>
- \i This matches a whitespace (TQChar::isSpace()).
- \row \i <b>\\S</b>
- \i This matches a non-whitespace.
- \row \i <b>\\w</b>
- \i This matches a word character (TQChar::isLetterOrNumber() or '_').
- \row \i <b>\\W</b>
- \i This matches a non-word character.
- \row \i <b>\\n</b>
- \i The n-th \link #capturing-text backreference \endlink,
- e.g. \1, \2, etc.
- \endtable
-
- \e {Note that the C++ compiler transforms backslashes in strings
- so to include a <b>\\</b> in a regexp you will need to enter it
- twice, i.e. <b>\\\\</b>.}
-
- \target sets-of-characters
- \section1 Sets of Characters
-
- Square brackets are used to match any character in the set of
- characters contained within the square brackets. All the character
- set abbreviations described above can be used within square
- brackets. Apart from the character set abbreviations and the
- following two exceptions no characters have special meanings in
- square brackets.
-
- \table
- \row \i <b>^</b>
- \i The caret negates the character set if it occurs as the
- first character, i.e. immediately after the opening square
- bracket. For example, <b>[abc]</b> matches 'a' or 'b' or 'c',
- but <b>[^abc]</b> matches anything \e except 'a' or 'b' or
- 'c'.
- \row \i <b>-</b>
- \i The dash is used to indicate a range of characters, for
- example <b>[W-Z]</b> matches 'W' or 'X' or 'Y' or 'Z'.
- \endtable
-
- Using the predefined character set abbreviations is more portable
- than using character ranges across platforms and languages. For
- example, <b>[0-9]</b> matches a digit in Western alphabets but
- <b>\d</b> matches a digit in \e any alphabet.
-
- Note that in most regexp literature sets of characters are called
- "character classes".
-
- \target quantifiers
- \section1 Quantifiers
-
- By default an expression is automatically quantified by
- <b>{1,1}</b>, i.e. it should occur exactly once. In the following
- list <b>\e {E}</b> stands for any expression. An expression is a
- character or an abbreviation for a set of characters or a set of
- characters in square brackets or any parenthesised expression.
-
- \table
- \row \i <b>\e {E}?</b>
- \i Matches zero or one occurrence of \e E. This quantifier
- means "the previous expression is optional" since it will
- match whether or not the expression occurs in the string. It
- is the same as <b>\e {E}{0,1}</b>. For example <b>dents?</b>
- will match 'dent' and 'dents'.
-
- \row \i <b>\e {E}+</b>
- \i Matches one or more occurrences of \e E. This is the same
- as <b>\e {E}{1,MAXINT}</b>. For example, <b>0+</b> will match
- '0', '00', '000', etc.
-
- \row \i <b>\e {E}*</b>
- \i Matches zero or more occurrences of \e E. This is the same
- as <b>\e {E}{0,MAXINT}</b>. The <b>*</b> quantifier is often
- used by a mistake. Since it matches \e zero or more
- occurrences it will match no occurrences at all. For example
- if we want to match strings that end in whitespace and use
- the regexp <b>\s*$</b> we would get a match on every string.
- This is because we have said find zero or more whitespace
- followed by the end of string, so even strings that don't end
- in whitespace will match. The regexp we want in this case is
- <b>\s+$</b> to match strings that have at least one
- whitespace at the end.
-
- \row \i <b>\e {E}{n}</b>
- \i Matches exactly \e n occurrences of the expression. This
- is the same as repeating the expression \e n times. For
- example, <b>x{5}</b> is the same as <b>xxxxx</b>. It is also
- the same as <b>\e {E}{n,n}</b>, e.g. <b>x{5,5}</b>.
-
- \row \i <b>\e {E}{n,}</b>
- \i Matches at least \e n occurrences of the expression. This
- is the same as <b>\e {E}{n,MAXINT}</b>.
-
- \row \i <b>\e {E}{,m}</b>
- \i Matches at most \e m occurrences of the expression. This
- is the same as <b>\e {E}{0,m}</b>.
-
- \row \i <b>\e {E}{n,m}</b>
- \i Matches at least \e n occurrences of the expression and at
- most \e m occurrences of the expression.
- \endtable
-
- (MAXINT is implementation dependent but will not be smaller than
- 1024.)
-
- If we wish to apply a quantifier to more than just the preceding
- character we can use parentheses to group characters together in
- an expression. For example, <b>tag+</b> matches a 't' followed by
- an 'a' followed by at least one 'g', whereas <b>(tag)+</b> matches
- at least one occurrence of 'tag'.
-
- Note that quantifiers are "greedy". They will match as much text
- as they can. For example, <b>0+</b> will match as many zeros as it
- can from the first zero it finds, e.g. '2.<u>000</u>5'.
- Quantifiers can be made non-greedy, see setMinimal().
-
- \target capturing-text
- \section1 Capturing Text
-
- Parentheses allow us to group elements together so that we can
- quantify and capture them. For example if we have the expression
- <b>mail|letter|correspondence</b> that matches a string we know
- that \e one of the words matched but not which one. Using
- parentheses allows us to "capture" whatever is matched within
- their bounds, so if we used <b>(mail|letter|correspondence)</b>
- and matched this regexp against the string "I sent you some email"
- we can use the cap() or capturedTexts() functions to extract the
- matched characters, in this case 'mail'.
-
- We can use captured text within the regexp itself. To refer to the
- captured text we use \e backreferences which are indexed from 1,
- the same as for cap(). For example we could search for duplicate
- words in a string using <b>\b(\w+)\W+\1\b</b> which means match a
- word boundary followed by one or more word characters followed by
- one or more non-word characters followed by the same text as the
- first parenthesised expression followed by a word boundary.
-
- If we want to use parentheses purely for grouping and not for
- capturing we can use the non-capturing syntax, e.g.
- <b>(?:green|blue)</b>. Non-capturing parentheses begin '(?:' and
- end ')'. In this example we match either 'green' or 'blue' but we
- do not capture the match so we only know whether or not we matched
- but not which color we actually found. Using non-capturing
- parentheses is more efficient than using capturing parentheses
- since the regexp engine has to do less book-keeping.
-
- Both capturing and non-capturing parentheses may be nested.
-
- \target assertions
- \section1 Assertions
-
- Assertions make some statement about the text at the point where
- they occur in the regexp but they do not match any characters. In
- the following list <b>\e {E}</b> stands for any expression.
-
- \table
- \row \i <b>^</b>
- \i The caret signifies the beginning of the string. If you
- wish to match a literal \c{^} you must escape it by
- writing <b>&#92;^</b>. For example, <b>^#include</b> will only
- match strings which \e begin with the characters '#include'.
- (When the caret is the first character of a character set it
- has a special meaning, see \link #sets-of-characters Sets of
- Characters \endlink.)
-
- \row \i <b>$</b>
- \i The dollar signifies the end of the string. For example
- <b>\d\s*$</b> will match strings which end with a digit
- optionally followed by whitespace. If you wish to match a
- literal \c{$} you must escape it by writing
- <b>&#92;$</b>.
-
- \row \i <b>\\b</b>
- \i A word boundary. For example the regexp
- <b>\\bOK\\b</b> means match immediately after a word
- boundary (e.g. start of string or whitespace) the letter 'O'
- then the letter 'K' immediately before another word boundary
- (e.g. end of string or whitespace). But note that the
- assertion does not actually match any whitespace so if we
- write <b>(\\bOK\\b)</b> and we have a match it will only
- contain 'OK' even if the string is "Its <u>OK</u> now".
-
- \row \i <b>\\B</b>
- \i A non-word boundary. This assertion is true wherever
- <b>\\b</b> is false. For example if we searched for
- <b>\\Bon\\B</b> in "Left on" the match would fail (space
- and end of string aren't non-word boundaries), but it would
- match in "t<u>on</u>ne".
-
- \row \i <b>(?=\e E)</b>
- \i Positive lookahead. This assertion is true if the
- expression matches at this point in the regexp. For example,
- <b>const(?=\\s+char)</b> matches 'const' whenever it is
- followed by 'char', as in 'static <u>const</u> char *'.
- (Compare with <b>const\\s+char</b>, which matches 'static
- <u>const char</u> *'.)
-
- \row \i <b>(?!\e E)</b>
- \i Negative lookahead. This assertion is true if the
- expression does not match at this point in the regexp. For
- example, <b>const(?!\\s+char)</b> matches 'const' \e except
- when it is followed by 'char'.
- \endtable
-
- \target wildcard-matching
- \section1 Wildcard Matching (globbing)
-
- Most command shells such as \e bash or \e cmd.exe support "file
- globbing", the ability to identify a group of files by using
- wildcards. The setWildcard() function is used to switch between
- regexp and wildcard mode. Wildcard matching is much simpler than
- full regexps and has only four features:
-
- \table
- \row \i <b>c</b>
- \i Any character represents itself apart from those mentioned
- below. Thus <b>c</b> matches the character \e c.
- \row \i <b>?</b>
- \i This matches any single character. It is the same as
- <b>.</b> in full regexps.
- \row \i <b>*</b>
- \i This matches zero or more of any characters. It is the
- same as <b>.*</b> in full regexps.
- \row \i <b>[...]</b>
- \i Sets of characters can be represented in square brackets,
- similar to full regexps. Within the character class, like
- outside, backslash has no special meaning.
- \endtable
-
- For example if we are in wildcard mode and have strings which
- contain filenames we could identify HTML files with <b>*.html</b>.
- This will match zero or more characters followed by a dot followed
- by 'h', 't', 'm' and 'l'.
-
- \target perl-users
- \section1 Notes for Perl Users
-
- Most of the character class abbreviations supported by Perl are
- supported by TQRegExp, see \link
- #characters-and-abbreviations-for-sets-of-characters characters
- and abbreviations for sets of characters \endlink.
-
- In TQRegExp, apart from within character classes, \c{^} always
- signifies the start of the string, so carets must always be
- escaped unless used for that purpose. In Perl the meaning of caret
- varies automagically depending on where it occurs so escaping it
- is rarely necessary. The same applies to \c{$} which in
- TQRegExp always signifies the end of the string.
-
- TQRegExp's quantifiers are the same as Perl's greedy quantifiers.
- Non-greedy matching cannot be applied to individual quantifiers,
- but can be applied to all the quantifiers in the pattern. For
- example, to match the Perl regexp <b>ro+?m</b> requires:
- \code
- TQRegExp rx( "ro+m" );
- rx.setMinimal( TRUE );
- \endcode
-
- The equivalent of Perl's \c{/i} option is
- setCaseSensitive(FALSE).
-
- Perl's \c{/g} option can be emulated using a \link
- #cap_in_a_loop loop \endlink.
-
- In TQRegExp <b>.</b> matches any character, therefore all TQRegExp
- regexps have the equivalent of Perl's \c{/s} option. TQRegExp
- does not have an equivalent to Perl's \c{/m} option, but this
- can be emulated in various ways for example by splitting the input
- into lines or by looping with a regexp that searches for newlines.
-
- Because TQRegExp is string oriented there are no \A, \Z or \z
- assertions. The \G assertion is not supported but can be emulated
- in a loop.
-
- Perl's $& is cap(0) or capturedTexts()[0]. There are no TQRegExp
- equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
- ... correspond to cap(1) or capturedTexts()[1], cap(2) or
- capturedTexts()[2], etc.
-
- To substitute a pattern use TQString::replace().
-
- Perl's extended \c{/x} syntax is not supported, nor are
- directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
- the other hand, C++'s rules for literal strings can be used to
- achieve the same:
- \code
- TQRegExp mark( "\\b" // word boundary
- "[Mm]ark" // the word we want to match
- );
- \endcode
-
- Both zero-width positive and zero-width negative lookahead
- assertions (?=pattern) and (?!pattern) are supported with the same
- syntax as Perl. Perl's lookbehind assertions, "independent"
- subexpressions and conditional expressions are not supported.
-
- Non-capturing parentheses are also supported, with the same
- (?:pattern) syntax.
-
- See TQStringList::split() and TQStringList::join() for equivalents
- to Perl's split and join functions.
-
- Note: because C++ transforms \\'s they must be written \e twice in
- code, e.g. <b>\\b</b> must be written <b>\\\\b</b>.
-
- \target code-examples
- \section1 Code Examples
-
- \code
- TQRegExp rx( "^\\d\\d?$" ); // match integers 0 to 99
- rx.search( "123" ); // returns -1 (no match)
- rx.search( "-6" ); // returns -1 (no match)
- rx.search( "6" ); // returns 0 (matched as position 0)
- \endcode
-
- The third string matches '<u>6</u>'. This is a simple validation
- regexp for integers in the range 0 to 99.
-
- \code
- TQRegExp rx( "^\\S+$" ); // match strings without whitespace
- rx.search( "Hello world" ); // returns -1 (no match)
- rx.search( "This_is-OK" ); // returns 0 (matched at position 0)
- \endcode
-
- The second string matches '<u>This_is-OK</u>'. We've used the
- character set abbreviation '\S' (non-whitespace) and the anchors
- to match strings which contain no whitespace.
-
- In the following example we match strings containing 'mail' or
- 'letter' or 'correspondence' but only match whole words i.e. not
- 'email'
-
- \code
- TQRegExp rx( "\\b(mail|letter|correspondence)\\b" );
- rx.search( "I sent you an email" ); // returns -1 (no match)
- rx.search( "Please write the letter" ); // returns 17
- \endcode
-
- The second string matches "Please write the <u>letter</u>". The
- word 'letter' is also captured (because of the parentheses). We
- can see what text we've captured like this:
-
- \code
- TQString captured = rx.cap( 1 ); // captured == "letter"
- \endcode
-
- This will capture the text from the first set of capturing
- parentheses (counting capturing left parentheses from left to
- right). The parentheses are counted from 1 since cap( 0 ) is the
- whole matched regexp (equivalent to '&' in most regexp engines).
-
- \code
- TQRegExp rx( "&(?!amp;)" ); // match ampersands but not &amp;
- TQString line1 = "This & that";
- line1.replace( rx, "&amp;" );
- // line1 == "This &amp; that"
- TQString line2 = "His &amp; hers & theirs";
- line2.replace( rx, "&amp;" );
- // line2 == "His &amp; hers &amp; theirs"
- \endcode
-
- Here we've passed the TQRegExp to TQString's replace() function to
- replace the matched text with new text.
-
- \code
- TQString str = "One Eric another Eirik, and an Ericsson."
- " How many Eiriks, Eric?";
- TQRegExp rx( "\\b(Eric|Eirik)\\b" ); // match Eric or Eirik
- int pos = 0; // where we are in the string
- int count = 0; // how many Eric and Eirik's we've counted
- while ( pos >= 0 ) {
- pos = rx.search( str, pos );
- if ( pos >= 0 ) {
- pos++; // move along in str
- count++; // count our Eric or Eirik
- }
- }
- \endcode
-
- We've used the search() function to repeatedly match the regexp in
- the string. Note that instead of moving forward by one character
- at a time \c pos++ we could have written \c {pos +=
- rx.matchedLength()} to skip over the already matched string. The
- count will equal 3, matching 'One <u>Eric</u> another
- <u>Eirik</u>, and an Ericsson. How many Eiriks, <u>Eric</u>?'; it
- doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
- by non-word boundaries.
-
- One common use of regexps is to split lines of delimited data into
- their component fields.
-
- \code
- str = "Trolltech AS\twww.trolltech.com\tNorway";
- TQString company, web, country;
- rx.setPattern( "^([^\t]+)\t([^\t]+)\t([^\t]+)$" );
- if ( rx.search( str ) != -1 ) {
- company = rx.cap( 1 );
- web = rx.cap( 2 );
- country = rx.cap( 3 );
- }
- \endcode
-
- In this example our input lines have the format company name, web
- address and country. Unfortunately the regexp is rather long and
- not very versatile -- the code will break if we add any more
- fields. A simpler and better solution is to look for the
- separator, '\t' in this case, and take the surrounding text. The
- TQStringList split() function can take a separator string or regexp
- as an argument and split a string accordingly.
-
- \code
- TQStringList field = TQStringList::split( "\t", str );
- \endcode
-
- Here field[0] is the company, field[1] the web address and so on.
-
- To imitate the matching of a shell we can use wildcard mode.
-
- \code
- TQRegExp rx( "*.html" ); // invalid regexp: * doesn't quantify anything
- rx.setWildcard( TRUE ); // now it's a valid wildcard regexp
- rx.exactMatch( "index.html" ); // returns TRUE
- rx.exactMatch( "default.htm" ); // returns FALSE
- rx.exactMatch( "readme.txt" ); // returns FALSE
- \endcode
-
- Wildcard matching can be convenient because of its simplicity, but
- any wildcard regexp can be defined using full regexps, e.g.
- <b>.*\.html$</b>. Notice that we can't match both \c .html and \c
- .htm files with a wildcard unless we use <b>*.htm*</b> which will
- also match 'test.html.bak'. A full regexp gives us the precision
- we need, <b>.*\\.html?$</b>.
-
- TQRegExp can match case insensitively using setCaseSensitive(), and
- can use non-greedy matching, see setMinimal(). By default TQRegExp
- uses full regexps but this can be changed with setWildcard().
- Searching can be forward with search() or backward with
- searchRev(). Captured text can be accessed using capturedTexts()
- which returns a string list of all captured strings, or using
- cap() which returns the captured string for the given index. The
- pos() function takes a match index and returns the position in the
- string where the match was made (or -1 if there was no match).
-
- \sa TQRegExpValidator TQString TQStringList
-
- \target member-function-documentation
-*/
-
-const int NumBadChars = 64;
-#define BadChar( ch ) ( (ch).tqunicode() % NumBadChars )
-
-const int NoOccurrence = INT_MAX;
-const int EmptyCapture = INT_MAX;
-const int InftyLen = INT_MAX;
-const int InftyRep = 1025;
-const int EOS = -1;
-
-static bool isWord( TQChar ch )
-{
- return ch.isLetterOrNumber() || ch == TQChar( '_' );
-}
-
-/*
- Merges two TQMemArrays of ints and puts the result into the first
- one.
-*/
-static void mergeInto( TQMemArray<int> *a, const TQMemArray<int>& b )
-{
- int asize = a->size();
- int bsize = b.size();
- if ( asize == 0 ) {
- *a = b.copy();
-#ifndef TQT_NO_REGEXP_OPTIM
- } else if ( bsize == 1 && (*a)[asize - 1] < b[0] ) {
- a->resize( asize + 1 );
- (*a)[asize] = b[0];
-#endif
- } else if ( bsize >= 1 ) {
- int csize = asize + bsize;
- TQMemArray<int> c( csize );
- int i = 0, j = 0, k = 0;
- while ( i < asize ) {
- if ( j < bsize ) {
- if ( (*a)[i] == b[j] ) {
- i++;
- csize--;
- } else if ( (*a)[i] < b[j] ) {
- c[k++] = (*a)[i++];
- } else {
- c[k++] = b[j++];
- }
- } else {
- memcpy( c.data() + k, (*a).data() + i,
- (asize - i) * sizeof(int) );
- break;
- }
- }
- c.resize( csize );
- if ( j < bsize )
- memcpy( c.data() + k, b.data() + j, (bsize - j) * sizeof(int) );
- *a = c;
- }
-}
-
-/*
- Merges two disjoint TQMaps of (int, int) pairs and puts the result
- into the first one.
-*/
-static void mergeInto( TQMap<int, int> *a, const TQMap<int, int>& b )
-{
- TQMap<int, int>::ConstIterator it;
- for ( it = b.begin(); it != b.end(); ++it )
- a->insert( it.key(), *it );
-}
-
-/*
- Returns the value associated to key k in TQMap m of (int, int)
- pairs, or 0 if no such value is explicitly present.
-*/
-static int at( const TQMap<int, int>& m, int k )
-{
- TQMap<int, int>::ConstIterator it = m.find( k );
- if ( it == m.end() )
- return 0;
- else
- return *it;
-}
-
-#ifndef TQT_NO_REGEXP_WILDCARD
-/*
- Translates a wildcard pattern to an equivalent regular expression
- pattern (e.g., *.cpp to .*\.cpp).
-*/
-static TQString wc2rx( const TQString& wc_str )
-{
- int wclen = wc_str.length();
- TQString rx = TQString::tqfromLatin1( "" );
- int i = 0;
- const TQChar *wc = wc_str.tqunicode();
- while ( i < wclen ) {
- TQChar c = wc[i++];
- switch ( c.tqunicode() ) {
- case '*':
- rx += TQString::tqfromLatin1( ".*" );
- break;
- case '?':
- rx += TQChar( '.' );
- break;
- case '$':
- case '(':
- case ')':
- case '+':
- case '.':
- case '\\':
- case '^':
- case '{':
- case '|':
- case '}':
- rx += TQChar( '\\' );
- rx += c;
- break;
- case '[':
- rx += c;
- if ( wc[i] == TQChar('^') )
- rx += wc[i++];
- if ( i < wclen ) {
- if ( rx[i] == ']' )
- rx += wc[i++];
- while ( i < wclen && wc[i] != TQChar(']') ) {
- if ( wc[i] == '\\' )
- rx += TQChar( '\\' );
- rx += wc[i++];
- }
- }
- break;
- default:
- rx += c;
- }
- }
- return rx;
-}
-#endif
-
-/*
- The class TQRegExpEngine encapsulates a modified nondeterministic
- finite automaton (NFA).
-*/
-class TQRegExpEngine : public TQShared
-{
-public:
-#ifndef TQT_NO_REGEXP_CCLASS
- /*
- The class CharClass represents a set of characters, such as can
- be found in regular expressions (e.g., [a-z] denotes the set
- {a, b, ..., z}).
- */
- class CharClass
- {
- public:
- CharClass();
- CharClass( const CharClass& cc ) { operator=( cc ); }
-
- CharClass& operator=( const CharClass& cc );
-
- void clear();
- bool negative() const { return n; }
- void setNegative( bool negative );
- void addCategories( int cats );
- void addRange( ushort from, ushort to );
- void addSingleton( ushort ch ) { addRange( ch, ch ); }
-
- bool in( TQChar ch ) const;
-#ifndef TQT_NO_REGEXP_OPTIM
- const TQMemArray<int>& firstOccurrence() const { return occ1; }
-#endif
-
-#if defined(TQT_DEBUG)
- void dump() const;
-#endif
-
- private:
- /*
- The struct Range represents a range of characters (e.g.,
- [0-9] denotes range 48 to 57).
- */
- struct Range
- {
- ushort from; // 48
- ushort to; // 57
- };
-
- int c; // character classes
- TQMemArray<Range> r; // character ranges
- bool n; // negative?
-#ifndef TQT_NO_REGEXP_OPTIM
- TQMemArray<int> occ1; // first-occurrence array
-#endif
- };
-#else
- struct CharClass
- {
- int dummy;
-
-#ifndef TQT_NO_REGEXP_OPTIM
- CharClass() { occ1.fill( 0, NumBadChars ); }
-
- const TQMemArray<int>& firstOccurrence() const { return occ1; }
- TQMemArray<int> occ1;
-#endif
- };
-#endif
-
- TQRegExpEngine( bool caseSensitive ) { setup( caseSensitive ); }
- TQRegExpEngine( const TQString& rx, bool caseSensitive );
-#ifndef TQT_NO_REGEXP_OPTIM
- ~TQRegExpEngine();
-#endif
-
- bool isValid() const { return valid; }
- bool caseSensitive() const { return cs; }
- const TQString& errorString() const { return yyError; }
- int numCaptures() const { return officialncap; }
- void match( const TQString& str, int pos, bool minimal, bool oneTest,
- int caretIndex, TQMemArray<int>& captured );
- int partialMatchLength() const { return mmOneTestMatchedLen; }
-
- int createState( TQChar ch );
- int createState( const CharClass& cc );
-#ifndef TQT_NO_REGEXP_BACKREF
- int createState( int bref );
-#endif
-
- void addCatTransitions( const TQMemArray<int>& from,
- const TQMemArray<int>& to );
-#ifndef TQT_NO_REGEXP_CAPTURE
- void addPlusTransitions( const TQMemArray<int>& from,
- const TQMemArray<int>& to, int atom );
-#endif
-
-#ifndef TQT_NO_REGEXP_ANCHOR_ALT
- int anchorAlternation( int a, int b );
- int anchorConcatenation( int a, int b );
-#else
- int anchorAlternation( int a, int b ) { return a & b; }
- int anchorConcatenation( int a, int b ) { return a | b; }
-#endif
- void addAnchors( int from, int to, int a );
-
-#ifndef TQT_NO_REGEXP_OPTIM
- void heuristicallyChooseHeuristic();
-#endif
-
-#if defined(TQT_DEBUG)
- void dump() const;
-#endif
-
-private:
- enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
-
- /*
- The struct State represents one state in a modified NFA. The
- input characters matched are stored in the state instead of on
- the transitions, something possible for an automaton
- constructed from a regular expression.
- */
- struct State
- {
-#ifndef TQT_NO_REGEXP_CAPTURE
- int atom; // which atom does this state belong to?
-#endif
- int match; // what does it match? (see CharClassBit and BackRefBit)
- TQMemArray<int> outs; // out-transitions
- TQMap<int, int> *reenter; // atoms reentered when transiting out
- TQMap<int, int> *anchors; // anchors met when transiting out
-
-#ifndef TQT_NO_REGEXP_CAPTURE
- State( int a, int m )
- : atom( a ), match( m ), reenter( 0 ), anchors( 0 ) { }
-#else
- State( int m )
- : match( m ), reenter( 0 ), anchors( 0 ) { }
-#endif
- ~State() { delete reenter; delete anchors; }
- };
-
-#ifndef TQT_NO_REGEXP_LOOKAHEAD
- /*
- The struct Lookahead represents a lookahead a la Perl (e.g.,
- (?=foo) and (?!bar)).
- */
- struct Lookahead
- {
- TQRegExpEngine *eng; // NFA representing the embedded regular expression
- bool neg; // negative lookahead?
-
- Lookahead( TQRegExpEngine *eng0, bool neg0 )
- : eng( eng0 ), neg( neg0 ) { }
- ~Lookahead() { delete eng; }
- };
-#endif
-
-#ifndef TQT_NO_REGEXP_CAPTURE
- /*
- The struct Atom represents one node in the hierarchy of regular
- expression atoms.
- */
- struct Atom
- {
- int parent; // index of parent in array of atoms
- int capture; // index of capture, from 1 to ncap
- };
-#endif
-
-#ifndef TQT_NO_REGEXP_ANCHOR_ALT
- /*
- The struct AnchorAlternation represents a pair of anchors with
- OR semantics.
- */
- struct AnchorAlternation
- {
- int a; // this anchor...
- int b; // ...or this one
- };
-#endif
-
- enum { InitialState = 0, FinalState = 1 };
- void setup( bool caseSensitive );
- int setupState( int match );
-
- /*
- Let's hope that 13 lookaheads and 14 back-references are
- enough.
- */
- enum { MaxLookaheads = 13, MaxBackRefs = 14 };
- enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002,
- Anchor_Word = 0x00000004, Anchor_NonWord = 0x00000008,
- Anchor_FirstLookahead = 0x00000010,
- Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
- Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
- Anchor_Alternation = Anchor_BackRef1Empty << MaxBackRefs,
-
- Anchor_LookaheadMask = ( Anchor_FirstLookahead - 1 ) ^
- ( (Anchor_FirstLookahead << MaxLookaheads) - 1 ) };
-#ifndef TQT_NO_REGEXP_CAPTURE
- int startAtom( bool capture );
- void finishAtom( int atom ) { cf = f[atom].parent; }
-#endif
-
-#ifndef TQT_NO_REGEXP_LOOKAHEAD
- int addLookahead( TQRegExpEngine *eng, bool negative );
-#endif
-
-#ifndef TQT_NO_REGEXP_CAPTURE
- bool isBetterCapture( const int *begin1, const int *end1, const int *begin2,
- const int *end2 );
-#endif
- bool testAnchor( int i, int a, const int *capBegin );
-
-#ifndef TQT_NO_REGEXP_OPTIM
- bool goodStringMatch();
- bool badCharMatch();
-#else
- bool bruteMatch();
-#endif
- bool matchHere();
-
- TQPtrVector<State> s; // array of states
- int ns; // number of states
-#ifndef TQT_NO_REGEXP_CAPTURE
- TQMemArray<Atom> f; // atom hierarchy
- int nf; // number of atoms
- int cf; // current atom
-#endif
- int officialncap; // number of captures, seen from the outside
- int ncap; // number of captures, seen from the inside
-#ifndef TQT_NO_REGEXP_CCLASS
- TQPtrVector<CharClass> cl; // array of character classes
-#endif
-#ifndef TQT_NO_REGEXP_LOOKAHEAD
- TQPtrVector<Lookahead> ahead; // array of lookaheads
-#endif
-#ifndef TQT_NO_REGEXP_ANCHOR_ALT
- TQMemArray<AnchorAlternation> aa; // array of (a, b) pairs of anchors
-#endif
-#ifndef TQT_NO_REGEXP_OPTIM
- bool caretAnchored; // does the regexp start with ^?
- bool trivial; // is the good-string all that needs to match?
-#endif
- bool valid; // is the regular expression valid?
- bool cs; // case sensitive?
-#ifndef TQT_NO_REGEXP_BACKREF
- int nbrefs; // number of back-references
-#endif
-
-#ifndef TQT_NO_REGEXP_OPTIM
- bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
-
- int goodEarlyStart; // the index where goodStr can first occur in a match
- int goodLateStart; // the index where goodStr can last occur in a match
- TQString goodStr; // the string that any match has to contain
-
- int minl; // the minimum length of a match
- TQMemArray<int> occ1; // first-occurrence array
-#endif
-
- /*
- The class Box is an abstraction for a regular expression
- fragment. It can also be seen as one node in the syntax tree of
- a regular expression with synthetized attributes.
-
- Its interface is ugly for performance reasons.
- */
- class Box
- {
- public:
- Box( TQRegExpEngine *engine );
- Box( const Box& b ) { operator=( b ); }
-
- Box& operator=( const Box& b );
-
- void clear() { operator=( Box(eng) ); }
- void set( TQChar ch );
- void set( const CharClass& cc );
-#ifndef TQT_NO_REGEXP_BACKREF
- void set( int bref );
-#endif
-
- void cat( const Box& b );
- void orx( const Box& b );
- void plus( int atom );
- void opt();
- void catAnchor( int a );
-#ifndef TQT_NO_REGEXP_OPTIM
- void setupHeuristics();
-#endif
-
-#if defined(TQT_DEBUG)
- void dump() const;
-#endif
-
- private:
- void addAnchorsToEngine( const Box& to ) const;
-
- TQRegExpEngine *eng; // the automaton under construction
- TQMemArray<int> ls; // the left states (firstpos)
- TQMemArray<int> rs; // the right states (lastpos)
- TQMap<int, int> lanchors; // the left anchors
- TQMap<int, int> ranchors; // the right anchors
- int skipanchors; // the anchors to match if the box is skipped
-
-#ifndef TQT_NO_REGEXP_OPTIM
- int earlyStart; // the index where str can first occur
- int lateStart; // the index where str can last occur
- TQString str; // a string that has to occur in any match
- TQString leftStr; // a string occurring at the left of this box
- TQString rightStr; // a string occurring at the right of this box
- int maxl; // the maximum length of this box (possibly InftyLen)
-#endif
-
- int minl; // the minimum length of this box
-#ifndef TQT_NO_REGEXP_OPTIM
- TQMemArray<int> occ1; // first-occurrence array
-#endif
- };
- friend class Box;
-
- /*
- This is the lexical analyzer for regular expressions.
- */
- enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen,
- Tok_PosLookahead, Tok_NegLookahead, Tok_RightParen, Tok_CharClass,
- Tok_Caret, Tok_Quantifier, Tok_Bar, Tok_Word, Tok_NonWord,
- Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
- int getChar();
- int getEscape();
-#ifndef TQT_NO_REGEXP_INTERVAL
- int getRep( int def );
-#endif
-#ifndef TQT_NO_REGEXP_LOOKAHEAD
- void skipChars( int n );
-#endif
- void error( const char *msg );
- void startTokenizer( const TQChar *rx, int len );
- int getToken();
-
- const TQChar *yyIn; // a pointer to the input regular expression pattern
- int yyPos0; // the position of yyTok in the input pattern
- int yyPos; // the position of the next character to read
- int yyLen; // the length of yyIn
- int yyCh; // the last character read
- CharClass *yyCharClass; // attribute for Tok_CharClass tokens
- int yyMinRep; // attribute for Tok_Quantifier
- int yyMaxRep; // ditto
- TQString yyError; // syntax error or overflow during parsing?
-
- /*
- This is the syntactic analyzer for regular expressions.
- */
- int parse( const TQChar *rx, int len );
- void parseAtom( Box *box );
- void parseFactor( Box *box );
- void parseTerm( Box *box );
- void parseExpression( Box *box );
-
- int yyTok; // the last token read
- bool yyMayCapture; // set this to FALSE to disable capturing
-
- /*
- This is the engine state during matching.
- */
- const TQString *mmStr; // a pointer to the input TQString
- const TQChar *mmIn; // a pointer to the input string data
- int mmPos; // the current position in the string
- int mmCaretPos;
- int mmLen; // the length of the input string
- bool mmMinimal; // minimal matching?
- TQMemArray<int> mmBigArray; // big TQMemArray<int> array
- int *mmInNextStack; // is state is mmNextStack?
- int *mmCurStack; // stack of current states
- int *mmNextStack; // stack of next states
- int *mmCurCapBegin; // start of current states' captures
- int *mmNextCapBegin; // start of next states' captures
- int *mmCurCapEnd; // end of current states' captures
- int *mmNextCapEnd; // end of next states' captures
- int *mmTempCapBegin; // start of temporary captures
- int *mmTempCapEnd; // end of temporary captures
- int *mmCapBegin; // start of captures for a next state
- int *mmCapEnd; // end of captures for a next state
- int *mmSlideTab; // bump-along slide table for bad-character heuristic
- int mmSlideTabSize; // size of slide table
-#ifndef TQT_NO_REGEXP_BACKREF
- TQIntDict<int> mmSleeping; // dictionary of back-reference sleepers
-#endif
- int mmMatchLen; // length of match
- int mmOneTestMatchedLen; // length of partial match
-};
-
-TQRegExpEngine::TQRegExpEngine( const TQString& rx, bool caseSensitive )
-#ifndef TQT_NO_REGEXP_BACKREF
- : mmSleeping( 101 )
-#endif
-{
- setup( caseSensitive );
- valid = ( parse(rx.tqunicode(), rx.length()) == (int) rx.length() );
- if ( !valid ) {
-#ifndef TQT_NO_REGEXP_OPTIM
- trivial = FALSE;
-#endif
- error( RXERR_LEFTDELIM );
- }
-}
-
-#ifndef TQT_NO_REGEXP_OPTIM
-TQRegExpEngine::~TQRegExpEngine()
-{
-}
-#endif
-
-/*
- Tries to match in str and returns an array of (begin, length) pairs
- for captured text. If there is no match, all pairs are (-1, -1).
-*/
-void TQRegExpEngine::match( const TQString& str, int pos, bool minimal,
- bool oneTest, int caretIndex,
- TQMemArray<int>& captured )
-{
- bool matched = FALSE;
-
-#ifndef TQT_NO_REGEXP_OPTIM
- if ( trivial && !oneTest ) {
- mmPos = str.find( goodStr, pos, cs );
- mmMatchLen = goodStr.length();
- matched = ( mmPos != -1 );
- } else
-#endif
- {
- mmStr = &str;
- mmIn = str.tqunicode();
- if ( mmIn == 0 )
- mmIn = &TQChar::null;
- mmPos = pos;
- mmCaretPos = caretIndex;
- mmLen = str.length();
- mmMinimal = minimal;
- mmMatchLen = 0;
- mmOneTestMatchedLen = 0;
-
- if ( valid && mmPos >= 0 && mmPos <= mmLen ) {
-#ifndef TQT_NO_REGEXP_OPTIM
- if ( oneTest ) {
- matched = matchHere();
- } else {
- if ( mmPos <= mmLen - minl ) {
- if ( caretAnchored ) {
- matched = matchHere();
- } else if ( useGoodStringHeuristic ) {
- matched = goodStringMatch();
- } else {
- matched = badCharMatch();
- }
- }
- }
-#else
- matched = oneTest ? matchHere() : bruteMatch();
-#endif
- }
- }
-
- int capturedSize = 2 + 2 * officialncap;
- captured.detach();
- captured.resize( capturedSize );
- if ( matched ) {
- captured[0] = mmPos;
- captured[1] = mmMatchLen;
- for ( int j = 0; j < officialncap; j++ ) {
- int len = mmCapEnd[j] - mmCapBegin[j];
- captured[2 + 2 * j] = len > 0 ? mmPos + mmCapBegin[j] : 0;
- captured[2 + 2 * j + 1] = len;
- }
- } else {
- // we rely on 2's complement here
- memset( captured.data(), -1, capturedSize * sizeof(int) );
- }
-}
-
-/*
- The three following functions add one state to the automaton and
- return the number of the state.
-*/
-
-int TQRegExpEngine::createState( TQChar ch )
-{
- return setupState( ch.tqunicode() );
-}
-
-int TQRegExpEngine::createState( const CharClass& cc )
-{
-#ifndef TQT_NO_REGEXP_CCLASS
- int n = cl.size();
- cl.resize( n + 1 );
- cl.insert( n, new CharClass(cc) );
- return setupState( CharClassBit | n );
-#else
- TQ_UNUSED( cc );
- return setupState( CharClassBit );
-#endif
-}
-
-#ifndef TQT_NO_REGEXP_BACKREF
-int TQRegExpEngine::createState( int bref )
-{
- if ( bref > nbrefs ) {
- nbrefs = bref;
- if ( nbrefs > MaxBackRefs ) {
- error( RXERR_LIMIT );
- return 0;
- }
- }
- return setupState( BackRefBit | bref );
-}
-#endif
-
-/*
- The two following functions add a transition between all pairs of
- states (i, j) where i is fond in from, and j is found in to.
-
- Cat-transitions are distinguished from plus-transitions for
- capturing.
-*/
-
-void TQRegExpEngine::addCatTransitions( const TQMemArray<int>& from,
- const TQMemArray<int>& to )
-{
- for ( int i = 0; i < (int) from.size(); i++ ) {
- State *st = s[from[i]];
- mergeInto( &st->outs, to );
- }
-}
-
-#ifndef TQT_NO_REGEXP_CAPTURE
-void TQRegExpEngine::addPlusTransitions( const TQMemArray<int>& from,
- const TQMemArray<int>& to, int atom )
-{
- for ( int i = 0; i < (int) from.size(); i++ ) {
- State *st = s[from[i]];
- TQMemArray<int> oldOuts = st->outs.copy();
- mergeInto( &st->outs, to );
- if ( f[atom].capture >= 0 ) {
- if ( st->reenter == 0 )
- st->reenter = new TQMap<int, int>;
- for ( int j = 0; j < (int) to.size(); j++ ) {
- if ( !st->reenter->contains(to[j]) &&
- oldOuts.bsearch(to[j]) < 0 )
- st->reenter->insert( to[j], atom );
- }
- }
- }
-}
-#endif
-
-#ifndef TQT_NO_REGEXP_ANCHOR_ALT
-/*
- Returns an anchor that means a OR b.
-*/
-int TQRegExpEngine::anchorAlternation( int a, int b )
-{
- if ( ((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0 )
- return a & b;
-
- int n = aa.size();
-#ifndef TQT_NO_REGEXP_OPTIM
- if ( n > 0 && aa[n - 1].a == a && aa[n - 1].b == b )
- return Anchor_Alternation | ( n - 1 );
-#endif
-
- aa.resize( n + 1 );
- aa[n].a = a;
- aa[n].b = b;
- return Anchor_Alternation | n;
-}
-
-/*
- Returns an anchor that means a AND b.
-*/
-int TQRegExpEngine::anchorConcatenation( int a, int b )
-{
- if ( ((a | b) & Anchor_Alternation) == 0 )
- return a | b;
- if ( (b & Anchor_Alternation) != 0 )
- tqSwap( a, b );
-
- int aprime = anchorConcatenation( aa[a ^ Anchor_Alternation].a, b );
- int bprime = anchorConcatenation( aa[a ^ Anchor_Alternation].b, b );
- return anchorAlternation( aprime, bprime );
-}
-#endif
-
-/*
- Adds anchor a on a transition caracterised by its from state and
- its to state.
-*/
-void TQRegExpEngine::addAnchors( int from, int to, int a )
-{
- State *st = s[from];
- if ( st->anchors == 0 )
- st->anchors = new TQMap<int, int>;
- if ( st->anchors->contains(to) )
- a = anchorAlternation( (*st->anchors)[to], a );
- st->anchors->insert( to, a );
-}
-
-#ifndef TQT_NO_REGEXP_OPTIM
-/*
- This function chooses between the good-string and the bad-character
- heuristics. It computes two scores and chooses the heuristic with
- the highest score.
-
- Here are some common-sense constraints on the scores that should be
- respected if the formulas are ever modified: (1) If goodStr is
- empty, the good-string heuristic scores 0. (2) If the regular
- expression is trivial, the good-string heuristic should be used.
- (3) If the search is case insensitive, the good-string heuristic
- should be used, unless it scores 0. (Case insensitivity turns all
- entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
- big, the good-string heuristic should score less.
-*/
-void TQRegExpEngine::heuristicallyChooseHeuristic()
-{
- if ( minl == 0 ) {
- useGoodStringHeuristic = FALSE;
- } else if ( trivial ) {
- useGoodStringHeuristic = TRUE;
- } else {
- /*
- Magic formula: The good string has to constitute a good
- proportion of the minimum-length string, and appear at a
- more-or-less known index.
- */
- int goodStringScore = ( 64 * goodStr.length() / minl ) -
- ( goodLateStart - goodEarlyStart );
- /*
- Less magic formula: We pick some characters at random, and
- check whether they are good or bad.
- */
- int badCharScore = 0;
- int step = TQMAX( 1, NumBadChars / 32 );
- for ( int i = 1; i < NumBadChars; i += step ) {
- if ( occ1[i] == NoOccurrence )
- badCharScore += minl;
- else
- badCharScore += occ1[i];
- }
- badCharScore /= minl;
- useGoodStringHeuristic = ( goodStringScore > badCharScore );
- }
-}
-#endif
-
-#if defined(TQT_DEBUG)
-void TQRegExpEngine::dump() const
-{
- int i, j;
- qDebug( "Case %ssensitive engine", cs ? "" : "in" );
- qDebug( " States" );
- for ( i = 0; i < ns; i++ ) {
- qDebug( " %d%s", i,
- i == InitialState ? " (initial)" :
- i == FinalState ? " (final)" : "" );
-#ifndef TQT_NO_REGEXP_CAPTURE
- qDebug( " in atom %d", s[i]->atom );
-#endif
- int m = s[i]->match;
- if ( (m & CharClassBit) != 0 ) {
- qDebug( " match character class %d", m ^ CharClassBit );
-#ifndef TQT_NO_REGEXP_CCLASS
- cl[m ^ CharClassBit]->dump();
-#else
- qDebug( " negative character class" );
-#endif
- } else if ( (m & BackRefBit) != 0 ) {
- qDebug( " match back-reference %d", m ^ BackRefBit );
- } else if ( m >= 0x20 && m <= 0x7e ) {
- qDebug( " match 0x%.4x (%c)", m, m );
- } else {
- qDebug( " match 0x%.4x", m );
- }
- for ( j = 0; j < (int) s[i]->outs.size(); j++ ) {
- int next = s[i]->outs[j];
- qDebug( " -> %d", next );
- if ( s[i]->reenter != 0 && s[i]->reenter->contains(next) )
- qDebug( " [reenter %d]", (*s[i]->reenter)[next] );
- if ( s[i]->anchors != 0 && at(*s[i]->anchors, next) != 0 )
- qDebug( " [anchors 0x%.8x]", (*s[i]->anchors)[next] );
- }
- }
-#ifndef TQT_NO_REGEXP_CAPTURE
- if ( nf > 0 ) {
- qDebug( " Atom Parent Capture" );
- for ( i = 0; i < nf; i++ )
- qDebug( " %6d %6d %6d", i, f[i].parent, f[i].capture );
- }
-#endif
-#ifndef TQT_NO_REGEXP_ANCHOR_ALT
- for ( i = 0; i < (int) aa.size(); i++ )
- qDebug( " Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a,
- aa[i].b );
-#endif
-}
-#endif
-
-void TQRegExpEngine::setup( bool caseSensitive )
-{
- s.setAutoDelete( TRUE );
- s.resize( 32 );
- ns = 0;
-#ifndef TQT_NO_REGEXP_CAPTURE
- f.resize( 32 );
- nf = 0;
- cf = -1;
-#endif
- officialncap = 0;
- ncap = 0;
-#ifndef TQT_NO_REGEXP_CCLASS
- cl.setAutoDelete( TRUE );
-#endif
-#ifndef TQT_NO_REGEXP_LOOKAHEAD
- ahead.setAutoDelete( TRUE );
-#endif
-#ifndef TQT_NO_REGEXP_OPTIM
- caretAnchored = TRUE;
- trivial = TRUE;
-#endif
- valid = FALSE;
- cs = caseSensitive;
-#ifndef TQT_NO_REGEXP_BACKREF
- nbrefs = 0;
-#endif
-#ifndef TQT_NO_REGEXP_OPTIM
- useGoodStringHeuristic = TRUE;
- minl = 0;
- occ1.fill( 0, NumBadChars );
-#endif
-}
-
-int TQRegExpEngine::setupState( int match )
-{
- if ( (ns & (ns + 1)) == 0 && ns + 1 >= (int) s.size() )
- s.resize( (ns + 1) << 1 );
-#ifndef TQT_NO_REGEXP_CAPTURE
- s.insert( ns, new State(cf, match) );
-#else
- s.insert( ns, new State(match) );
-#endif
- return ns++;
-}
-
-#ifndef TQT_NO_REGEXP_CAPTURE
-/*
- Functions startAtom() and finishAtom() should be called to delimit
- atoms. When a state is created, it is assigned to the current atom.
- The information is later used for capturing.
-*/
-int TQRegExpEngine::startAtom( bool capture )
-{
- if ( (nf & (nf + 1)) == 0 && nf + 1 >= (int) f.size() )
- f.resize( (nf + 1) << 1 );
- f[nf].parent = cf;
- cf = nf++;
- f[cf].capture = capture ? ncap++ : -1;
- return cf;
-}
-#endif
-
-#ifndef TQT_NO_REGEXP_LOOKAHEAD
-/*
- Creates a lookahead anchor.
-*/
-int TQRegExpEngine::addLookahead( TQRegExpEngine *eng, bool negative )
-{
- int n = ahead.size();
- if ( n == MaxLookaheads ) {
- error( RXERR_LIMIT );
- return 0;
- }
- ahead.resize( n + 1 );
- ahead.insert( n, new Lookahead(eng, negative) );
- return Anchor_FirstLookahead << n;
-}
-#endif
-
-#ifndef TQT_NO_REGEXP_CAPTURE
-/*
- We want the longest leftmost captures.
-*/
-bool TQRegExpEngine::isBetterCapture( const int *begin1, const int *end1,
- const int *begin2, const int *end2 )
-{
- for ( int i = 0; i < ncap; i++ ) {
- int delta = begin2[i] - begin1[i]; // it has to start early...
- if ( delta == 0 )
- delta = end1[i] - end2[i]; // ...and end late (like a party)
-
- if ( delta != 0 )
- return delta > 0;
- }
- return FALSE;
-}
-#endif
-
-/*
- Returns TRUE if anchor a matches at position mmPos + i in the input
- string, otherwise FALSE.
-*/
-bool TQRegExpEngine::testAnchor( int i, int a, const int *capBegin )
-{
- int j;
-
-#ifndef TQT_NO_REGEXP_ANCHOR_ALT
- if ( (a & Anchor_Alternation) != 0 ) {
- return testAnchor( i, aa[a ^ Anchor_Alternation].a, capBegin ) ||
- testAnchor( i, aa[a ^ Anchor_Alternation].b, capBegin );
- }
-#endif
-
- if ( (a & Anchor_Caret) != 0 ) {
- if ( mmPos + i != mmCaretPos )
- return FALSE;
- }
- if ( (a & Anchor_Dollar) != 0 ) {
- if ( mmPos + i != mmLen )
- return FALSE;
- }
-#ifndef TQT_NO_REGEXP_ESCAPE
- if ( (a & (Anchor_Word | Anchor_NonWord)) != 0 ) {
- bool before = FALSE;
- bool after = FALSE;
- if ( mmPos + i != 0 )
- before = isWord( mmIn[mmPos + i - 1] );
- if ( mmPos + i != mmLen )
- after = isWord( mmIn[mmPos + i] );
- if ( (a & Anchor_Word) != 0 && (before == after) )
- return FALSE;
- if ( (a & Anchor_NonWord) != 0 && (before != after) )
- return FALSE;
- }
-#endif
-#ifndef TQT_NO_REGEXP_LOOKAHEAD
- if ( (a & Anchor_LookaheadMask) != 0 ) {
- TQConstString cstr = TQConstString( (TQChar *) mmIn + mmPos + i,
- mmLen - mmPos - i );
- for ( j = 0; j < (int) ahead.size(); j++ ) {
- if ( (a & (Anchor_FirstLookahead << j)) != 0 ) {
- TQMemArray<int> captured;
- ahead[j]->eng->match( cstr.string(), 0, TRUE, TRUE,
- mmCaretPos - mmPos - i, captured );
- if ( (captured[0] == 0) == ahead[j]->neg )
- return FALSE;
- }
- }
- }
-#endif
-#ifndef TQT_NO_REGEXP_CAPTURE
-#ifndef TQT_NO_REGEXP_BACKREF
- for ( j = 0; j < nbrefs; j++ ) {
- if ( (a & (Anchor_BackRef1Empty << j)) != 0 ) {
- if ( capBegin[j] != EmptyCapture )
- return FALSE;
- }
- }
-#endif
-#endif
- return TRUE;
-}
-
-#ifndef TQT_NO_REGEXP_OPTIM
-/*
- The three following functions are what Jeffrey Friedl would call
- transmissions (or bump-alongs). Using one or the other should make
- no difference except in performance.
-*/
-
-bool TQRegExpEngine::goodStringMatch()
-{
- int k = mmPos + goodEarlyStart;
- while ( (k = mmStr->find(goodStr, k, cs)) != -1 ) {
- int from = k - goodLateStart;
- int to = k - goodEarlyStart;
- if ( from > mmPos )
- mmPos = from;
-
- while ( mmPos <= to ) {
- if ( matchHere() )
- return TRUE;
- mmPos++;
- }
- k++;
- }
- return FALSE;
-}
-
-bool TQRegExpEngine::badCharMatch()
-{
- int slideHead = 0;
- int slideNext = 0;
- int i;
- int lastPos = mmLen - minl;
- memset( mmSlideTab, 0, mmSlideTabSize * sizeof(int) );
-
- /*
- Set up the slide table, used for the bad-character heuristic,
- using the table of first occurrence of each character.
- */
- for ( i = 0; i < minl; i++ ) {
- int sk = occ1[BadChar(mmIn[mmPos + i])];
- if ( sk == NoOccurrence )
- sk = i + 1;
- if ( sk > 0 ) {
- int k = i + 1 - sk;
- if ( k < 0 ) {
- sk = i + 1;
- k = 0;
- }
- if ( sk > mmSlideTab[k] )
- mmSlideTab[k] = sk;
- }
- }
-
- if ( mmPos > lastPos )
- return FALSE;
-
- for ( ;; ) {
- if ( ++slideNext >= mmSlideTabSize )
- slideNext = 0;
- if ( mmSlideTab[slideHead] > 0 ) {
- if ( mmSlideTab[slideHead] - 1 > mmSlideTab[slideNext] )
- mmSlideTab[slideNext] = mmSlideTab[slideHead] - 1;
- mmSlideTab[slideHead] = 0;
- } else {
- if ( matchHere() )
- return TRUE;
- }
-
- if ( mmPos == lastPos )
- break;
-
- /*
- Update the slide table. This code has much in common with
- the initialization code.
- */
- int sk = occ1[BadChar(mmIn[mmPos + minl])];
- if ( sk == NoOccurrence ) {
- mmSlideTab[slideNext] = minl;
- } else if ( sk > 0 ) {
- int k = slideNext + minl - sk;
- if ( k >= mmSlideTabSize )
- k -= mmSlideTabSize;
- if ( sk > mmSlideTab[k] )
- mmSlideTab[k] = sk;
- }
- slideHead = slideNext;
- mmPos++;
- }
- return FALSE;
-}
-#else
-bool TQRegExpEngine::bruteMatch()
-{
- while ( mmPos <= mmLen ) {
- if ( matchHere() )
- return TRUE;
- mmPos++;
- }
- return FALSE;
-}
-#endif
-
-/*
- Here's the core of the engine. It tries to do a match here and now.
-*/
-bool TQRegExpEngine::matchHere()
-{
- int ncur = 1, nnext = 0;
- int i = 0, j, k, m;
- bool stop = FALSE;
-
- mmMatchLen = -1;
- mmOneTestMatchedLen = -1;
- mmCurStack[0] = InitialState;
-
-#ifndef TQT_NO_REGEXP_CAPTURE
- if ( ncap > 0 ) {
- for ( j = 0; j < ncap; j++ ) {
- mmCurCapBegin[j] = EmptyCapture;
- mmCurCapEnd[j] = EmptyCapture;
- }
- }
-#endif
-
-#ifndef TQT_NO_REGEXP_BACKREF
- int *zzZ = 0;
-
- while ( (ncur > 0 || !mmSleeping.isEmpty()) && i <= mmLen - mmPos &&
- !stop )
-#else
- while ( ncur > 0 && i <= mmLen - mmPos && !stop )
-#endif
- {
- int ch = ( i < mmLen - mmPos ) ? mmIn[mmPos + i].tqunicode() : 0;
- for ( j = 0; j < ncur; j++ ) {
- int cur = mmCurStack[j];
- State *scur = s[cur];
- TQMemArray<int>& outs = scur->outs;
- for ( k = 0; k < (int) outs.size(); k++ ) {
- int next = outs[k];
- State *snext = s[next];
- bool in = TRUE;
-#ifndef TQT_NO_REGEXP_BACKREF
- int needSomeSleep = 0;
-#endif
-
- /*
- First, check if the anchors are anchored properly.
- */
- if ( scur->anchors != 0 ) {
- int a = at( *scur->anchors, next );
- if ( a != 0 && !testAnchor(i, a, mmCurCapBegin + j * ncap) )
- in = FALSE;
- }
- /*
- If indeed they are, check if the input character is
- correct for this transition.
- */
- if ( in ) {
- m = snext->match;
- if ( (m & (CharClassBit | BackRefBit)) == 0 ) {
- if ( cs )
- in = ( m == ch );
- else
- in = ( TQChar(m).lower() == TQChar(ch).lower() );
- } else if ( next == FinalState ) {
- mmMatchLen = i;
- stop = mmMinimal;
- in = TRUE;
- } else if ( (m & CharClassBit) != 0 ) {
-#ifndef TQT_NO_REGEXP_CCLASS
- const CharClass *cc = cl[m ^ CharClassBit];
- if ( cs )
- in = cc->in( ch );
- else if ( cc->negative() )
- in = cc->in( TQChar(ch).lower() ) &&
- cc->in( TQChar(ch).upper() );
- else
- in = cc->in( TQChar(ch).lower() ) ||
- cc->in( TQChar(ch).upper() );
-#endif
-#ifndef TQT_NO_REGEXP_BACKREF
- } else { /* ( (m & BackRefBit) != 0 ) */
- int bref = m ^ BackRefBit;
- int ell = j * ncap + ( bref - 1 );
-
- in = bref <= ncap && mmCurCapBegin[ell] != EmptyCapture;
- if ( in ) {
- if ( cs )
- in = ( mmIn[mmPos + mmCurCapBegin[ell]]
- == TQChar(ch) );
- else
- in = ( mmIn[mmPos + mmCurCapBegin[ell]].lower()
- == TQChar(ch).lower() );
- }
-
- if ( in ) {
- int delta;
- if ( mmCurCapEnd[ell] == EmptyCapture )
- delta = i - mmCurCapBegin[ell];
- else
- delta = mmCurCapEnd[ell] - mmCurCapBegin[ell];
-
- in = ( delta <= mmLen - (mmPos + i) );
- if ( in && delta > 1 ) {
- int n = 1;
- if ( cs ) {
- while ( n < delta ) {
- if ( mmIn[mmPos +
- mmCurCapBegin[ell] + n] !=
- mmIn[mmPos + i + n] )
- break;
- n++;
- }
- } else {
- while ( n < delta ) {
- TQChar a = mmIn[mmPos +
- mmCurCapBegin[ell] + n];
- TQChar b = mmIn[mmPos + i + n];
- if ( a.lower() != b.lower() )
- break;
- n++;
- }
- }
- in = ( n == delta );
- if ( in )
- needSomeSleep = delta - 1;
- }
- }
-#endif
- }
- }
-
- /*
- We must now update our data structures.
- */
- if ( in ) {
-#ifndef TQT_NO_REGEXP_CAPTURE
- int *capBegin, *capEnd;
-#endif
- /*
- If the next state was not encountered yet, all
- is fine.
- */
- if ( (m = mmInNextStack[next]) == -1 ) {
- m = nnext++;
- mmNextStack[m] = next;
- mmInNextStack[next] = m;
-#ifndef TQT_NO_REGEXP_CAPTURE
- capBegin = mmNextCapBegin + m * ncap;
- capEnd = mmNextCapEnd + m * ncap;
-
- /*
- Otherwise, we'll first maintain captures in
- temporary arrays, and decide at the end whether
- it's best to keep the previous capture zones or
- the new ones.
- */
- } else {
- capBegin = mmTempCapBegin;
- capEnd = mmTempCapEnd;
-#endif
- }
-
-#ifndef TQT_NO_REGEXP_CAPTURE
- /*
- Updating the capture zones is much of a task.
- */
- if ( ncap > 0 ) {
- memcpy( capBegin, mmCurCapBegin + j * ncap,
- ncap * sizeof(int) );
- memcpy( capEnd, mmCurCapEnd + j * ncap,
- ncap * sizeof(int) );
- int c = scur->atom, n = snext->atom;
- int p = -1, q = -1;
- int cap;
-
- /*
- Lemma 1. For any x in the range [0..nf), we
- have f[x].parent < x.
-
- Proof. By looking at startAtom(), it is
- clear that cf < nf holds all the time, and
- thus that f[nf].parent < nf.
- */
-
- /*
- If we are reentering an atom, we empty all
- capture zones inside it.
- */
- if ( scur->reenter != 0 &&
- (q = at(*scur->reenter, next)) != 0 ) {
- TQBitArray b;
- b.fill( FALSE, nf );
- b.setBit( q, TRUE );
- for ( int ell = q + 1; ell < nf; ell++ ) {
- if ( b.testBit(f[ell].parent) ) {
- b.setBit( ell, TRUE );
- cap = f[ell].capture;
- if ( cap >= 0 ) {
- capBegin[cap] = EmptyCapture;
- capEnd[cap] = EmptyCapture;
- }
- }
- }
- p = f[q].parent;
-
- /*
- Otherwise, close the capture zones we are
- leaving. We are leaving f[c].capture,
- f[f[c].parent].capture,
- f[f[f[c].parent].parent].capture, ...,
- until f[x].capture, with x such that
- f[x].parent is the youngest common ancestor
- for c and n.
-
- We go up along c's and n's ancestry until
- we find x.
- */
- } else {
- p = c;
- q = n;
- while ( p != q ) {
- if ( p > q ) {
- cap = f[p].capture;
- if ( cap >= 0 ) {
- if ( capBegin[cap] == i ) {
- capBegin[cap] = EmptyCapture;
- capEnd[cap] = EmptyCapture;
- } else {
- capEnd[cap] = i;
- }
- }
- p = f[p].parent;
- } else {
- q = f[q].parent;
- }
- }
- }
-
- /*
- In any case, we now open the capture zones
- we are entering. We work upwards from n
- until we reach p (the parent of the atom we
- reenter or the youngest common ancestor).
- */
- while ( n > p ) {
- cap = f[n].capture;
- if ( cap >= 0 ) {
- capBegin[cap] = i;
- capEnd[cap] = EmptyCapture;
- }
- n = f[n].parent;
- }
- /*
- If the next state was already in
- mmNextStack, we must choose carefully which
- capture zones we want to keep.
- */
- if ( capBegin == mmTempCapBegin &&
- isBetterCapture(capBegin, capEnd,
- mmNextCapBegin + m * ncap,
- mmNextCapEnd + m * ncap) ) {
- memcpy( mmNextCapBegin + m * ncap, capBegin,
- ncap * sizeof(int) );
- memcpy( mmNextCapEnd + m * ncap, capEnd,
- ncap * sizeof(int) );
- }
- }
-#ifndef TQT_NO_REGEXP_BACKREF
- /*
- We are done with updating the capture zones.
- It's now time to put the next state to sleep,
- if it needs to, and to remove it from
- mmNextStack.
- */
- if ( needSomeSleep > 0 ) {
- zzZ = new int[1 + 2 * ncap];
- zzZ[0] = next;
- if ( ncap > 0 ) {
- memcpy( zzZ + 1, capBegin, ncap * sizeof(int) );
- memcpy( zzZ + 1 + ncap, capEnd,
- ncap * sizeof(int) );
- }
- mmInNextStack[mmNextStack[--nnext]] = -1;
- mmSleeping.insert( i + needSomeSleep, zzZ );
- }
-#endif
-#endif
- }
- }
- }
-#ifndef TQT_NO_REGEXP_CAPTURE
- /*
- If we reached the final state, hurray! Copy the captured
- zone.
- */
- if ( ncap > 0 && (m = mmInNextStack[FinalState]) != -1 ) {
- memcpy( mmCapBegin, mmNextCapBegin + m * ncap, ncap * sizeof(int) );
- memcpy( mmCapEnd, mmNextCapEnd + m * ncap, ncap * sizeof(int) );
- }
-#ifndef TQT_NO_REGEXP_BACKREF
- /*
- It's time to wake up the sleepers.
- */
- if ( !mmSleeping.isEmpty() ) {
- while ( (zzZ = mmSleeping.take(i)) != 0 ) {
- int next = zzZ[0];
- int *capBegin = zzZ + 1;
- int *capEnd = zzZ + 1 + ncap;
- bool copyOver = TRUE;
-
- if ( (m = mmInNextStack[zzZ[0]]) == -1 ) {
- m = nnext++;
- mmNextStack[m] = next;
- mmInNextStack[next] = m;
- } else {
- copyOver = isBetterCapture( mmNextCapBegin + m * ncap,
- mmNextCapEnd + m * ncap,
- capBegin, capEnd );
- }
- if ( copyOver ) {
- memcpy( mmNextCapBegin + m * ncap, capBegin,
- ncap * sizeof(int) );
- memcpy( mmNextCapEnd + m * ncap, capEnd,
- ncap * sizeof(int) );
- }
- delete[] zzZ;
- }
- }
-#endif
-#endif
- for ( j = 0; j < nnext; j++ )
- mmInNextStack[mmNextStack[j]] = -1;
-
- // avoid needless iteration that confuses mmOneTestMatchedLen
- if ( nnext == 1 && mmNextStack[0] == FinalState
-#ifndef TQT_NO_REGEXP_BACKREF
- && mmSleeping.isEmpty()
-#endif
- )
- stop = TRUE;
-
- tqSwap( mmCurStack, mmNextStack );
-#ifndef TQT_NO_REGEXP_CAPTURE
- tqSwap( mmCurCapBegin, mmNextCapBegin );
- tqSwap( mmCurCapEnd, mmNextCapEnd );
-#endif
- ncur = nnext;
- nnext = 0;
- i++;
- }
-
-#ifndef TQT_NO_REGEXP_BACKREF
- /*
- If minimal matching is enabled, we might have some sleepers
- left.
- */
- while ( !mmSleeping.isEmpty() ) {
- zzZ = mmSleeping.take( *TQIntDictIterator<int>(mmSleeping) );
- delete[] zzZ;
- }
-#endif
-
- mmOneTestMatchedLen = i - 1;
- return ( mmMatchLen >= 0 );
-}
-
-#ifndef TQT_NO_REGEXP_CCLASS
-
-TQRegExpEngine::CharClass::CharClass()
- : c( 0 ), n( FALSE )
-{
-#ifndef TQT_NO_REGEXP_OPTIM
- occ1.fill( NoOccurrence, NumBadChars );
-#endif
-}
-
-TQRegExpEngine::CharClass& TQRegExpEngine::CharClass::operator=(
- const CharClass& cc )
-{
- c = cc.c;
- r = cc.r.copy();
- n = cc.n;
-#ifndef TQT_NO_REGEXP_OPTIM
- occ1 = cc.occ1;
-#endif
- return *this;
-}
-
-void TQRegExpEngine::CharClass::clear()
-{
- c = 0;
- r.resize( 0 );
- n = FALSE;
-}
-
-void TQRegExpEngine::CharClass::setNegative( bool negative )
-{
- n = negative;
-#ifndef TQT_NO_REGEXP_OPTIM
- occ1.fill( 0, NumBadChars );
-#endif
-}
-
-void TQRegExpEngine::CharClass::addCategories( int cats )
-{
- c |= cats;
-#ifndef TQT_NO_REGEXP_OPTIM
- occ1.fill( 0, NumBadChars );
-#endif
-}
-
-void TQRegExpEngine::CharClass::addRange( ushort from, ushort to )
-{
- if ( from > to )
- tqSwap( from, to );
- int m = r.size();
- r.resize( m + 1 );
- r[m].from = from;
- r[m].to = to;
-
-#ifndef TQT_NO_REGEXP_OPTIM
- int i;
-
- if ( to - from < NumBadChars ) {
- occ1.detach();
- if ( from % NumBadChars <= to % NumBadChars ) {
- for ( i = from % NumBadChars; i <= to % NumBadChars; i++ )
- occ1[i] = 0;
- } else {
- for ( i = 0; i <= to % NumBadChars; i++ )
- occ1[i] = 0;
- for ( i = from % NumBadChars; i < NumBadChars; i++ )
- occ1[i] = 0;
- }
- } else {
- occ1.fill( 0, NumBadChars );
- }
-#endif
-}
-
-bool TQRegExpEngine::CharClass::in( TQChar ch ) const
-{
-#ifndef TQT_NO_REGEXP_OPTIM
- if ( occ1[BadChar(ch)] == NoOccurrence )
- return n;
-#endif
-
- if ( c != 0 && (c & (1 << (int) ch.category())) != 0 )
- return !n;
- for ( int i = 0; i < (int) r.size(); i++ ) {
- if ( ch.tqunicode() >= r[i].from && ch.tqunicode() <= r[i].to )
- return !n;
- }
- return n;
-}
-
-#if defined(TQT_DEBUG)
-void TQRegExpEngine::CharClass::dump() const
-{
- int i;
- qDebug( " %stive character class", n ? "nega" : "posi" );
-#ifndef TQT_NO_REGEXP_CCLASS
- if ( c != 0 )
- qDebug( " categories 0x%.8x", c );
-#endif
- for ( i = 0; i < (int) r.size(); i++ )
- qDebug( " 0x%.4x through 0x%.4x", r[i].from, r[i].to );
-}
-#endif
-#endif
-
-TQRegExpEngine::Box::Box( TQRegExpEngine *engine )
- : eng( engine ), skipanchors( 0 )
-#ifndef TQT_NO_REGEXP_OPTIM
- , earlyStart( 0 ), lateStart( 0 ), maxl( 0 )
-#endif
-{
-#ifndef TQT_NO_REGEXP_OPTIM
- occ1.fill( NoOccurrence, NumBadChars );
-#endif
- minl = 0;
-}
-
-TQRegExpEngine::Box& TQRegExpEngine::Box::operator=( const Box& b )
-{
- eng = b.eng;
- ls = b.ls;
- rs = b.rs;
- lanchors = b.lanchors;
- ranchors = b.ranchors;
- skipanchors = b.skipanchors;
-#ifndef TQT_NO_REGEXP_OPTIM
- earlyStart = b.earlyStart;
- lateStart = b.lateStart;
- str = b.str;
- leftStr = b.leftStr;
- rightStr = b.rightStr;
- maxl = b.maxl;
- occ1 = b.occ1;
-#endif
- minl = b.minl;
- return *this;
-}
-
-void TQRegExpEngine::Box::set( TQChar ch )
-{
- ls.resize( 1 );
- ls[0] = eng->createState( ch );
- rs = ls;
- rs.detach();
-#ifndef TQT_NO_REGEXP_OPTIM
- str = ch;
- leftStr = ch;
- rightStr = ch;
- maxl = 1;
- occ1.detach();
- occ1[BadChar(ch)] = 0;
-#endif
- minl = 1;
-}
-
-void TQRegExpEngine::Box::set( const CharClass& cc )
-{
- ls.resize( 1 );
- ls[0] = eng->createState( cc );
- rs = ls;
- rs.detach();
-#ifndef TQT_NO_REGEXP_OPTIM
- maxl = 1;
- occ1 = cc.firstOccurrence();
-#endif
- minl = 1;
-}
-
-#ifndef TQT_NO_REGEXP_BACKREF
-void TQRegExpEngine::Box::set( int bref )
-{
- ls.resize( 1 );
- ls[0] = eng->createState( bref );
- rs = ls;
- rs.detach();
- if ( bref >= 1 && bref <= MaxBackRefs )
- skipanchors = Anchor_BackRef0Empty << bref;
-#ifndef TQT_NO_REGEXP_OPTIM
- maxl = InftyLen;
-#endif
- minl = 0;
-}
-#endif
-
-void TQRegExpEngine::Box::cat( const Box& b )
-{
- eng->addCatTransitions( rs, b.ls );
- addAnchorsToEngine( b );
- if ( minl == 0 ) {
- mergeInto( &lanchors, b.lanchors );
- if ( skipanchors != 0 ) {
- for ( int i = 0; i < (int) b.ls.size(); i++ ) {
- int a = eng->anchorConcatenation( at(lanchors, b.ls[i]),
- skipanchors );
- lanchors.insert( b.ls[i], a );
- }
- }
- mergeInto( &ls, b.ls );
- }
- if ( b.minl == 0 ) {
- mergeInto( &ranchors, b.ranchors );
- if ( b.skipanchors != 0 ) {
- for ( int i = 0; i < (int) rs.size(); i++ ) {
- int a = eng->anchorConcatenation( at(ranchors, rs[i]),
- b.skipanchors );
- ranchors.insert( rs[i], a );
- }
- }
- mergeInto( &rs, b.rs );
- } else {
- ranchors = b.ranchors;
- rs = b.rs;
- }
-
-#ifndef TQT_NO_REGEXP_OPTIM
- if ( maxl != InftyLen ) {
- if ( rightStr.length() + b.leftStr.length() >
- TQMAX(str.length(), b.str.length()) ) {
- earlyStart = minl - rightStr.length();
- lateStart = maxl - rightStr.length();
- str = rightStr + b.leftStr;
- } else if ( b.str.length() > str.length() ) {
- earlyStart = minl + b.earlyStart;
- lateStart = maxl + b.lateStart;
- str = b.str;
- }
- }
-
- if ( (int) leftStr.length() == maxl )
- leftStr += b.leftStr;
-
- if ( (int) b.rightStr.length() == b.maxl ) {
- rightStr += b.rightStr;
- } else {
- rightStr = b.rightStr;
- }
-
- if ( maxl == InftyLen || b.maxl == InftyLen ) {
- maxl = InftyLen;
- } else {
- maxl += b.maxl;
- }
-
- occ1.detach();
- for ( int i = 0; i < NumBadChars; i++ ) {
- if ( b.occ1[i] != NoOccurrence && minl + b.occ1[i] < occ1[i] )
- occ1[i] = minl + b.occ1[i];
- }
-#endif
-
- minl += b.minl;
- if ( minl == 0 )
- skipanchors = eng->anchorConcatenation( skipanchors, b.skipanchors );
- else
- skipanchors = 0;
-}
-
-void TQRegExpEngine::Box::orx( const Box& b )
-{
- mergeInto( &ls, b.ls );
- mergeInto( &lanchors, b.lanchors );
- mergeInto( &rs, b.rs );
- mergeInto( &ranchors, b.ranchors );
-
- if ( b.minl == 0 ) {
- if ( minl == 0 )
- skipanchors = eng->anchorAlternation( skipanchors, b.skipanchors );
- else
- skipanchors = b.skipanchors;
- }
-
-#ifndef TQT_NO_REGEXP_OPTIM
- occ1.detach();
- for ( int i = 0; i < NumBadChars; i++ ) {
- if ( occ1[i] > b.occ1[i] )
- occ1[i] = b.occ1[i];
- }
- earlyStart = 0;
- lateStart = 0;
- str = TQString();
- leftStr = TQString();
- rightStr = TQString();
- if ( b.maxl > maxl )
- maxl = b.maxl;
-#endif
- if ( b.minl < minl )
- minl = b.minl;
-}
-
-void TQRegExpEngine::Box::plus( int atom )
-{
-#ifndef TQT_NO_REGEXP_CAPTURE
- eng->addPlusTransitions( rs, ls, atom );
-#else
- TQ_UNUSED( atom );
- eng->addCatTransitions( rs, ls );
-#endif
- addAnchorsToEngine( *this );
-#ifndef TQT_NO_REGEXP_OPTIM
- maxl = InftyLen;
-#endif
-}
-
-void TQRegExpEngine::Box::opt()
-{
-#ifndef TQT_NO_REGEXP_OPTIM
- earlyStart = 0;
- lateStart = 0;
- str = TQString();
- leftStr = TQString();
- rightStr = TQString();
-#endif
- skipanchors = 0;
- minl = 0;
-}
-
-void TQRegExpEngine::Box::catAnchor( int a )
-{
- if ( a != 0 ) {
- for ( int i = 0; i < (int) rs.size(); i++ ) {
- a = eng->anchorConcatenation( at(ranchors, rs[i]), a );
- ranchors.insert( rs[i], a );
- }
- if ( minl == 0 )
- skipanchors = eng->anchorConcatenation( skipanchors, a );
- }
-}
-
-#ifndef TQT_NO_REGEXP_OPTIM
-void TQRegExpEngine::Box::setupHeuristics()
-{
- eng->goodEarlyStart = earlyStart;
- eng->goodLateStart = lateStart;
- eng->goodStr = eng->cs ? str : str.lower();
-
- eng->minl = minl;
- if ( eng->cs ) {
- /*
- A regular expression such as 112|1 has occ1['2'] = 2 and minl =
- 1 at this point. An entry of occ1 has to be at most minl or
- infinity for the rest of the algorithm to go well.
-
- We waited until here before normalizing these cases (instead of
- doing it in Box::orx()) because sometimes things improve by
- themselves. Consider for example (112|1)34.
- */
- for ( int i = 0; i < NumBadChars; i++ ) {
- if ( occ1[i] != NoOccurrence && occ1[i] >= minl )
- occ1[i] = minl;
- }
- eng->occ1 = occ1;
- } else {
- eng->occ1.fill( 0, NumBadChars );
- }
-
- eng->heuristicallyChooseHeuristic();
-}
-#endif
-
-#if defined(TQT_DEBUG)
-void TQRegExpEngine::Box::dump() const
-{
- int i;
- qDebug( "Box of at least %d character%s", minl, minl == 1 ? "" : "s" );
- qDebug( " Left states:" );
- for ( i = 0; i < (int) ls.size(); i++ ) {
- if ( at(lanchors, ls[i]) == 0 )
- qDebug( " %d", ls[i] );
- else
- qDebug( " %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]] );
- }
- qDebug( " Right states:" );
- for ( i = 0; i < (int) rs.size(); i++ ) {
- if ( at(ranchors, rs[i]) == 0 )
- qDebug( " %d", rs[i] );
- else
- qDebug( " %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]] );
- }
- qDebug( " Skip anchors: 0x%.8x", skipanchors );
-}
-#endif
-
-void TQRegExpEngine::Box::addAnchorsToEngine( const Box& to ) const
-{
- for ( int i = 0; i < (int) to.ls.size(); i++ ) {
- for ( int j = 0; j < (int) rs.size(); j++ ) {
- int a = eng->anchorConcatenation( at(ranchors, rs[j]),
- at(to.lanchors, to.ls[i]) );
- eng->addAnchors( rs[j], to.ls[i], a );
- }
- }
-}
-
-int TQRegExpEngine::getChar()
-{
- return ( yyPos == yyLen ) ? EOS : yyIn[yyPos++].tqunicode();
-}
-
-int TQRegExpEngine::getEscape()
-{
-#ifndef TQT_NO_REGEXP_ESCAPE
- const char tab[] = "afnrtv"; // no b, as \b means word boundary
- const char backTab[] = "\a\f\n\r\t\v";
- ushort low;
- int i;
-#endif
- ushort val;
- int prevCh = yyCh;
-
- if ( prevCh == EOS ) {
- error( RXERR_END );
- return Tok_Char | '\\';
- }
- yyCh = getChar();
-#ifndef TQT_NO_REGEXP_ESCAPE
- if ( (prevCh & ~0xff) == 0 ) {
- const char *p = strchr( tab, prevCh );
- if ( p != 0 )
- return Tok_Char | backTab[p - tab];
- }
-#endif
-
- switch ( prevCh ) {
-#ifndef TQT_NO_REGEXP_ESCAPE
- case '0':
- val = 0;
- for ( i = 0; i < 3; i++ ) {
- if ( yyCh >= '0' && yyCh <= '7' )
- val = ( val << 3 ) | ( yyCh - '0' );
- else
- break;
- yyCh = getChar();
- }
- if ( (val & ~0377) != 0 )
- error( RXERR_OCTAL );
- return Tok_Char | val;
-#endif
-#ifndef TQT_NO_REGEXP_ESCAPE
- case 'B':
- return Tok_NonWord;
-#endif
-#ifndef TQT_NO_REGEXP_CCLASS
- case 'D':
- // see TQChar::isDigit()
- yyCharClass->addCategories( 0x7fffffef );
- return Tok_CharClass;
- case 'S':
- // see TQChar::isSpace()
- yyCharClass->addCategories( 0x7ffff87f );
- yyCharClass->addRange( 0x0000, 0x0008 );
- yyCharClass->addRange( 0x000e, 0x001f );
- yyCharClass->addRange( 0x007f, 0x009f );
- return Tok_CharClass;
- case 'W':
- // see TQChar::isLetterOrNumber()
- yyCharClass->addCategories( 0x7fe07f8f );
- yyCharClass->addRange( 0x203f, 0x2040 );
- yyCharClass->addSingleton( 0x2040 );
- yyCharClass->addSingleton( 0x30fb );
- yyCharClass->addRange( 0xfe33, 0xfe34 );
- yyCharClass->addRange( 0xfe4d, 0xfe4f );
- yyCharClass->addSingleton( 0xff3f );
- yyCharClass->addSingleton( 0xff65 );
- return Tok_CharClass;
-#endif
-#ifndef TQT_NO_REGEXP_ESCAPE
- case 'b':
- return Tok_Word;
-#endif
-#ifndef TQT_NO_REGEXP_CCLASS
- case 'd':
- // see TQChar::isDigit()
- yyCharClass->addCategories( 0x00000010 );
- return Tok_CharClass;
- case 's':
- // see TQChar::isSpace()
- yyCharClass->addCategories( 0x00000380 );
- yyCharClass->addRange( 0x0009, 0x000d );
- return Tok_CharClass;
- case 'w':
- // see TQChar::isLetterOrNumber()
- yyCharClass->addCategories( 0x000f8070 );
- yyCharClass->addSingleton( 0x005f ); // '_'
- return Tok_CharClass;
-#endif
-#ifndef TQT_NO_REGEXP_ESCAPE
- case 'x':
- val = 0;
- for ( i = 0; i < 4; i++ ) {
- low = TQChar( yyCh ).lower();
- if ( low >= '0' && low <= '9' )
- val = ( val << 4 ) | ( low - '0' );
- else if ( low >= 'a' && low <= 'f' )
- val = ( val << 4 ) | ( low - 'a' + 10 );
- else
- break;
- yyCh = getChar();
- }
- return Tok_Char | val;
-#endif
- default:
- if ( prevCh >= '1' && prevCh <= '9' ) {
-#ifndef TQT_NO_REGEXP_BACKREF
- val = prevCh - '0';
- while ( yyCh >= '0' && yyCh <= '9' ) {
- val = ( val * 10 ) + ( yyCh - '0' );
- yyCh = getChar();
- }
- return Tok_BackRef | val;
-#else
- error( RXERR_DISABLED );
-#endif
- }
- return Tok_Char | prevCh;
- }
-}
-
-#ifndef TQT_NO_REGEXP_INTERVAL
-int TQRegExpEngine::getRep( int def )
-{
- if ( yyCh >= '0' && yyCh <= '9' ) {
- int rep = 0;
- do {
- rep = 10 * rep + yyCh - '0';
- if ( rep >= InftyRep ) {
- error( RXERR_REPETITION );
- rep = def;
- }
- yyCh = getChar();
- } while ( yyCh >= '0' && yyCh <= '9' );
- return rep;
- } else {
- return def;
- }
-}
-#endif
-
-#ifndef TQT_NO_REGEXP_LOOKAHEAD
-void TQRegExpEngine::skipChars( int n )
-{
- if ( n > 0 ) {
- yyPos += n - 1;
- yyCh = getChar();
- }
-}
-#endif
-
-void TQRegExpEngine::error( const char *msg )
-{
- if ( yyError.isEmpty() )
- yyError = TQString::tqfromLatin1( msg );
-}
-
-void TQRegExpEngine::startTokenizer( const TQChar *rx, int len )
-{
- yyIn = rx;
- yyPos0 = 0;
- yyPos = 0;
- yyLen = len;
- yyCh = getChar();
- yyCharClass = new CharClass;
- yyMinRep = 0;
- yyMaxRep = 0;
- yyError = TQString();
-}
-
-int TQRegExpEngine::getToken()
-{
-#ifndef TQT_NO_REGEXP_CCLASS
- ushort pendingCh = 0;
- bool charPending;
- bool rangePending;
- int tok;
-#endif
- int prevCh = yyCh;
-
- yyPos0 = yyPos - 1;
-#ifndef TQT_NO_REGEXP_CCLASS
- yyCharClass->clear();
-#endif
- yyMinRep = 0;
- yyMaxRep = 0;
- yyCh = getChar();
-
- switch ( prevCh ) {
- case EOS:
- yyPos0 = yyPos;
- return Tok_Eos;
- case '$':
- return Tok_Dollar;
- case '(':
- if ( yyCh == '?' ) {
- prevCh = getChar();
- yyCh = getChar();
- switch ( prevCh ) {
-#ifndef TQT_NO_REGEXP_LOOKAHEAD
- case '!':
- return Tok_NegLookahead;
- case '=':
- return Tok_PosLookahead;
-#endif
- case ':':
- return Tok_MagicLeftParen;
- default:
- error( RXERR_LOOKAHEAD );
- return Tok_MagicLeftParen;
- }
- } else {
- return Tok_LeftParen;
- }
- case ')':
- return Tok_RightParen;
- case '*':
- yyMinRep = 0;
- yyMaxRep = InftyRep;
- return Tok_Quantifier;
- case '+':
- yyMinRep = 1;
- yyMaxRep = InftyRep;
- return Tok_Quantifier;
- case '.':
-#ifndef TQT_NO_REGEXP_CCLASS
- yyCharClass->setNegative( TRUE );
-#endif
- return Tok_CharClass;
- case '?':
- yyMinRep = 0;
- yyMaxRep = 1;
- return Tok_Quantifier;
- case '[':
-#ifndef TQT_NO_REGEXP_CCLASS
- if ( yyCh == '^' ) {
- yyCharClass->setNegative( TRUE );
- yyCh = getChar();
- }
- charPending = FALSE;
- rangePending = FALSE;
- do {
- if ( yyCh == '-' && charPending && !rangePending ) {
- rangePending = TRUE;
- yyCh = getChar();
- } else {
- if ( charPending && !rangePending ) {
- yyCharClass->addSingleton( pendingCh );
- charPending = FALSE;
- }
- if ( yyCh == '\\' ) {
- yyCh = getChar();
- tok = getEscape();
- if ( tok == Tok_Word )
- tok = '\b';
- } else {
- tok = Tok_Char | yyCh;
- yyCh = getChar();
- }
- if ( tok == Tok_CharClass ) {
- if ( rangePending ) {
- yyCharClass->addSingleton( '-' );
- yyCharClass->addSingleton( pendingCh );
- charPending = FALSE;
- rangePending = FALSE;
- }
- } else if ( (tok & Tok_Char) != 0 ) {
- if ( rangePending ) {
- yyCharClass->addRange( pendingCh, tok ^ Tok_Char );
- charPending = FALSE;
- rangePending = FALSE;
- } else {
- pendingCh = tok ^ Tok_Char;
- charPending = TRUE;
- }
- } else {
- error( RXERR_CHARCLASS );
- }
- }
- } while ( yyCh != ']' && yyCh != EOS );
- if ( rangePending )
- yyCharClass->addSingleton( '-' );
- if ( charPending )
- yyCharClass->addSingleton( pendingCh );
- if ( yyCh == EOS )
- error( RXERR_END );
- else
- yyCh = getChar();
- return Tok_CharClass;
-#else
- error( RXERR_END );
- return Tok_Char | '[';
-#endif
- case '\\':
- return getEscape();
- case ']':
- error( RXERR_LEFTDELIM );
- return Tok_Char | ']';
- case '^':
- return Tok_Caret;
- case '{':
-#ifndef TQT_NO_REGEXP_INTERVAL
- yyMinRep = getRep( 0 );
- yyMaxRep = yyMinRep;
- if ( yyCh == ',' ) {
- yyCh = getChar();
- yyMaxRep = getRep( InftyRep );
- }
- if ( yyMaxRep < yyMinRep )
- tqSwap( yyMinRep, yyMaxRep );
- if ( yyCh != '}' )
- error( RXERR_REPETITION );
- yyCh = getChar();
- return Tok_Quantifier;
-#else
- error( RXERR_DISABLED );
- return Tok_Char | '{';
-#endif
- case '|':
- return Tok_Bar;
- case '}':
- error( RXERR_LEFTDELIM );
- return Tok_Char | '}';
- default:
- return Tok_Char | prevCh;
- }
-}
-
-int TQRegExpEngine::parse( const TQChar *pattern, int len )
-{
- valid = TRUE;
- startTokenizer( pattern, len );
- yyTok = getToken();
-#ifndef TQT_NO_REGEXP_CAPTURE
- yyMayCapture = TRUE;
-#else
- yyMayCapture = FALSE;
-#endif
-
-#ifndef TQT_NO_REGEXP_CAPTURE
- int atom = startAtom( FALSE );
-#endif
- CharClass anything;
- Box box( this ); // create InitialState
- box.set( anything );
- Box rightBox( this ); // create FinalState
- rightBox.set( anything );
-
- Box middleBox( this );
- parseExpression( &middleBox );
-#ifndef TQT_NO_REGEXP_CAPTURE
- finishAtom( atom );
-#endif
-#ifndef TQT_NO_REGEXP_OPTIM
- middleBox.setupHeuristics();
-#endif
- box.cat( middleBox );
- box.cat( rightBox );
- delete yyCharClass;
- yyCharClass = 0;
-
- officialncap = ncap;
-#ifndef TQT_NO_REGEXP_BACKREF
- if ( nbrefs > ncap )
- ncap = nbrefs;
-#endif
-
- /*
- We use one TQMemArray<int> for all the big data used a lot in
- matchHere() and friends.
- */
-#ifndef TQT_NO_REGEXP_OPTIM
- mmSlideTabSize = TQMAX( minl + 1, 16 );
-#else
- mmSlideTabSize = 0;
-#endif
- mmBigArray.resize( (3 + 4 * ncap) * ns + 4 * ncap + mmSlideTabSize );
-
- mmInNextStack = mmBigArray.data();
- memset( mmInNextStack, -1, ns * sizeof(int) );
- mmCurStack = mmInNextStack + ns;
- mmNextStack = mmInNextStack + 2 * ns;
-
- mmCurCapBegin = mmInNextStack + 3 * ns;
- mmNextCapBegin = mmCurCapBegin + ncap * ns;
- mmCurCapEnd = mmCurCapBegin + 2 * ncap * ns;
- mmNextCapEnd = mmCurCapBegin + 3 * ncap * ns;
-
- mmTempCapBegin = mmCurCapBegin + 4 * ncap * ns;
- mmTempCapEnd = mmTempCapBegin + ncap;
- mmCapBegin = mmTempCapBegin + 2 * ncap;
- mmCapEnd = mmTempCapBegin + 3 * ncap;
-
- mmSlideTab = mmTempCapBegin + 4 * ncap;
-
- if ( !yyError.isEmpty() )
- return -1;
-
-#ifndef TQT_NO_REGEXP_OPTIM
- State *sinit = s[InitialState];
- caretAnchored = ( sinit->anchors != 0 );
- if ( caretAnchored ) {
- TQMap<int, int>& anchors = *sinit->anchors;
- TQMap<int, int>::ConstIterator a;
- for ( a = anchors.begin(); a != anchors.end(); ++a ) {
- if (
-#ifndef TQT_NO_REGEXP_ANCHOR_ALT
- (*a & Anchor_Alternation) != 0 ||
-#endif
- (*a & Anchor_Caret) == 0 ) {
- caretAnchored = FALSE;
- break;
- }
- }
- }
-#endif
- return yyPos0;
-}
-
-void TQRegExpEngine::parseAtom( Box *box )
-{
-#ifndef TQT_NO_REGEXP_LOOKAHEAD
- TQRegExpEngine *eng = 0;
- bool neg;
- int len;
-#endif
-
- if ( (yyTok & Tok_Char) != 0 ) {
- box->set( TQChar(yyTok ^ Tok_Char) );
- } else {
-#ifndef TQT_NO_REGEXP_OPTIM
- trivial = FALSE;
-#endif
- switch ( yyTok ) {
- case Tok_Dollar:
- box->catAnchor( Anchor_Dollar );
- break;
- case Tok_Caret:
- box->catAnchor( Anchor_Caret );
- break;
-#ifndef TQT_NO_REGEXP_LOOKAHEAD
- case Tok_PosLookahead:
- case Tok_NegLookahead:
- neg = ( yyTok == Tok_NegLookahead );
- eng = new TQRegExpEngine( cs );
- len = eng->parse( yyIn + yyPos - 1, yyLen - yyPos + 1 );
- if ( len >= 0 )
- skipChars( len );
- else
- error( RXERR_LOOKAHEAD );
- box->catAnchor( addLookahead(eng, neg) );
- yyTok = getToken();
- if ( yyTok != Tok_RightParen )
- error( RXERR_LOOKAHEAD );
- break;
-#endif
-#ifndef TQT_NO_REGEXP_ESCAPE
- case Tok_Word:
- box->catAnchor( Anchor_Word );
- break;
- case Tok_NonWord:
- box->catAnchor( Anchor_NonWord );
- break;
-#endif
- case Tok_LeftParen:
- case Tok_MagicLeftParen:
- yyTok = getToken();
- parseExpression( box );
- if ( yyTok != Tok_RightParen )
- error( RXERR_END );
- break;
- case Tok_CharClass:
- box->set( *yyCharClass );
- break;
- case Tok_Quantifier:
- error( RXERR_REPETITION );
- break;
- default:
-#ifndef TQT_NO_REGEXP_BACKREF
- if ( (yyTok & Tok_BackRef) != 0 )
- box->set( yyTok ^ Tok_BackRef );
- else
-#endif
- error( RXERR_DISABLED );
- }
- }
- yyTok = getToken();
-}
-
-void TQRegExpEngine::parseFactor( Box *box )
-{
-#ifndef TQT_NO_REGEXP_CAPTURE
- int atom = startAtom( yyMayCapture && yyTok == Tok_LeftParen );
-#else
- static const int atom = 0;
-#endif
-
-#ifndef TQT_NO_REGEXP_INTERVAL
-#define YYREDO() \
- yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
- *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
-
- const TQChar *in = yyIn;
- int pos0 = yyPos0;
- int pos = yyPos;
- int len = yyLen;
- int ch = yyCh;
- CharClass charClass;
- if ( yyTok == Tok_CharClass )
- charClass = *yyCharClass;
- int tok = yyTok;
- bool mayCapture = yyMayCapture;
-#endif
-
- parseAtom( box );
-#ifndef TQT_NO_REGEXP_CAPTURE
- finishAtom( atom );
-#endif
-
- if ( yyTok == Tok_Quantifier ) {
-#ifndef TQT_NO_REGEXP_OPTIM
- trivial = FALSE;
-#endif
- if ( yyMaxRep == InftyRep ) {
- box->plus( atom );
-#ifndef TQT_NO_REGEXP_INTERVAL
- } else if ( yyMaxRep == 0 ) {
- box->clear();
-#endif
- }
- if ( yyMinRep == 0 )
- box->opt();
-
-#ifndef TQT_NO_REGEXP_INTERVAL
- yyMayCapture = FALSE;
- int alpha = ( yyMinRep == 0 ) ? 0 : yyMinRep - 1;
- int beta = ( yyMaxRep == InftyRep ) ? 0 : yyMaxRep - ( alpha + 1 );
-
- Box rightBox( this );
- int i;
-
- for ( i = 0; i < beta; i++ ) {
- YYREDO();
- Box leftBox( this );
- parseAtom( &leftBox );
- leftBox.cat( rightBox );
- leftBox.opt();
- rightBox = leftBox;
- }
- for ( i = 0; i < alpha; i++ ) {
- YYREDO();
- Box leftBox( this );
- parseAtom( &leftBox );
- leftBox.cat( rightBox );
- rightBox = leftBox;
- }
- rightBox.cat( *box );
- *box = rightBox;
-#endif
- yyTok = getToken();
-#ifndef TQT_NO_REGEXP_INTERVAL
- yyMayCapture = mayCapture;
-#endif
- }
-#undef YYREDO
-}
-
-void TQRegExpEngine::parseTerm( Box *box )
-{
-#ifndef TQT_NO_REGEXP_OPTIM
- if ( yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar )
- parseFactor( box );
-#endif
- while ( yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar ) {
- Box rightBox( this );
- parseFactor( &rightBox );
- box->cat( rightBox );
- }
-}
-
-void TQRegExpEngine::parseExpression( Box *box )
-{
- parseTerm( box );
- while ( yyTok == Tok_Bar ) {
-#ifndef TQT_NO_REGEXP_OPTIM
- trivial = FALSE;
-#endif
- Box rightBox( this );
- yyTok = getToken();
- parseTerm( &rightBox );
- box->orx( rightBox );
- }
-}
-
-/*
- The struct TQRegExpPrivate contains the private data of a regular
- expression other than the automaton. It makes it possible for many
- TQRegExp objects to use the same TQRegExpEngine object with different
- TQRegExpPrivate objects.
-*/
-struct TQRegExpPrivate
-{
- TQString pattern; // regular-expression or wildcard pattern
- TQString rxpattern; // regular-expression pattern
-#ifndef TQT_NO_REGEXP_WILDCARD
- bool wc : 1; // wildcard mode?
-#endif
- bool min : 1; // minimal matching? (instead of maximal)
- bool cs : 1; // case sensitive?
-#ifndef TQT_NO_REGEXP_CAPTURE
- TQString t; // last string passed to TQRegExp::search() or searchRev()
- TQStringList capturedCache; // what TQRegExp::capturedTexts() returned last
-#endif
- TQMemArray<int> captured; // what TQRegExpEngine::search() returned last
-
- TQRegExpPrivate() { captured.fill( -1, 2 ); }
-};
-
-#ifndef TQT_NO_REGEXP_OPTIM
-static TQSingleCleanupHandler<TQCache<TQRegExpEngine> > cleanup_cache;
-# ifndef TQT_THREAD_SUPPORT
-static TQCache<TQRegExpEngine> *engineCache = 0;
-# endif // TQT_THREAD_SUPPORT
-#endif // TQT_NO_REGEXP_OPTIM
-
-static void regexpEngine( TQRegExpEngine *&eng, const TQString &pattern,
- bool caseSensitive, bool deref )
-{
-# ifdef TQT_THREAD_SUPPORT
- static TQThreadStorage<TQCache<TQRegExpEngine> *> engineCaches;
- TQCache<TQRegExpEngine> *engineCache = 0;
- TQThreadInstance *currentThread = TQThreadInstance::current();
- if (currentThread)
- engineCache = engineCaches.localData();
-#endif // TQT_THREAD_SUPPORT
-
- if ( !deref ) {
-#ifndef TQT_NO_REGEXP_OPTIM
-# ifdef TQT_THREAD_SUPPORT
- if ( currentThread )
-# endif
- {
- if ( engineCache != 0 ) {
- eng = engineCache->take( pattern );
- if ( eng == 0 || eng->caseSensitive() != caseSensitive ) {
- delete eng;
- } else {
- eng->ref();
- return;
- }
- }
- }
-#endif // TQT_NO_REGEXP_OPTIM
- eng = new TQRegExpEngine( pattern, caseSensitive );
- return;
- }
-
- if ( eng->deref() ) {
-#ifndef TQT_NO_REGEXP_OPTIM
-# ifdef TQT_THREAD_SUPPORT
- if ( currentThread )
-# endif
- {
- if ( engineCache == 0 ) {
- engineCache = new TQCache<TQRegExpEngine>;
- engineCache->setAutoDelete( TRUE );
-# ifdef TQT_THREAD_SUPPORT
- engineCaches.setLocalData(engineCache);
-# else
- cleanup_cache.set( &engineCache );
-# endif // !TQT_THREAD_SUPPORT
- }
- if ( !pattern.isNull() &&
- engineCache->insert(pattern, eng, 4 + pattern.length() / 4) )
- return;
- }
-#else
- TQ_UNUSED( pattern );
-#endif // TQT_NO_REGEXP_OPTIM
- delete eng;
- eng = 0;
- }
-}
-
-/*!
- \enum TQRegExp::CaretMode
-
- The CaretMode enum defines the different meanings of the caret
- (<b>^</b>) in a regular expression. The possible values are:
-
- \value CaretAtZero
- The caret corresponds to index 0 in the searched string.
-
- \value CaretAtOffset
- The caret corresponds to the start offset of the search.
-
- \value CaretWontMatch
- The caret never matches.
-*/
-
-/*!
- Constructs an empty regexp.
-
- \sa isValid() errorString()
-*/
-TQRegExp::TQRegExp()
- : eng( 0 )
-{
- priv = new TQRegExpPrivate;
-#ifndef TQT_NO_REGEXP_WILDCARD
- priv->wc = FALSE;
-#endif
- priv->min = FALSE;
- priv->cs = TRUE;
-}
-
-/*!
- Constructs a regular expression object for the given \a pattern
- string. The pattern must be given using wildcard notation if \a
- wildcard is TRUE (default is FALSE). The pattern is case
- sensitive, unless \a caseSensitive is FALSE. Matching is greedy
- (maximal), but can be changed by calling setMinimal().
-
- \sa setPattern() setCaseSensitive() setWildcard() setMinimal()
-*/
-TQRegExp::TQRegExp( const TQString& pattern, bool caseSensitive, bool wildcard )
- : eng( 0 )
-{
- priv = new TQRegExpPrivate;
- priv->pattern = pattern;
-#ifndef TQT_NO_REGEXP_WILDCARD
- priv->wc = wildcard;
-#endif
- priv->min = FALSE;
- priv->cs = caseSensitive;
-}
-
-/*!
- Constructs a regular expression as a copy of \a rx.
-
- \sa operator=()
-*/
-TQRegExp::TQRegExp( const TQRegExp& rx )
- : eng( 0 )
-{
- priv = new TQRegExpPrivate;
- operator=( rx );
-}
-
-/*!
- Destroys the regular expression and cleans up its internal data.
-*/
-TQRegExp::~TQRegExp()
-{
- invalidateEngine();
- delete priv;
-}
-
-/*!
- Copies the regular expression \a rx and returns a reference to the
- copy. The case sensitivity, wildcard and minimal matching options
- are also copied.
-*/
-TQRegExp& TQRegExp::operator=( const TQRegExp& rx )
-{
- TQRegExpEngine *otherEng = rx.eng;
- if ( otherEng != 0 )
- otherEng->ref();
- invalidateEngine();
- eng = otherEng;
- priv->pattern = rx.priv->pattern;
- priv->rxpattern = rx.priv->rxpattern;
-#ifndef TQT_NO_REGEXP_WILDCARD
- priv->wc = rx.priv->wc;
-#endif
- priv->min = rx.priv->min;
- priv->cs = rx.priv->cs;
-#ifndef TQT_NO_REGEXP_CAPTURE
- priv->t = rx.priv->t;
- priv->capturedCache = rx.priv->capturedCache;
-#endif
- priv->captured = rx.priv->captured;
- return *this;
-}
-
-/*!
- Returns TRUE if this regular expression is equal to \a rx;
- otherwise returns FALSE.
-
- Two TQRegExp objects are equal if they have the same pattern
- strings and the same settings for case sensitivity, wildcard and
- minimal matching.
-*/
-bool TQRegExp::operator==( const TQRegExp& rx ) const
-{
- return priv->pattern == rx.priv->pattern &&
-#ifndef TQT_NO_REGEXP_WILDCARD
- priv->wc == rx.priv->wc &&
-#endif
- priv->min == rx.priv->min &&
- priv->cs == rx.priv->cs;
-}
-
-/*!
- \fn bool TQRegExp::operator!=( const TQRegExp& rx ) const
-
- Returns TRUE if this regular expression is not equal to \a rx;
- otherwise returns FALSE.
-
- \sa operator==()
-*/
-
-/*!
- Returns TRUE if the pattern string is empty; otherwise returns
- FALSE.
-
- If you call exactMatch() with an empty pattern on an empty string
- it will return TRUE; otherwise it returns FALSE since it operates
- over the whole string. If you call search() with an empty pattern
- on \e any string it will return the start offset (0 by default)
- because the empty pattern matches the 'emptiness' at the start of
- the string. In this case the length of the match returned by
- matchedLength() will be 0.
-
- See TQString::isEmpty().
-*/
-
-bool TQRegExp::isEmpty() const
-{
- return priv->pattern.isEmpty();
-}
-
-/*!
- Returns TRUE if the regular expression is valid; otherwise returns
- FALSE. An invalid regular expression never matches.
-
- The pattern <b>[a-z</b> is an example of an invalid pattern, since
- it lacks a closing square bracket.
-
- Note that the validity of a regexp may also depend on the setting
- of the wildcard flag, for example <b>*.html</b> is a valid
- wildcard regexp but an invalid full regexp.
-
- \sa errorString()
-*/
-bool TQRegExp::isValid() const
-{
- if ( priv->pattern.isEmpty() ) {
- return TRUE;
- } else {
- prepareEngine();
- return eng->isValid();
- }
-}
-
-/*!
- Returns the pattern string of the regular expression. The pattern
- has either regular expression syntax or wildcard syntax, depending
- on wildcard().
-
- \sa setPattern()
-*/
-TQString TQRegExp::pattern() const
-{
- return priv->pattern;
-}
-
-/*!
- Sets the pattern string to \a pattern. The case sensitivity,
- wildcard and minimal matching options are not changed.
-
- \sa pattern()
-*/
-void TQRegExp::setPattern( const TQString& pattern )
-{
- if ( priv->pattern != pattern ) {
- priv->pattern = pattern;
- invalidateEngine();
- }
-}
-
-/*!
- Returns TRUE if case sensitivity is enabled; otherwise returns
- FALSE. The default is TRUE.
-
- \sa setCaseSensitive()
-*/
-bool TQRegExp::caseSensitive() const
-{
- return priv->cs;
-}
-
-/*!
- Sets case sensitive matching to \a sensitive.
-
- If \a sensitive is TRUE, <b>\\.txt$</b> matches \c{readme.txt} but
- not \c{README.TXT}.
-
- \sa caseSensitive()
-*/
-void TQRegExp::setCaseSensitive( bool sensitive )
-{
- if ( sensitive != priv->cs ) {
- priv->cs = sensitive;
- invalidateEngine();
- }
-}
-
-#ifndef TQT_NO_REGEXP_WILDCARD
-/*!
- Returns TRUE if wildcard mode is enabled; otherwise returns FALSE.
- The default is FALSE.
-
- \sa setWildcard()
-*/
-bool TQRegExp::wildcard() const
-{
- return priv->wc;
-}
-
-/*!
- Sets the wildcard mode for the regular expression. The default is
- FALSE.
-
- Setting \a wildcard to TRUE enables simple shell-like wildcard
- matching. (See \link #wildcard-matching wildcard matching
- (globbing) \endlink.)
-
- For example, <b>r*.txt</b> matches the string \c{readme.txt} in
- wildcard mode, but does not match \c{readme}.
-
- \sa wildcard()
-*/
-void TQRegExp::setWildcard( bool wildcard )
-{
- if ( wildcard != priv->wc ) {
- priv->wc = wildcard;
- invalidateEngine();
- }
-}
-#endif
-
-/*!
- Returns TRUE if minimal (non-greedy) matching is enabled;
- otherwise returns FALSE.
-
- \sa setMinimal()
-*/
-bool TQRegExp::minimal() const
-{
- return priv->min;
-}
-
-/*!
- Enables or disables minimal matching. If \a minimal is FALSE,
- matching is greedy (maximal) which is the default.
-
- For example, suppose we have the input string "We must be
- \<b>bold\</b>, very \<b>bold\</b>!" and the pattern
- <b>\<b>.*\</b></b>. With the default greedy (maximal) matching,
- the match is "We must be <u>\<b>bold\</b>, very
- \<b>bold\</b></u>!". But with minimal (non-greedy) matching the
- first match is: "We must be <u>\<b>bold\</b></u>, very
- \<b>bold\</b>!" and the second match is "We must be \<b>bold\</b>,
- very <u>\<b>bold\</b></u>!". In practice we might use the pattern
- <b>\<b>[^\<]+\</b></b> instead, although this will still fail for
- nested tags.
-
- \sa minimal()
-*/
-void TQRegExp::setMinimal( bool minimal )
-{
- priv->min = minimal;
-}
-
-/*!
- Returns TRUE if \a str is matched exactly by this regular
- expression; otherwise returns FALSE. You can determine how much of
- the string was matched by calling matchedLength().
-
- For a given regexp string, R, exactMatch("R") is the equivalent of
- search("^R$") since exactMatch() effectively encloses the regexp
- in the start of string and end of string anchors, except that it
- sets matchedLength() differently.
-
- For example, if the regular expression is <b>blue</b>, then
- exactMatch() returns TRUE only for input \c blue. For inputs \c
- bluebell, \c blutak and \c lightblue, exactMatch() returns FALSE
- and matchedLength() will return 4, 3 and 0 respectively.
-
- Although const, this function sets matchedLength(),
- capturedTexts() and pos().
-
- \sa search() searchRev() TQRegExpValidator
-*/
-bool TQRegExp::exactMatch( const TQString& str ) const
-{
- prepareEngineForMatch( str );
- eng->match( str, 0, priv->min, TRUE, 0, priv->captured );
- if ( priv->captured[1] == (int) str.length() ) {
- return TRUE;
- } else {
- priv->captured[0] = 0;
- priv->captured[1] = eng->partialMatchLength();
- return FALSE;
- }
-}
-
-#ifndef TQT_NO_COMPAT
-/*! \obsolete
-
- Attempts to match in \a str, starting from position \a index.
- Returns the position of the match, or -1 if there was no match.
-
- The length of the match is stored in \a *len, unless \a len is a
- null pointer.
-
- If \a indexIsStart is TRUE (the default), the position \a index in
- the string will match the start of string anchor, <b>^</b>, in the
- regexp, if present. Otherwise, position 0 in \a str will match.
-
- Use search() and matchedLength() instead of this function.
-
- \sa TQString::mid() TQConstString
-*/
-int TQRegExp::match( const TQString& str, int index, int *len,
- bool indexIsStart ) const
-{
- int pos = search( str, index, indexIsStart ? CaretAtOffset : CaretAtZero );
- if ( len != 0 )
- *len = matchedLength();
- return pos;
-}
-#endif // TQT_NO_COMPAT
-
-int TQRegExp::search( const TQString& str, int offset ) const
-{
- return search( str, offset, CaretAtZero );
-}
-
-/*!
- Attempts to find a match in \a str from position \a offset (0 by
- default). If \a offset is -1, the search starts at the last
- character; if -2, at the next to last character; etc.
-
- Returns the position of the first match, or -1 if there was no
- match.
-
- The \a caretMode parameter can be used to instruct whether <b>^</b>
- should match at index 0 or at \a offset.
-
- You might prefer to use TQString::find(), TQString::contains() or
- even TQStringList::grep(). To replace matches use
- TQString::replace().
-
- Example:
- \code
- TQString str = "offsets: 1.23 .50 71.00 6.00";
- TQRegExp rx( "\\d*\\.\\d+" ); // primitive floating point matching
- int count = 0;
- int pos = 0;
- while ( (pos = rx.search(str, pos)) != -1 ) {
- count++;
- pos += rx.matchedLength();
- }
- // pos will be 9, 14, 18 and finally 24; count will end up as 4
- \endcode
-
- Although const, this function sets matchedLength(),
- capturedTexts() and pos().
-
- \sa searchRev() exactMatch()
-*/
-
-int TQRegExp::search( const TQString& str, int offset, CaretMode caretMode ) const
-{
- prepareEngineForMatch( str );
- if ( offset < 0 )
- offset += str.length();
- eng->match( str, offset, priv->min, FALSE, caretIndex(offset, caretMode),
- priv->captured );
- return priv->captured[0];
-}
-
-
-int TQRegExp::searchRev( const TQString& str, int offset ) const
-{
- return searchRev( str, offset, CaretAtZero );
-}
-
-/*!
- Attempts to find a match backwards in \a str from position \a
- offset. If \a offset is -1 (the default), the search starts at the
- last character; if -2, at the next to last character; etc.
-
- Returns the position of the first match, or -1 if there was no
- match.
-
- The \a caretMode parameter can be used to instruct whether <b>^</b>
- should match at index 0 or at \a offset.
-
- Although const, this function sets matchedLength(),
- capturedTexts() and pos().
-
- \warning Searching backwards is much slower than searching
- forwards.
-
- \sa search() exactMatch()
-*/
-
-int TQRegExp::searchRev( const TQString& str, int offset,
- CaretMode caretMode ) const
-{
- prepareEngineForMatch( str );
- if ( offset < 0 )
- offset += str.length();
- if ( offset < 0 || offset > (int) str.length() ) {
- priv->captured.detach();
- priv->captured.fill( -1 );
- return -1;
- }
-
- while ( offset >= 0 ) {
- eng->match( str, offset, priv->min, TRUE, caretIndex(offset, caretMode),
- priv->captured );
- if ( priv->captured[0] == offset )
- return offset;
- offset--;
- }
- return -1;
-}
-
-/*!
- Returns the length of the last matched string, or -1 if there was
- no match.
-
- \sa exactMatch() search() searchRev()
-*/
-int TQRegExp::matchedLength() const
-{
- return priv->captured[1];
-}
-
-#ifndef TQT_NO_REGEXP_CAPTURE
-/*!
- Returns the number of captures contained in the regular expression.
- */
-int TQRegExp::numCaptures() const
-{
- prepareEngine();
- return eng->numCaptures();
-}
-
-/*!
- Returns a list of the captured text strings.
-
- The first string in the list is the entire matched string. Each
- subsequent list element contains a string that matched a
- (capturing) subexpression of the regexp.
-
- For example:
- \code
- TQRegExp rx( "(\\d+)(\\s*)(cm|inch(es)?)" );
- int pos = rx.search( "Length: 36 inches" );
- TQStringList list = rx.capturedTexts();
- // list is now ( "36 inches", "36", " ", "inches", "es" )
- \endcode
-
- The above example also captures elements that may be present but
- which we have no interest in. This problem can be solved by using
- non-capturing parentheses:
-
- \code
- TQRegExp rx( "(\\d+)(?:\\s*)(cm|inch(?:es)?)" );
- int pos = rx.search( "Length: 36 inches" );
- TQStringList list = rx.capturedTexts();
- // list is now ( "36 inches", "36", "inches" )
- \endcode
-
- Note that if you want to iterate over the list, you should iterate
- over a copy, e.g.
- \code
- TQStringList list = rx.capturedTexts();
- TQStringList::Iterator it = list.begin();
- while( it != list.end() ) {
- myProcessing( *it );
- ++it;
- }
- \endcode
-
- Some regexps can match an indeterminate number of times. For
- example if the input string is "Offsets: 12 14 99 231 7" and the
- regexp, \c{rx}, is <b>(\\d+)+</b>, we would hope to get a list of
- all the numbers matched. However, after calling
- \c{rx.search(str)}, capturedTexts() will return the list ( "12",
- "12" ), i.e. the entire match was "12" and the first subexpression
- matched was "12". The correct approach is to use cap() in a \link
- #cap_in_a_loop loop \endlink.
-
- The order of elements in the string list is as follows. The first
- element is the entire matching string. Each subsequent element
- corresponds to the next capturing open left parentheses. Thus
- capturedTexts()[1] is the text of the first capturing parentheses,
- capturedTexts()[2] is the text of the second and so on
- (corresponding to $1, $2, etc., in some other regexp languages).
-
- \sa cap() pos() exactMatch() search() searchRev()
-*/
-TQStringList TQRegExp::capturedTexts()
-{
- if ( priv->capturedCache.isEmpty() ) {
- for ( int i = 0; i < (int) priv->captured.size(); i += 2 ) {
- TQString m;
- if ( priv->captured[i + 1] == 0 )
- m = TQString::tqfromLatin1( "" );
- else if ( priv->captured[i] >= 0 )
- m = priv->t.mid( priv->captured[i],
- priv->captured[i + 1] );
- priv->capturedCache.append( m );
- }
- priv->t = TQString::null;
- }
- return priv->capturedCache;
-}
-
-/*!
- Returns the text captured by the \a nth subexpression. The entire
- match has index 0 and the parenthesized subexpressions have
- indices starting from 1 (excluding non-capturing parentheses).
-
- \code
- TQRegExp rxlen( "(\\d+)(?:\\s*)(cm|inch)" );
- int pos = rxlen.search( "Length: 189cm" );
- if ( pos > -1 ) {
- TQString value = rxlen.cap( 1 ); // "189"
- TQString unit = rxlen.cap( 2 ); // "cm"
- // ...
- }
- \endcode
-
- The order of elements matched by cap() is as follows. The first
- element, cap(0), is the entire matching string. Each subsequent
- element corresponds to the next capturing open left parentheses.
- Thus cap(1) is the text of the first capturing parentheses, cap(2)
- is the text of the second, and so on.
-
- \target cap_in_a_loop
- Some patterns may lead to a number of matches which cannot be
- determined in advance, for example:
-
- \code
- TQRegExp rx( "(\\d+)" );
- str = "Offsets: 12 14 99 231 7";
- TQStringList list;
- pos = 0;
- while ( pos >= 0 ) {
- pos = rx.search( str, pos );
- if ( pos > -1 ) {
- list += rx.cap( 1 );
- pos += rx.matchedLength();
- }
- }
- // list contains "12", "14", "99", "231", "7"
- \endcode
-
- \sa capturedTexts() pos() exactMatch() search() searchRev()
-*/
-TQString TQRegExp::cap( int nth )
-{
- if ( nth < 0 || nth >= (int) priv->captured.size() / 2 ) {
- return TQString::null;
- } else {
- return capturedTexts()[nth];
- }
-}
-
-/*!
- Returns the position of the \a nth captured text in the searched
- string. If \a nth is 0 (the default), pos() returns the position
- of the whole match.
-
- Example:
- \code
- TQRegExp rx( "/([a-z]+)/([a-z]+)" );
- rx.search( "Output /dev/null" ); // returns 7 (position of /dev/null)
- rx.pos( 0 ); // returns 7 (position of /dev/null)
- rx.pos( 1 ); // returns 8 (position of dev)
- rx.pos( 2 ); // returns 12 (position of null)
- \endcode
-
- For zero-length matches, pos() always returns -1. (For example, if
- cap(4) would return an empty string, pos(4) returns -1.) This is
- due to an implementation tradeoff.
-
- \sa capturedTexts() exactMatch() search() searchRev()
-*/
-int TQRegExp::pos( int nth )
-{
- if ( nth < 0 || nth >= (int) priv->captured.size() / 2 )
- return -1;
- else
- return priv->captured[2 * nth];
-}
-
-/*!
- Returns a text string that explains why a regexp pattern is
- invalid the case being; otherwise returns "no error occurred".
-
- \sa isValid()
-*/
-TQString TQRegExp::errorString()
-{
- if ( isValid() ) {
- return TQString( RXERR_OK );
- } else {
- return eng->errorString();
- }
-}
-#endif
-
-/*!
- Returns the string \a str with every regexp special character
- escaped with a backslash. The special characters are $, (, ), *, +,
- ., ?, [, \, ], ^, {, | and }.
-
- Example:
- \code
- s1 = TQRegExp::escape( "bingo" ); // s1 == "bingo"
- s2 = TQRegExp::escape( "f(x)" ); // s2 == "f\\(x\\)"
- \endcode
-
- This function is useful to construct regexp patterns dynamically:
-
- \code
- TQRegExp rx( "(" + TQRegExp::escape(name) +
- "|" + TQRegExp::escape(alias) + ")" );
- \endcode
-*/
-TQString TQRegExp::escape( const TQString& str )
-{
- static const char meta[] = "$()*+.?[\\]^{|}";
- TQString quoted = str;
- int i = 0;
-
- while ( i < (int) quoted.length() ) {
- if ( strchr(meta, quoted[i].latin1()) != 0 )
- quoted.insert( i++, "\\" );
- i++;
- }
- return quoted;
-}
-
-void TQRegExp::prepareEngine() const
-{
- if ( eng == 0 ) {
-#ifndef TQT_NO_REGEXP_WILDCARD
- if ( priv->wc )
- priv->rxpattern = wc2rx( priv->pattern );
- else
-#endif
- priv->rxpattern = priv->pattern.isNull() ? TQString::tqfromLatin1( "" )
- : priv->pattern;
- TQRegExp *that = (TQRegExp *) this;
- // that->eng = newEngine( priv->rxpattern, priv->cs );
- regexpEngine( that->eng, priv->rxpattern, priv->cs, FALSE );
- priv->captured.detach();
- priv->captured.fill( -1, 2 + 2 * eng->numCaptures() );
- }
-}
-
-void TQRegExp::prepareEngineForMatch( const TQString& str ) const
-{
- prepareEngine();
-#ifndef TQT_NO_REGEXP_CAPTURE
- priv->t = str;
- priv->capturedCache.clear();
-#else
- TQ_UNUSED( str );
-#endif
-}
-
-void TQRegExp::invalidateEngine()
-{
- if ( eng != 0 ) {
- regexpEngine( eng, priv->rxpattern, priv->cs, TRUE );
- priv->rxpattern = TQString();
- eng = 0;
- }
-}
-
-int TQRegExp::caretIndex( int offset, CaretMode caretMode )
-{
- if ( caretMode == CaretAtZero ) {
- return 0;
- } else if ( caretMode == CaretAtOffset ) {
- return offset;
- } else { // CaretWontMatch
- return -1;
- }
-}
-
-#endif // USE_QT4
-
-#endif // TQT_NO_REGEXP