summary refs log tree commit diff stats
path: root/kjs/regexp.cpp
diff options
context:
space:
mode:
author Michele Calgaro <[email protected]> 2024-08-17 22:26:29 +0900
committer Michele Calgaro <[email protected]> 2024-08-18 20:14:02 +0900
commit 7740e825a683a9cc84f8422c94109c5fcc4beb8e (patch)
tree 0f4cb4d307e3536232cea590e607d14f9edb5e76 /kjs/regexp.cpp
parent b59d51c67903335d27ada24d51be77137f664cb3 (diff)
download tdelibs-7740e825a683a9cc84f8422c94109c5fcc4beb8e.tar.gz
tdelibs-7740e825a683a9cc84f8422c94109c5fcc4beb8e.zip
kjs: use libpcre2 instead of libpcre
Signed-off-by: Michele Calgaro <[email protected]>
Diffstat (limited to 'kjs/regexp.cpp')
-rw-r--r-- kjs/regexp.cpp 229
1 files changed, 105 insertions, 124 deletions
diff --git a/kjs/regexp.cpp b/kjs/regexp.cpp
index 0c2675588..a693fdc1a 100644
--- a/kjs/regexp.cpp
+++ b/kjs/regexp.cpp
@@ -30,21 +30,17 @@
using namespace KJS;
-#ifdef PCRE_CONFIG_UTF8
RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
-#endif
RegExp::RegExp(const UString &p, int f)
: pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0)
{
// Determine whether libpcre has unicode support if need be..
-#ifdef PCRE_CONFIG_UTF8
if (utf8Support == Unknown) {
- int supported;
- pcre_config(PCRE_CONFIG_UTF8, (void*)&supported);
- utf8Support = supported ? Supported : Unsupported;
+ uint32_t supported;
+ pcre2_config(PCRE2_CONFIG_COMPILED_WIDTHS, (void*)&supported);
+ utf8Support = (supported & 0x0001) ? Supported : Unsupported;
}
-#endif
nrSubPatterns = 0; // determined in match() with POSIX regex.
@@ -63,33 +59,33 @@ RegExp::RegExp(const UString &p, int f)
escape = false;
// we only care about \u
if (c == 'u') {
- // standard unicode escape sequence looks like \uxxxx but
- // other browsers also accept less then 4 hex digits
- unsigned short u = 0;
- int j = 0;
- for (j = 0; j < 4; ++j) {
- if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) {
- u = (u << 4) + Lexer::convertHex(p[i + 1].unicode());
- ++i;
- } else {
- // sequence incomplete. restore index.
- // TODO: cleaner way to propagate warning
- fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j);
- i -= j;
- break;
- }
- }
- if (j < 4) {
- // sequence was incomplete. treat \u as u which IE always
- // and FF sometimes does.
- intern.append(UString('u'));
- } else {
+ // standard unicode escape sequence looks like \uxxxx but
+ // other browsers also accept fewer than 4 hex digits
+ unsigned short u = 0;
+ int j = 0;
+ for (j = 0; j < 4; ++j) {
+ if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) {
+ u = (u << 4) + Lexer::convertHex(p[i + 1].unicode());
+ ++i;
+ } else {
+ // sequence incomplete. restore index.
+ // TODO: cleaner way to propagate warning
+ fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j);
+ i -= j;
+ break;
+ }
+ }
+ if (j < 4) {
+ // sequence was incomplete. treat \u as u which IE always
+ // and FF sometimes does.
+ intern.append(UString('u'));
+ } else {
c = UChar(u);
switch (u) {
case 0:
- // Make sure to encode 0, to avoid terminating the string
- intern += UString(nil);
- break;
+ // Make sure to encode 0, to avoid terminating the string
+ intern += UString(nil);
+ break;
case '^':
case '$':
case '\\':
@@ -101,13 +97,13 @@ RegExp::RegExp(const UString &p, int f)
case '{': case '}':
case '[': case ']':
case '|':
- // escape pattern characters have to remain escaped
- intern.append(UString('\\'));
- // intentional fallthrough
+ // escape pattern characters have to remain escaped
+ intern.append(UString('\\'));
+ // intentional fallthrough
default:
- intern += UString(&c, 1);
- break;
- }
+ intern += UString(&c, 1);
+ break;
+ }
}
continue;
}
@@ -126,45 +122,46 @@ RegExp::RegExp(const UString &p, int f)
intern = p;
}
-#ifdef HAVE_PCREPOSIX
- int pcreflags = 0;
- const char *perrormsg;
- int errorOffset;
+#ifdef HAVE_PCRE2POSIX
+ uint32_t pcre2flags = 0;
+ int errorCode;
+ PCRE2_SIZE errorOffset;
if (flgs & IgnoreCase)
- pcreflags |= PCRE_CASELESS;
+ pcre2flags |= PCRE2_CASELESS;
if (flgs & Multiline)
- pcreflags |= PCRE_MULTILINE;
+ pcre2flags |= PCRE2_MULTILINE;
-#ifdef PCRE_CONFIG_UTF8
if (utf8Support == Supported)
- pcreflags |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK);
-#endif
+ pcre2flags |= (PCRE2_UTF | PCRE2_NO_UTF_CHECK);
// Fill our buffer with an encoded version, whether utf-8, or,
// if PCRE is incapable, truncated.
prepareMatch(intern);
- pcregex = pcre_compile(buffer, pcreflags,
- &perrormsg, &errorOffset, NULL);
+ pcregex = pcre2_compile(buffer, PCRE2_ZERO_TERMINATED, pcre2flags,
+ &errorCode, &errorOffset, NULL);
doneMatch(); // Cleanup buffers
if (!pcregex) {
#ifndef NDEBUG
- fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg);
+ PCRE2_UCHAR errorMsg[256];
+ pcre2_get_error_message(errorCode, errorMsg, sizeof(errorMsg));
+ fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", errorMsg);
#endif
valid = false;
return;
}
-#ifdef PCRE_INFO_CAPTURECOUNT
// Get number of subpatterns that will be returned
- int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns);
+ int rc = pcre2_pattern_info(pcregex, PCRE2_INFO_CAPTURECOUNT, &nrSubPatterns);
if (rc != 0)
-#endif
+ {
nrSubPatterns = 0; // fallback. We always need the first pair of offsets.
+ }
-#else /* HAVE_PCREPOSIX */
+ match_data = pcre2_match_data_create_from_pattern(pcregex, NULL);
+#else
int regflags = 0;
#ifdef REG_EXTENDED
@@ -195,9 +192,15 @@ RegExp::RegExp(const UString &p, int f)
RegExp::~RegExp()
{
doneMatch(); // Be 100% sure buffers are freed
-#ifdef HAVE_PCREPOSIX
+#ifdef HAVE_PCRE2POSIX
+ if (match_data)
+ {
+ pcre2_match_data_free(match_data);
+ }
if (pcregex)
- pcre_free(pcregex);
+ {
+ pcre2_code_free(pcregex);
+ }
#else
/* TODO: is this really okay after an error ? */
regfree(&preg);
@@ -208,7 +211,7 @@ void RegExp::prepareUtf8(const UString& s)
{
// Allocate a buffer big enough to hold all the characters plus \0
const int length = s.size();
- buffer = new char[length * 3 + 1];
+ buffer = new buftype_t[length * 3 + 1];
// Also create buffer for positions. We need one extra character in there,
// even past the \0 since the non-empty handling may jump one past the end
@@ -217,7 +220,7 @@ void RegExp::prepareUtf8(const UString& s)
// Convert to runs of 8-bit characters, and generate indeces
// Note that we do NOT combine surrogate pairs here, as
// regexps operate on them as separate characters
- char *p = buffer;
+ buftype_t *p = buffer;
int *posOut = originalPos;
const UChar *d = s.data();
for (int i = 0; i != length; ++i) {
@@ -225,16 +228,16 @@ void RegExp::prepareUtf8(const UString& s)
int sequenceLen;
if (c < 0x80) {
- *p++ = (char)c;
+ *p++ = (buftype_t)c;
sequenceLen = 1;
} else if (c < 0x800) {
- *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
- *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
+ *p++ = (buftype_t)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
+ *p++ = (buftype_t)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
sequenceLen = 2;
} else {
- *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
- *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
- *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
+ *p++ = (buftype_t)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
+ *p++ = (buftype_t)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
+ *p++ = (buftype_t)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
sequenceLen = 3;
}
@@ -262,7 +265,7 @@ void RegExp::prepareASCII (const UString& s)
// when we don't have utf 8 available -- use
// truncated version, and pray for the best
CString truncated = s.cstring();
- buffer = new char[truncated.size() + 1];
+ buffer = new buftype_t[truncated.size() + 1];
memcpy(buffer, truncated.c_str(), truncated.size());
buffer[truncated.size()] = '\0'; // For _compile use
bufferSize = truncated.size();
@@ -272,11 +275,9 @@ void RegExp::prepareMatch(const UString &s)
{
delete[] originalPos; // Just to be sure..
delete[] buffer;
-#ifdef PCRE_CONFIG_UTF8
if (utf8Support == Supported)
prepareUtf8(s);
else
-#endif
prepareASCII(s);
#ifndef NDEBUG
@@ -308,17 +309,16 @@ UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
if (i > s.size() || s.isNull())
return UString::null;
-#ifdef HAVE_PCREPOSIX
- int ovecsize = (nrSubPatterns+1)*3; // see pcre docu
- if (ovector) *ovector = new int[ovecsize];
- if (!pcregex)
+#ifdef HAVE_PCRE2POSIX
+ if (!pcregex || !match_data)
+ return UString::null;
+ if (!ovector)
return UString::null;
int startPos;
int nextPos;
-
-#ifdef PCRE_CONFIG_UTF8
- if (utf8Support == Supported) {
+ if (utf8Support == Supported)
+ {
startPos = i;
while (originalPos[startPos] < i)
++startPos;
@@ -328,53 +328,59 @@ UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
while (originalPos[nextPos] < (i + 1))
++nextPos;
}
- } else
-#endif
+ }
+ else
{
startPos = i;
nextPos = i + (i < s.size() ? 1 : 0);
}
- int baseFlags =
-#ifdef PCRE_CONFIG_UTF8
- utf8Support == Supported ? PCRE_NO_UTF8_CHECK :
-#endif
- 0;
- int numMatches = pcre_exec(pcregex, NULL, buffer, bufferSize, startPos,
- m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED | baseFlags) : baseFlags, // see man pcretest
- ovector ? *ovector : 0L, ovecsize);
- if (numMatches < 0)
+ uint32_t baseFlags = (utf8Support == Supported ? PCRE2_NO_UTF_CHECK : 0);
+ if (m_notEmpty)
+ {
+ baseFlags |= PCRE2_NOTEMPTY | PCRE2_ANCHORED;
+ }
+ int numMatches = pcre2_match(pcregex, buffer, PCRE2_ZERO_TERMINATED, startPos, baseFlags, match_data, NULL);
+ if (numMatches <= 0)
{
// Failed to match.
- if (numMatches == PCRE_ERROR_NOMATCH && (flgs & Global) && m_notEmpty && ovector && startPos < nextPos)
+ if (numMatches == PCRE2_ERROR_NOMATCH && (flgs & Global) && m_notEmpty && startPos < nextPos)
{
// We set m_notEmpty ourselves, to look for a non-empty match
- // (see man pcretest or pcretest.c for details).
// So we don't stop here, we want to try again at i+1.
#ifdef KJS_VERBOSE
fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n");
#endif
m_notEmpty = 0;
- numMatches = pcre_exec(pcregex, NULL, buffer, bufferSize, nextPos, baseFlags,
- ovector ? *ovector : 0L, ovecsize);
- if (numMatches < 0)
+ baseFlags = (utf8Support == Supported ? PCRE2_NO_UTF_CHECK : 0);
+ numMatches = pcre2_match(pcregex, buffer, PCRE2_ZERO_TERMINATED, nextPos, baseFlags, match_data, NULL);
+ if (numMatches <= 0)
return UString::null;
}
- else // done
+ else
return UString::null;
}
- // Got a match, proceed with it.
- // But fix up the ovector if need be..
- if (ovector && originalPos) {
- for (unsigned c = 0; c < 2 * TQMIN((unsigned)numMatches, nrSubPatterns+1); ++c) {
- if ((*ovector)[c] != -1)
- (*ovector)[c] = originalPos[(*ovector)[c]];
+ PCRE2_SIZE *pcre2_ovector = pcre2_get_ovector_pointer(match_data);
+ if (!pcre2_ovector)
+ return UString::null;
+
+ uint32_t pcre2_ovecCount = pcre2_get_ovector_count(match_data);
+ *ovector = new int[pcre2_ovecCount * 2];
+ if (originalPos)
+ {
+ for (size_t c = 0; c < 2 * pcre2_ovecCount; ++c)
+ {
+ (*ovector)[c] = (pcre2_ovector[c] != -1) ? originalPos[pcre2_ovector[c]] : -1;
+ }
+ }
+ else
+ {
+ for (size_t c = 0; c < 2 * pcre2_ovecCount; ++c)
+ {
+ (*ovector)[c] = pcre2_ovector[c];
}
}
-
- if (!ovector)
- return UString::null; // don't rely on the return value if you pass ovector==0
#else
const uint maxMatch = 10;
regmatch_t rmatch[maxMatch];
@@ -419,28 +425,3 @@ UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
}
return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
}
-
-#if 0 // unused
-bool RegExp::test(const UString &s, int)
-{
-#ifdef HAVE_PCREPOSIX
- int ovector[300];
- CString buffer(s.cstring());
-
- if (s.isNull() ||
- pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), 0,
- 0, ovector, 300) == PCRE_ERROR_NOMATCH)
- return false;
- else
- return true;
-
-#else
-
- char *str = strdup(s.ascii());
- int r = regexec(&preg, str, 0, 0, 0);
- free(str);
-
- return r == 0;
-#endif
-}
-#endif