summaryrefslogtreecommitdiffstats
path: root/libksieve/parser
diff options
context:
space:
mode:
Diffstat (limited to 'libksieve/parser')
-rw-r--r--libksieve/parser/Makefile.am12
-rw-r--r--libksieve/parser/lexer.cpp666
-rw-r--r--libksieve/parser/parser.cpp651
-rw-r--r--libksieve/parser/utf8validator.cpp141
4 files changed, 1470 insertions, 0 deletions
diff --git a/libksieve/parser/Makefile.am b/libksieve/parser/Makefile.am
new file mode 100644
index 000000000..044d045cf
--- /dev/null
+++ b/libksieve/parser/Makefile.am
@@ -0,0 +1,12 @@
+# final breaks static use:
+# If you feel like "fixing" it, better talk to [email protected] first :)
+KDE_OPTIONS = nofinal
+
+INCLUDES = -I$(top_srcdir)/libksieve $(all_includes)
+
+noinst_LTLIBRARIES = libksieve_parser.la
+
+libksieve_parser_la_SOURCES = utf8validator.cpp lexer.cpp parser.cpp
+libksieve_parser_la_LIBADD = ../shared/libksieve_shared.la
+libksieve_parser_la_LDFLAGS = $(all_libraries) -no-undefined
+
diff --git a/libksieve/parser/lexer.cpp b/libksieve/parser/lexer.cpp
new file mode 100644
index 000000000..d8b76da71
--- /dev/null
+++ b/libksieve/parser/lexer.cpp
@@ -0,0 +1,666 @@
+/* -*- c++ -*-
+ parser/lexer.cpp
+
+ This file is part of KSieve,
+ the KDE internet mail/usenet news message filtering library.
+ Copyright (c) 2002-2003 Marc Mutz <[email protected]>
+
+ KSieve is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License, version 2, as
+ published by the Free Software Foundation.
+
+ KSieve is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+ In addition, as a special exception, the copyright holders give
+ permission to link the code of this program with any edition of
+ the Qt library by Trolltech AS, Norway (or with modified versions
+ of Qt that use the same license as Qt), and distribute linked
+ combinations including the two. You must obey the GNU General
+ Public License in all respects for all of the code used other than
+ Qt. If you modify this file, you may extend this exception to
+ your version of the file, but you are not obligated to do so. If
+ you do not wish to do so, delete this exception statement from
+ your version.
+*/
+
+#include <config.h>
+
+#include <ksieve/lexer.h>
+#include <impl/lexer.h>
+
+#include <impl/utf8validator.h>
+#include <ksieve/error.h>
+
+#include <qstring.h>
+#include <qstringlist.h>
+#include <qtextcodec.h>
+
+#include <memory> // std::auto_ptr
+
+#include <assert.h>
+#include <ctype.h> // isdigit
+
+// STR_DIM(x) yields sizeof(x) - 1. It is used below on string literals
+// such as "text:", where that is the literal's length without the
+// terminating NUL. Any earlier definition of the macro is discarded first.
+#ifdef STR_DIM
+# undef STR_DIM
+#endif
+#define STR_DIM(x) (sizeof(x) - 1)
+
+namespace KSieve {
+
+  //
+  //
+  // Lexer Bridge implementation
+  //
+  // Each public Lexer method checks that the private implementation
+  // object i exists and then forwards the call to it unchanged.
+  //
+  //
+
+  Lexer::Lexer( const char * scursor, const char * send, int options )
+    : i( new Impl( scursor, send, options ) )
+  {
+  }
+
+  Lexer::~Lexer() {
+    delete i;
+    i = 0;
+  }
+
+  bool Lexer::ignoreComments() const {
+    assert( i != 0 );
+    return i->ignoreComments();
+  }
+
+  const Error & Lexer::error() const {
+    assert( i != 0 );
+    return i->error();
+  }
+
+  bool Lexer::atEnd() const {
+    assert( i != 0 );
+    return i->atEnd();
+  }
+
+  int Lexer::column() const {
+    assert( i != 0 );
+    return i->column();
+  }
+
+  int Lexer::line() const {
+    assert( i != 0 );
+    return i->line();
+  }
+
+  void Lexer::save() {
+    assert( i != 0 );
+    i->save();
+  }
+
+  void Lexer::restore() {
+    assert( i != 0 );
+    i->restore();
+  }
+
+  Lexer::Token Lexer::nextToken( QString & result ) {
+    assert( i != 0 );
+    return i->nextToken( result );
+  }
+
+} // namespace KSieve
+
+
+// Membership bitmap of the characters allowed in identifiers:
+// none except a-zA-Z0-9_
+// Layout (as read by isOfSet() below): character c is a member when bit
+// (0x80 >> c%8) of byte c/8 is set; each source row covers 32 characters.
+static const unsigned char iTextMap[16] = {
+ 0x00, 0x00, 0x00, 0x00, // CTLs: none
+ 0x00, 0x00, 0xFF, 0xC0, // SP ... '?': 0-9
+ 0x7F, 0xFF, 0xFF, 0xE1, // '@' ... '_': A-Z_
+ 0x7F, 0xFF, 0xFF, 0xE0 // '`' ... DEL: a-z
+};
+
+// Membership bitmap of the delimiter characters:
+// SP, HT, CR, LF, {}[]();,#/
+// Layout (as read by isOfSet() below): character c is a member when bit
+// (0x80 >> c%8) of byte c/8 is set; each source row covers 32 characters.
+// The last byte of rows 3 and 4 must be 0x14, not 0x16: the extra 0x02 bit
+// would additionally mark '^' (resp. '~') as a delimiter, although both are
+// listed in illegalMap and in neither row comment.
+// ### exclude '['? Why would one want to write identifier["foo"]?
+static const unsigned char delimMap[16] = {
+  0x00, 0x64, 0x00, 0x00, // CTLs:        HT, LF, CR
+  0x90, 0xC9, 0x00, 0x10, // SP ... '?':  SP, #(),/;
+  0x00, 0x00, 0x00, 0x14, // '@' ... '_': []
+  0x00, 0x00, 0x00, 0x14  // '`' ... DEL: {}
+};
+
+// Membership bitmap of the characters that are illegal outside of strings
+// and comments:
+// All except iText, delim, "*:
+// Same bit layout as iTextMap and delimMap (see isOfSet() below).
+static const unsigned char illegalMap[16] = {
+ 0xFF, 0x9B, 0xFF, 0xFF, // CTLs: all except HT, LF, CR
+ 0x4F, 0x16, 0x00, 0x0F, // SP ... '?': !$%&' +-. <=>?
+ 0x80, 0x00, 0x00, 0x0A, // '@' ... '_': '@', backslash, '^'
+ 0x80, 0x00, 0x00, 0x0A // '`' ... DEL: '`', '|', '~'
+};
+
+// Tests whether the 7-bit character ch is a member of the 128-bit set map.
+// Byte ch/8 holds the group of eight characters containing ch; within that
+// byte the most significant bit stands for the lowest character code.
+static inline bool isOfSet( const unsigned char map[16], unsigned char ch ) {
+  assert( ch < 128 );
+  const unsigned int byteIndex = ch / 8;
+  const unsigned int bitMask = 0x80u >> ( ch % 8 );
+  return ( map[ byteIndex ] & bitMask ) != 0;
+}
+
+// True if ch is in iTextMap, i.e. may appear in an identifier.
+static inline bool isIText( unsigned char ch ) {
+  if ( ch > 'z' )
+    return false; // no member above 'z'; also keeps the lookup below 128
+  return isOfSet( iTextMap, ch );
+}
+
+// True if ch is in delimMap, i.e. terminates an identifier or number.
+static inline bool isDelim( unsigned char ch ) {
+  if ( ch > '}' )
+    return false; // no lookup above '}'; also keeps the lookup below 128
+  return isOfSet( delimMap, ch );
+}
+
+// True if ch is '~' or above (this includes every 8-bit value), or is
+// marked in illegalMap.
+static inline bool isIllegal( unsigned char ch ) {
+  if ( ch >= '~' )
+    return true; // short-circuit keeps the table lookup below 128
+  return isOfSet( illegalMap, ch );
+}
+
+// True if the high bit of ch is set, i.e. ch is negative as a signed char.
+static inline bool is8Bit( signed char ch ) {
+  return !( ch >= 0 );
+}
+
+// Returns s without one trailing line terminator: "\r\n" if present,
+// otherwise a lone "\n" if present, otherwise s unchanged.
+static QString removeCRLF( const QString & s ) {
+  int chop = 0; // number of characters to drop at the end
+  if ( s.endsWith( "\r\n" ) )
+    chop = 2;
+  else if ( s.endsWith( "\n" ) )
+    chop = 1;
+
+  return s.left( s.length() - chop );
+}
+
+// Undoes dot-stuffing: a line that starts with ".." loses its first dot;
+// every other line is returned unchanged.
+static QString removeDotStuff( const QString & s ) {
+  if ( s.startsWith( ".." ) )
+    return s.mid( 1 );
+  return s;
+}
+
+namespace KSieve {
+
+ //
+ //
+ // Lexer Implementation
+ //
+ //
+
+  // Sets up the lexer over [scursor, send). If one of the two pointers is
+  // null, the other one is used for both ends, and the lexer is then
+  // expected to report atEnd() right away.
+  Lexer::Impl::Impl( const char * scursor, const char * send, int options )
+    : mState( scursor ? scursor : send ),
+      mEnd( send ? send : scursor ),
+      mIgnoreComments( options & IgnoreComments ),
+      mIgnoreLF( options & IgnoreLineFeeds )
+  {
+    assert( ( scursor && send ) || atEnd() );
+  }
+
+  // Reads the next token from the input and stores its text in result.
+  //
+  // Leading whitespace (and, when ignoreComments() is set, comments) is
+  // skipped first. If that skipping crossed line boundaries and line feeds
+  // are not ignored, a LineFeeds token carrying the number of crossed
+  // lines is returned before anything else. Returns None if skipping
+  // failed, if the input is exhausted, or if the next character cannot
+  // start a token. Errors are recorded through makeError(); callers are
+  // expected to check error() in addition to the returned token type.
+  //
+  // Must not be called when atEnd() is already true.
+  Lexer::Token Lexer::Impl::nextToken( QString & result ) {
+    assert( !atEnd() );
+    result = QString::null;
+    //clearErrors();
+
+    const int oldLine = line();
+
+    const bool eatingWSSucceeded = ignoreComments() ? eatCWS() : eatWS() ;
+
+    if ( !ignoreLineFeeds() && oldLine != line() ) {
+      result.setNum( line() - oldLine ); // return number of linefeeds encountered
+      return LineFeeds;
+    }
+
+    if ( !eatingWSSucceeded )
+      return None;
+
+    if ( atEnd() )
+      return None;
+
+    switch ( *mState.cursor ) {
+    case '#': // HashComment
+      assert( !ignoreComments() );
+      ++mState.cursor;
+      if ( !atEnd() )
+        parseHashComment( result, true );
+      return HashComment;
+    case '/': // BracketComment
+      assert( !ignoreComments() );
+      ++mState.cursor; // eat slash
+      if ( atEnd() || *mState.cursor != '*' ) {
+        makeError( Error::SlashWithoutAsterisk );
+        return BracketComment;
+      }
+      ++mState.cursor; // eat asterisk
+      if ( atEnd() ) {
+        makeError( Error::UnfinishedBracketComment );
+        return BracketComment;
+      }
+      parseBracketComment( result, true );
+      return BracketComment;
+    case ':': // Tag
+      ++mState.cursor;
+      if ( atEnd() ) {
+        makeError( Error::UnexpectedCharacter, line(), column() - 1 );
+        return Tag;
+      }
+      if ( !isIText( *mState.cursor ) ) {
+        makeIllegalCharError( *mState.cursor );
+        return Tag;
+      }
+      parseTag( result );
+      return Tag;
+    case '"': // QuotedString
+      ++mState.cursor;
+      parseQuotedString( result );
+      return QuotedString;
+    case '{':
+    case '}':
+    case '[':
+    case ']':
+    case '(':
+    case ')':
+    case ';':
+    case ',': // Special
+      result = *mState.cursor++;
+      return Special;
+    case '0':
+    case '1':
+    case '2':
+    case '3':
+    case '4':
+    case '5':
+    case '6':
+    case '7':
+    case '8':
+    case '9': // Number
+      parseNumber( result );
+      return Number;
+    case 't':
+    case 'T': // maybe MultiLineString, else Identifier
+      // The keyword is compared case-insensitively, so an upper-case 'T'
+      // has to reach this test as well. The length guard keeps the compare
+      // from reading past the end of the buffer when fewer than five
+      // characters are left (the input need not be NUL-terminated).
+      // NOTE(review): _strnicmp is not declared by any header visibly
+      // included in this file - confirm where it comes from.
+      if ( charsLeft() >= int( STR_DIM("text:") ) &&
+           _strnicmp( mState.cursor, "text:", STR_DIM("text:") ) == 0 ) {
+        // MultiLineString
+        mState.cursor += STR_DIM("text:");
+        parseMultiLine( result );
+        // ### FIXME: There can be a hash-comment between "text:"
+        // and CRLF! That should be preserved somehow...
+        return MultiLineString;
+      }
+      // else fall through:
+    default: // Identifier (first must not be 0-9, and can't (caught by Number above))
+      if ( !isIText( *mState.cursor ) ) {
+        makeError( Error::IllegalCharacter );
+        return None;
+      }
+      parseIdentifier( result );
+      return Identifier;
+    }
+  }
+
+  // Skips spaces, tabs and line breaks. Returns false only if a line break
+  // turns out to be malformed (eatCRLF() fails); reaching the end of the
+  // input or a non-whitespace character both count as success.
+  bool Lexer::Impl::eatWS() {
+    for ( ; !atEnd() ; ) {
+      const char ch = *mState.cursor;
+      if ( ch == ' ' || ch == '\t' ) {
+        ++mState.cursor;
+      } else if ( ch == '\r' || ch == '\n' ) {
+        if ( !eatCRLF() )
+          return false;
+      } else {
+        return true; // first non-whitespace character
+      }
+    }
+
+    // at end:
+    return true;
+  }
+
+  // Consumes one line break at the cursor: either CRLF or a lone LF.
+  // A CR that is not followed by LF records Error::CRWithoutLF and makes
+  // the function return false. On success newLine() is called with the
+  // cursor on the LF.
+  bool Lexer::Impl::eatCRLF() {
+    assert( !atEnd() );
+    assert( *mState.cursor == '\n' || *mState.cursor == '\r' );
+
+    if ( *mState.cursor == '\r' ) {
+      ++mState.cursor; // step over the CR
+      if ( atEnd() || *mState.cursor != '\n' ) {
+        // CR w/o LF -> error
+        makeError( Error::CRWithoutLF );
+        return false;
+      }
+    }
+
+    // the cursor now sits on the LF of a CRLF pair or on a lone LF
+    newLine();
+    return true;
+  }
+
+
+  // Parses the remainder of a hash comment; the '#' itself must already
+  // have been consumed by the caller.
+  //
+  // The comment text runs up to, but not including, the next CR or LF (or
+  // the end of the input). The terminating line break, if any, is consumed.
+  // A non-empty text must be valid UTF-8 and is appended to result when
+  // reallySave is true.
+  //
+  // Returns false if the line break is malformed or the text is not valid
+  // UTF-8 (Error::InvalidUTF8); true otherwise.
+  //
+  // Unlike before, a comment of exactly one character is no longer mistaken
+  // for "'#' was the last character": its text is kept and its line break
+  // is consumed like for any other comment.
+  bool Lexer::Impl::parseHashComment( QString & result, bool reallySave ) {
+    // hash-comment := "#" *CHAR-NOT-CRLF CRLF
+
+    // check that the caller plays by the rules:
+    assert( *(mState.cursor-1) == '#' );
+
+    const char * const commentStart = mState.cursor;
+
+    // find next CRLF:
+    while ( !atEnd() && *mState.cursor != '\n' && *mState.cursor != '\r' )
+      ++mState.cursor;
+
+    // measure before the line break is eaten; zero for "#" directly
+    // followed by a line break or by the end of the script
+    const int commentLength = mState.cursor - commentStart;
+
+    if ( !atEnd() && !eatCRLF() )
+      return false;
+
+    if ( commentLength > 0 ) {
+      if ( !isValidUtf8( commentStart, commentLength ) ) {
+        makeError( Error::InvalidUTF8 );
+        return false;
+      }
+      if ( reallySave )
+        result += QString::fromUtf8( commentStart, commentLength );
+    }
+    return true;
+  }
+
+  // Parses the remainder of a bracket comment; the opening "/*" must
+  // already have been consumed by the caller.
+  //
+  // On success the cursor is left behind the closing "*/". A non-empty
+  // comment text must be valid UTF-8; when reallySave is true it is
+  // appended to result with all CR characters removed.
+  //
+  // Returns false and records an error if the comment is not closed before
+  // the end of the input (Error::UnfinishedBracketComment, positioned at
+  // the opening "/*") or if the text is not valid UTF-8.
+  bool Lexer::Impl::parseBracketComment( QString & result, bool reallySave ) {
+    // bracket-comment := "/*" *(CHAR-NOT-STAR / ("*" CHAR-NOT-SLASH )) "*/"
+
+    // check that caller plays by the rules:
+    assert( *(mState.cursor-2) == '/' );
+    assert( *(mState.cursor-1) == '*' );
+
+    const char * const commentStart = mState.cursor;
+    const int commentCol = column() - 2;
+    const int commentLine = line();
+
+    // find the next asterisk that is directly followed by a slash:
+    for ( ;; ) {
+      if ( !skipTo( '*' ) ) {
+        if ( !error() )
+          makeError( Error::UnfinishedBracketComment, commentLine, commentCol );
+        return false;
+      }
+      if ( atEnd() )
+        break;
+      ++mState.cursor; // step over the asterisk
+      // Check for the end BEFORE looking at the next character: the
+      // asterisk may have been the very last byte of the input.
+      if ( atEnd() || *mState.cursor == '/' )
+        break;
+    }
+
+    if ( atEnd() ) {
+      makeError( Error::UnfinishedBracketComment, commentLine, commentCol );
+      return false;
+    }
+
+    assert( *mState.cursor == '/' );
+
+    // the cursor is on the '/', so the closing '*' is excluded by the -1
+    const int commentLength = mState.cursor - commentStart - 1;
+    if ( commentLength > 0 ) {
+      if ( !isValidUtf8( commentStart, commentLength ) ) {
+        makeError( Error::InvalidUTF8 );
+        return false;
+      }
+      if ( reallySave ) {
+        QString tmp = QString::fromUtf8( commentStart, commentLength );
+        result += tmp.remove( '\r' ); // get rid of CR in CRLF pairs
+      }
+    }
+
+    ++mState.cursor; // eat '/'
+    return true;
+  }
+
+  // Parses a hash or bracket comment starting at the cursor.
+  // Returns false without setting an error if the cursor is not on '#' or
+  // '/'; a '/' that is not followed by '*' records Error::IllegalCharacter.
+  bool Lexer::Impl::parseComment( QString & result, bool reallySave ) {
+    // comment := hash-comment / bracket-comment
+
+    const char first = *mState.cursor;
+
+    if ( first == '#' ) {
+      ++mState.cursor;
+      return parseHashComment( result, reallySave );
+    }
+
+    if ( first != '/' )
+      return false; // don't set an error here - there was no comment
+
+    if ( charsLeft() < 2 || mState.cursor[1] != '*' ) {
+      makeError( Error::IllegalCharacter );
+      return false;
+    }
+
+    mState.cursor += 2; // eat "/*"
+    return parseBracketComment( result, reallySave );
+  }
+
+  // Skips whitespace and comments. The comment text is parsed into a
+  // throw-away string. Returns false if a line break or a comment fails to
+  // parse; true when a non-whitespace, non-comment character or the end of
+  // the input is reached.
+  bool Lexer::Impl::eatCWS() {
+    // white-space := 1*(SP / CRLF / HTAB / comment )
+
+    while ( !atEnd() ) {
+      const char ch = *mState.cursor;
+
+      if ( ch == ' ' || ch == '\t' ) { // SP / HTAB
+        ++mState.cursor;
+        continue;
+      }
+
+      if ( ch == '\n' || ch == '\r' ) { // CRLF
+        if ( !eatCRLF() )
+          return false;
+        continue;
+      }
+
+      if ( ch != '#' && ch != '/' )
+        return true; // something that is neither whitespace nor comment
+
+      // comments
+      QString dummy;
+      if ( !parseComment( dummy ) )
+        return false;
+    }
+    return true;
+  }
+
+  // Parses an identifier starting at the cursor and appends it to result.
+  //
+  // The cursor must be on an identifier character. A leading digit records
+  // Error::NoLeadingDigits. After the identifier, the input must either end
+  // or continue with a delimiter; anything else is reported through
+  // makeIllegalCharError(). Note that the identifier text is appended to
+  // result even in that last error case.
+  //
+  // Returns true on success, false if an error was recorded.
+  bool Lexer::Impl::parseIdentifier( QString & result ) {
+    // identifier := (ALPHA / "_") *(ALPHA / DIGIT / "_")
+
+    assert( isIText( *mState.cursor ) );
+
+    const char * const identifierStart = mState.cursor;
+
+    // first char:
+    // (isdigit() is only defined for unsigned char values and EOF, hence
+    // the cast of the plain char)
+    if ( isdigit( static_cast<unsigned char>( *mState.cursor ) ) ) { // no digits for the first
+      makeError( Error::NoLeadingDigits );
+      return false;
+    }
+
+    // rest of identifier chars ( now digits are allowed ):
+    for ( ++mState.cursor ; !atEnd() && isIText( *mState.cursor ) ; ++mState.cursor );
+
+    const int identifierLength = mState.cursor - identifierStart;
+
+    // Can use the fast fromLatin1 here, since identifiers are always
+    // in the us-ascii subset:
+    result += QString::fromLatin1( identifierStart, identifierLength );
+
+    if ( atEnd() || isDelim( *mState.cursor ) )
+      return true;
+
+    makeIllegalCharError( *mState.cursor );
+    return false;
+  }
+
+  // Parses the identifier part of a tag. The caller must already have
+  // consumed the ':' and made sure an identifier character follows.
+  bool Lexer::Impl::parseTag( QString & result ) {
+    // tag := ":" identifier
+
+    // preconditions established by the caller:
+    assert( mState.cursor[-1] == ':' );
+    assert( !atEnd() );
+    assert( isIText( *mState.cursor ) );
+
+    const bool identifierOk = parseIdentifier( result );
+    return identifierOk;
+  }
+
+  // Parses a number with an optional one-letter quantifier and appends
+  // the characters (digits plus quantifier) to result.
+  //
+  // The cursor must be on a digit. After the digits, and again after the
+  // quantifier, the input must either end or continue with a delimiter;
+  // anything else is reported through makeIllegalCharError().
+  //
+  // Returns true on success, false if an error was recorded.
+  bool Lexer::Impl::parseNumber( QString & result ) {
+    // number := 1*DIGIT [QUANTIFIER]
+    // QUANTIFIER := "K" / "M" / "G"
+
+    // isdigit() is only defined for unsigned char values and EOF. The
+    // character following the digits can be any byte, including one with
+    // the high bit set, so the plain char has to be cast first.
+    assert( isdigit( static_cast<unsigned char>( *mState.cursor ) ) );
+
+    while ( !atEnd() && isdigit( static_cast<unsigned char>( *mState.cursor ) ) )
+      result += *mState.cursor++;
+
+    if ( atEnd() || isDelim( *mState.cursor ) )
+      return true;
+
+    switch ( *mState.cursor ) {
+    case 'G':
+    case 'g':
+    case 'M':
+    case 'm':
+    case 'K':
+    case 'k':
+      result += *mState.cursor++;
+      break;
+    default:
+      makeIllegalCharError();
+      return false;
+    }
+
+    // quantifier found. Check for delimiter:
+    if ( atEnd() || isDelim( *mState.cursor ) )
+      return true;
+    makeIllegalCharError();
+    return false;
+  }
+
+  // Parses a multi-line string; the "text:" keyword must already have been
+  // consumed by the caller.
+  //
+  // First the rest of the "text:" line is skipped: spaces, tabs and an
+  // optional hash comment (whose text is discarded) up to and including
+  // the line break. Then lines are collected until a line consisting of a
+  // single "." is found. Collected lines have their line break removed and
+  // are dot-unstuffed; they are joined with "\n" and assigned to result.
+  // The terminating "." line is not part of the result.
+  //
+  // Returns false and records an error if something other than
+  // whitespace/comment follows "text:" (Error::NonCWSAfterTextColon), if a
+  // line is not valid UTF-8 (Error::InvalidUTF8), or if the input ends
+  // before the terminating "." line (Error::PrematureEndOfMultiLine,
+  // positioned at the "text:" keyword).
+  bool Lexer::Impl::parseMultiLine( QString & result ) {
+    // multi-line := "text:" *(SP / HTAB) (hash-comment / CRLF)
+    //               *(multi-line-literal / multi-line-dotstuff)
+    //               "." CRLF
+    // multi-line-literal := [CHAR-NOT-DOT *CHAR-NOT-CRLF] CRLF
+    // multi-line-dotstuff := "." 1*CHAR-NOT-CRLF CRLF
+    // ;; A line containing only "." ends the multi-line.
+    // ;; Remove a leading '.' if followed by another '.'.
+
+    // NOTE(review): _strnicmp is not declared by any header visibly
+    // included in this file - confirm where it comes from.
+    assert( _strnicmp( mState.cursor - 5, "text:", STR_DIM("text:") ) == 0 );
+
+    const int mlBeginLine = line();
+    const int mlBeginCol = column() - 5;
+
+    while ( !atEnd() ) {
+      switch ( *mState.cursor ) {
+      case ' ':
+      case '\t':
+        ++mState.cursor;
+        break;
+      case '#':
+        {
+          ++mState.cursor;
+          QString dummy;
+          if ( !parseHashComment( dummy ) )
+            return false;
+          goto MultiLineStart; // break from switch _and_ while
+        }
+      case '\n':
+      case '\r':
+        if ( !eatCRLF() ) return false;
+        goto MultiLineStart; // break from switch _and_ while
+      default:
+        makeError( Error::NonCWSAfterTextColon );
+        return false;
+      }
+    }
+
+  MultiLineStart:
+    if ( atEnd() ) {
+      makeError( Error::PrematureEndOfMultiLine, mlBeginLine, mlBeginCol );
+      return false;
+    }
+
+    // Now, collect the single lines until one with only a single dot is found.
+    // The terminator has to be recognised on the raw line, BEFORE
+    // dot-unstuffing: a final ".." line unstuffs to "." and must not be
+    // mistaken for the terminator when the input ends right after it.
+    QStringList lines;
+    bool sawTerminator = false;
+    while ( !atEnd() ) {
+      const char * const oldBeginOfLine = beginOfLine();
+      if ( !skipToCRLF() )
+        return false;
+      const int lineLength = mState.cursor - oldBeginOfLine;
+      if ( lineLength > 0 ) {
+        if ( !isValidUtf8( oldBeginOfLine, lineLength ) ) {
+          makeError( Error::InvalidUTF8 );
+          return false;
+        }
+        const QString line = removeCRLF( QString::fromUtf8( oldBeginOfLine, lineLength ) );
+        if ( line == "." ) {
+          sawTerminator = true; // don't include the lone dot.
+          break;
+        }
+        lines.push_back( removeDotStuff( line ) );
+      } else {
+        lines.push_back( QString::null );
+      }
+    }
+
+    if ( !sawTerminator ) {
+      makeError( Error::PrematureEndOfMultiLine, mlBeginLine, mlBeginCol );
+      return false;
+    }
+
+    result = lines.join("\n");
+    return true;
+  }
+
+ // Parses the remainder of a quoted string; the opening '"' must already
+ // have been consumed by the caller. The decoded text is appended to
+ // result. Returns true once the closing '"' has been consumed. Returns
+ // false and records an error if a line break is malformed, if a run of
+ // 8-bit characters is not valid UTF-8 (Error::InvalidUTF8), or if the
+ // input ends before the closing quote
+ // (Error::PrematureEndOfQuotedString, positioned at the opening quote).
+ bool Lexer::Impl::parseQuotedString( QString & result ) {
+ // quoted-string := DQUOTE *CHAR DQUOTE
+
+ // check that caller plays by the rules:
+ assert( *(mState.cursor-1) == '"' );
+
+ // position of the opening quote, for error reporting:
+ const int qsBeginCol = column() - 1;
+ const int qsBeginLine = line();
+
+ const QTextCodec * const codec = QTextCodec::codecForMib( 106 ); // UTF-8
+ assert( codec );
+ const std::auto_ptr<QTextDecoder> dec( codec->makeDecoder() );
+ assert( dec.get() );
+
+ while ( !atEnd() )
+ switch ( *mState.cursor ) {
+ case '"':
+ // closing quote: consume it and finish
+ ++mState.cursor;
+ return true;
+ case '\r':
+ case '\n':
+ // every line break inside the string becomes a single '\n'
+ if ( !eatCRLF() )
+ return false;
+ result += '\n';
+ break;
+ case '\\':
+ // drop the backslash; the character after it is then taken
+ // by the default branch below
+ ++mState.cursor;
+ if ( atEnd() )
+ break;
+ // else fall through:
+ default:
+ if ( !is8Bit( *mState.cursor ) )
+ result += *mState.cursor++;
+ else { // probably UTF-8
+ // validate and decode the whole run of 8-bit characters at once
+ const char * const eightBitBegin = mState.cursor;
+ skipTo8BitEnd();
+ const int eightBitLen = mState.cursor - eightBitBegin;
+ assert( eightBitLen > 0 );
+ if ( isValidUtf8( eightBitBegin, eightBitLen ) )
+ result += dec->toUnicode( eightBitBegin, eightBitLen );
+ else {
+ assert( column() >= eightBitLen );
+ // report the error at the start of the offending run
+ makeError( Error::InvalidUTF8, line(), column() - eightBitLen );
+ return false;
+ }
+ }
+ }
+
+ // ran out of input without seeing the closing quote
+ makeError( Error::PrematureEndOfQuotedString, qsBeginLine, qsBeginCol );
+ return false;
+ }
+
+  // Records Error::IllegalCharacter if ch belongs to the illegal set and
+  // Error::UnexpectedCharacter otherwise.
+  void Lexer::Impl::makeIllegalCharError( char ch ) {
+    if ( isIllegal( ch ) )
+      makeError( Error::IllegalCharacter );
+    else
+      makeError( Error::UnexpectedCharacter );
+  }
+
+} // namespace KSieve
diff --git a/libksieve/parser/parser.cpp b/libksieve/parser/parser.cpp
new file mode 100644
index 000000000..8c2db050e
--- /dev/null
+++ b/libksieve/parser/parser.cpp
@@ -0,0 +1,651 @@
+/* -*- c++ -*-
+ parser/parser.cpp
+
+ This file is part of KSieve,
+ the KDE internet mail/usenet news message filtering library.
+ Copyright (c) 2002-2003 Marc Mutz <[email protected]>
+
+ KSieve is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License, version 2, as
+ published by the Free Software Foundation.
+
+ KSieve is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+ In addition, as a special exception, the copyright holders give
+ permission to link the code of this program with any edition of
+ the Qt library by Trolltech AS, Norway (or with modified versions
+ of Qt that use the same license as Qt), and distribute linked
+ combinations including the two. You must obey the GNU General
+ Public License in all respects for all of the code used other than
+ Qt. If you modify this file, you may extend this exception to
+ your version of the file, but you are not obligated to do so. If
+ you do not wish to do so, delete this exception statement from
+ your version.
+*/
+
+#include <config.h>
+
+#include <ksieve/parser.h>
+#include <impl/parser.h>
+
+#include <ksieve/error.h>
+
+#include <qstring.h>
+
+#include <assert.h>
+#include <limits.h> // ULONG_MAX
+#include <ctype.h> // isdigit
+
+namespace KSieve {
+
+  //
+  //
+  // Parser Bridge implementation
+  //
+  // Each public Parser method checks that the private implementation
+  // object i exists and then forwards to it.
+  //
+  //
+
+  Parser::Parser( const char * scursor, const char * const send, int options )
+    : i( new Impl( scursor, send, options ) )
+  {
+  }
+
+  Parser::~Parser() {
+    delete i;
+    i = 0;
+  }
+
+  void Parser::setScriptBuilder( ScriptBuilder * builder ) {
+    assert( i != 0 );
+    i->mBuilder = builder;
+  }
+
+  ScriptBuilder * Parser::scriptBuilder() const {
+    assert( i != 0 );
+    return i->mBuilder;
+  }
+
+  const Error & Parser::error() const {
+    assert( i != 0 );
+    return i->error();
+  }
+
+  bool Parser::parse() {
+    assert( i != 0 );
+    return i->parse();
+  }
+
+}
+
+// Maps a quantifier letter (either case) to its factor:
+// K -> 1024, M -> 1024^2, G -> 1024^3.
+// Any other character trips an assertion; 1 is returned if assertions are
+// compiled out.
+static inline unsigned long factorForQuantifier( char ch ) {
+  const unsigned long kilo = 1024;
+
+  if ( ch == 'k' || ch == 'K' )
+    return kilo;
+  if ( ch == 'm' || ch == 'M' )
+    return kilo * kilo;
+  if ( ch == 'g' || ch == 'G' )
+    return kilo * kilo * kilo;
+
+  assert( 0 ); // lexer should prohibit this
+  return 1; // make compiler happy
+}
+
+// Returns true if 10 * result + add does not fit into an unsigned long.
+//
+// The limit is computed with integer division. Doing it in double
+// (ULONG_MAX / 10.0) rounds UP when unsigned long has 64 bits, so values
+// just above ULONG_MAX / 10 would slip past the first test and then wrap
+// around in 10 * result.
+static inline bool willOverflowULong( unsigned long result, unsigned long add ) {
+  static const unsigned long maxULongByTen = ULONG_MAX / 10 ;
+  // the second test is only evaluated when 10 * result cannot overflow
+  return result > maxULongByTen || ULONG_MAX - 10 * result < add ;
+}
+
+namespace KSieve {
+
+ //
+ //
+ // Parser Implementation
+ //
+ //
+
+  // Starts without a current token and without a script builder; the
+  // embedded lexer is set up over [scursor, send) with the given options.
+  Parser::Impl::Impl( const char * scursor, const char * const send, int options )
+    : mToken( Lexer::None ),
+      lexer( scursor, send, options ),
+      mBuilder( 0 )
+  {
+    // nothing else to do
+  }
+
+  // True if the current token is a quoted or a multi-line string.
+  bool Parser::Impl::isStringToken() const {
+    switch ( token() ) {
+    case Lexer::QuotedString:
+    case Lexer::MultiLineString:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+
+  // True if the current token can start an argument: a string, a number,
+  // a tag, or the "[" that opens a string list.
+  bool Parser::Impl::isArgumentToken() const {
+    if ( isStringToken() )
+      return true;
+    if ( token() == Lexer::Number || token() == Lexer::Tag )
+      return true;
+    return ( token() == Lexer::Special ) && ( mTokenValue == "[" );
+  }
+
+ // Makes sure a current token is available, if there is one to be had.
+ //
+ // As long as no token is pending, the lexer is not at its end and has
+ // no error, the next token is fetched. Comment and line-feed tokens are
+ // reported to the script builder (if one is set) and consumed right
+ // away, so the loop goes on to fetch the token after them.
+ // (consumeToken() is defined elsewhere; presumably it resets mToken.)
+ //
+ // Returns false if the lexer reports an error; the error is forwarded
+ // to the script builder first. Returns true otherwise - also when the
+ // input is exhausted and no token could be obtained.
+ bool Parser::Impl::obtainToken() {
+ while ( !mToken && !lexer.atEnd() && !lexer.error() ) {
+ mToken = lexer.nextToken( mTokenValue );
+ if ( lexer.error() )
+ break;
+ // comments and line feeds are semantically invisible and may
+ // appear anywhere, so we handle them here centrally:
+ switch ( token() ) {
+ case Lexer::HashComment:
+ if ( scriptBuilder() )
+ scriptBuilder()->hashComment( tokenValue() );
+ consumeToken();
+ break;
+ case Lexer::BracketComment:
+ if ( scriptBuilder() )
+ scriptBuilder()->bracketComment( tokenValue() );
+ consumeToken();
+ break;
+ case Lexer::LineFeeds:
+ // the token value holds the number of line feeds; report each one
+ for ( unsigned int i = 0, end = tokenValue().toUInt() ; i < end ; ++i )
+ if ( scriptBuilder() ) // better check every iteration, b/c
+ // we call out to ScriptBuilder,
+ // where nasty things might happen!
+ scriptBuilder()->lineFeed();
+ consumeToken();
+ break;
+ default: ; // make compiler happy
+ }
+ }
+ if ( lexer.error() && scriptBuilder() )
+ scriptBuilder()->error( lexer.error() );
+ return !lexer.error();
+ }
+
+  // Entry point of the parser. Parses a command list and requires the
+  // input to be used up afterwards; leftover input is reported with
+  // Error::ExpectedCommand. On success the script builder (if set) is
+  // told that parsing has finished.
+  bool Parser::Impl::parse() {
+    // this is the entry point: START := command-list
+    const bool commandsOk = parseCommandList();
+    if ( !commandsOk )
+      return false;
+
+    if ( atEnd() ) {
+      if ( scriptBuilder() )
+        scriptBuilder()->finished();
+      return true;
+    }
+
+    makeUnexpectedTokenError( Error::ExpectedCommand );
+    return false;
+  }
+
+
+  // Parses commands for as long as the current token is an identifier.
+  // Returns true at the end of the input or at the first token that is
+  // not an identifier (which is left for the caller); returns false if
+  // obtaining a token or parsing a command fails.
+  bool Parser::Impl::parseCommandList() {
+    // our ABNF:
+    // command-list := *command
+
+    while ( !atEnd() ) {
+      if ( !obtainToken() )
+        return false;
+
+      switch ( token() ) {
+      case Lexer::None:
+        break; // nothing obtained; re-check atEnd()
+      case Lexer::Identifier:
+        if ( !parseCommand() ) {
+          assert( error() );
+          return false;
+        }
+        break;
+      default:
+        return true; // not a command: the caller decides what it is
+      }
+    }
+    return true;
+  }
+
+
+ // Parses one command: an identifier, optional arguments, an optional
+ // test or test list, and finally either ";" or a block.
+ //
+ // The script builder (if set) is notified with commandStart() after the
+ // identifier and with commandEnd() once the command is complete.
+ // Returns false if any part fails to parse. A command that is cut off
+ // by the end of the input is reported with
+ // Error::MissingSemicolonOrBlock.
+ bool Parser::Impl::parseCommand() {
+ // command := identifier arguments ( ";" / block )
+ // arguments := *argument [ test / test-list ]
+ // block := "{" *command "}"
+ // our ABNF:
+ // block := "{" [ command-list ] "}"
+
+ if ( atEnd() )
+ return false;
+
+ //
+ // identifier
+ //
+
+ if ( !obtainToken() || token() != Lexer::Identifier )
+ return false;
+
+ if ( scriptBuilder() )
+ scriptBuilder()->commandStart( tokenValue() );
+ consumeToken();
+
+ //
+ // *argument
+ //
+
+ if ( !obtainToken() )
+ return false;
+
+ if ( atEnd() ) {
+ makeError( Error::MissingSemicolonOrBlock );
+ return false;
+ }
+
+ // arguments are optional: only parse them if one starts here
+ if ( isArgumentToken() && !parseArgumentList() ) {
+ assert( error() );
+ return false;
+ }
+
+ //
+ // test / test-list
+ //
+
+ if ( !obtainToken() )
+ return false;
+
+ if ( atEnd() ) {
+ makeError( Error::MissingSemicolonOrBlock );
+ return false;
+ }
+
+ // the test part is optional, too: neither "(" nor an identifier
+ // simply means that there is none
+ if ( token() == Lexer::Special && tokenValue() == "(" ) { // test-list
+ if ( !parseTestList() ) {
+ assert( error() );
+ return false;
+ }
+ } else if ( token() == Lexer::Identifier ) { // should be test:
+ if ( !parseTest() ) {
+ assert( error() );
+ return false;
+ }
+ }
+
+ //
+ // ";" / block
+ //
+
+ if ( !obtainToken() )
+ return false;
+
+ if ( atEnd() ) {
+ makeError( Error::MissingSemicolonOrBlock );
+ return false;
+ }
+
+ if ( token() != Lexer::Special ) {
+ makeUnexpectedTokenError( Error::ExpectedBlockOrSemicolon );
+ return false;
+ }
+
+ if ( tokenValue() == ";" )
+ consumeToken();
+ else if ( tokenValue() == "{" ) { // block
+ if ( !parseBlock() )
+ return false; // it's an error since we saw '{'
+ } else {
+ // some other special character
+ makeError( Error::MissingSemicolonOrBlock );
+ return false;
+ }
+
+ if ( scriptBuilder() )
+ scriptBuilder()->commandEnd();
+ return true;
+ }
+
+
+  // Parses arguments for as long as the current token can start one.
+  // Returns false if obtaining a token fails. If parseArgument() fails,
+  // the result is true exactly when no error has been recorded.
+  bool Parser::Impl::parseArgumentList() {
+    // our ABNF:
+    // argument-list := *argument
+
+    while ( !atEnd() ) {
+      if ( !obtainToken() )
+        return false;
+      if ( !isArgumentToken() )
+        break; // first non-argument token ends the list
+      if ( !parseArgument() )
+        return !error();
+    }
+    return true;
+  }
+
+
+  // Parses a single argument: a number, a tag, a string, or a string list
+  // introduced by "[". Tags and strings are reported to the script builder
+  // (if set) and consumed here; numbers and string lists are delegated.
+  // Returns false if no token is available or the token is none of these.
+  bool Parser::Impl::parseArgument() {
+    // argument := string-list / number / tag
+
+    if ( !obtainToken() || atEnd() )
+      return false;
+
+    switch ( token() ) {
+    case Lexer::Number:
+      if ( parseNumber() )
+        return true;
+      assert( error() );
+      return false;
+
+    case Lexer::Tag:
+      if ( scriptBuilder() )
+        scriptBuilder()->taggedArgument( tokenValue() );
+      consumeToken();
+      return true;
+
+    case Lexer::QuotedString:
+    case Lexer::MultiLineString:
+      if ( scriptBuilder() )
+        scriptBuilder()->stringArgument( tokenValue(), token() == Lexer::MultiLineString, QString::null );
+      consumeToken();
+      return true;
+
+    case Lexer::Special:
+      if ( tokenValue() != "[" )
+        return false;
+      if ( parseStringList() )
+        return true;
+      assert( error() );
+      return false;
+
+    default:
+      return false;
+    }
+  }
+
+
+  // Parses a parenthesised, comma-separated list of tests.
+  //
+  // The current token must be "("; otherwise false is returned without an
+  // error being recorded. The script builder (if set) is notified with
+  // testListStart() and testListEnd(). Returns true once the closing ")"
+  // has been consumed.
+  //
+  // Errors: a "," or ")" directly after "(" or after another ","
+  // (ConsecutiveCommasInTestList), two tests without a comma in between
+  // (MissingCommaInTestList), any other token or special character
+  // (NonTestInTestList), and running out of input
+  // (PrematureEndOfTestList).
+  bool Parser::Impl::parseTestList() {
+    // test-list := "(" test *("," test) ")"
+
+    if ( !obtainToken() || atEnd() )
+      return false;
+
+    if ( token() != Lexer::Special || tokenValue() != "(" )
+      return false;
+    if ( scriptBuilder() )
+      scriptBuilder()->testListStart();
+    consumeToken();
+
+    // generic while/switch construct for comma-separated lists. See
+    // parseStringList() for another one. Any fix here is likely to apply there, too.
+    // lastWasComma starts out true so that the first test needs no comma.
+    bool lastWasComma = true;
+    while ( !atEnd() ) {
+      if ( !obtainToken() )
+        return false;
+
+      switch ( token() ) {
+      case Lexer::None:
+        break;
+      case Lexer::Special:
+        assert( tokenValue().length() == 1 );
+        assert( tokenValue()[0].latin1() );
+        switch ( tokenValue()[0].latin1() ) {
+        case ')':
+          consumeToken();
+          if ( lastWasComma ) {
+            makeError( Error::ConsecutiveCommasInTestList );
+            return false;
+          }
+          if ( scriptBuilder() )
+            scriptBuilder()->testListEnd();
+          return true;
+        case ',':
+          consumeToken();
+          if( lastWasComma ) {
+            makeError( Error::ConsecutiveCommasInTestList );
+            return false;
+          }
+          lastWasComma = true;
+          break;
+        default:
+          // This is a test list, so report the test-list error code here
+          // rather than the string-list one.
+          makeError( Error::NonTestInTestList );
+          return false;
+        }
+        break;
+
+      case Lexer::Identifier:
+        if ( !lastWasComma ) {
+          makeError( Error::MissingCommaInTestList );
+          return false;
+        } else {
+          lastWasComma = false;
+          if ( !parseTest() ) {
+            assert( error() );
+            return false;
+          }
+        }
+        break;
+
+      default:
+        makeUnexpectedTokenError( Error::NonTestInTestList );
+        return false;
+      }
+    }
+
+    makeError( Error::PrematureEndOfTestList );
+    return false;
+  }
+
+
+ // Parses one test: an identifier, optional arguments, and an optional
+ // nested test or test list.
+ //
+ // The script builder (if set) is notified with testStart() after the
+ // identifier and with testEnd() when the test is complete. Unlike a
+ // command, a test may be ended by the end of the input after its
+ // identifier or after its arguments. Returns false if no identifier is
+ // available or if a part fails to parse.
+ bool Parser::Impl::parseTest() {
+ // test := identifier arguments
+ // arguments := *argument [ test / test-list ]
+
+ //
+ // identifier
+ //
+
+ if ( !obtainToken() || atEnd() )
+ return false;
+
+ if ( token() != Lexer::Identifier )
+ return false;
+
+ if ( scriptBuilder() )
+ scriptBuilder()->testStart( tokenValue() );
+ consumeToken();
+
+ //
+ // *argument
+ //
+
+ if ( !obtainToken() )
+ return false;
+
+ if ( atEnd() ) // a test w/o args
+ goto TestEnd;
+
+ // arguments are optional: only parse them if one starts here
+ if ( isArgumentToken() && !parseArgumentList() ) {
+ assert( error() );
+ return false;
+ }
+
+ //
+ // test / test-list
+ //
+
+ if ( !obtainToken() )
+ return false;
+
+ if ( atEnd() ) // a test w/o nested tests
+ goto TestEnd;
+
+ // nested tests are optional as well; any other token is left alone
+ if ( token() == Lexer::Special && tokenValue() == "(" ) { // test-list
+ if ( !parseTestList() ) {
+ assert( error() );
+ return false;
+ }
+ } else if ( token() == Lexer::Identifier ) { // should be test:
+ if ( !parseTest() ) {
+ assert( error() );
+ return false;
+ }
+ }
+
+ TestEnd:
+ if ( scriptBuilder() )
+ scriptBuilder()->testEnd();
+ return true;
+ }
+
+
+ // Parses a block: "{", an optional command list, "}".
+ //
+ // The current token must be "{"; otherwise false is returned without an
+ // error being recorded. The script builder (if set) is notified with
+ // blockStart() and blockEnd(). Running out of input inside the block is
+ // reported with Error::PrematureEndOfBlock; a token other than "}"
+ // after the commands with Error::NonCommandInCommandList.
+ bool Parser::Impl::parseBlock() {
+ // our ABNF:
+ // block := "{" [ command-list ] "}"
+
+ if ( !obtainToken() || atEnd() )
+ return false;
+
+ if ( token() != Lexer::Special || tokenValue() != "{" )
+ return false;
+ if ( scriptBuilder() )
+ scriptBuilder()->blockStart();
+ consumeToken();
+
+ if ( !obtainToken() )
+ return false;
+
+ if ( atEnd() ) {
+ makeError( Error::PrematureEndOfBlock );
+ return false;
+ }
+
+ // the command list is optional: only parse it if a command starts here
+ if ( token() == Lexer::Identifier ) {
+ if ( !parseCommandList() ) {
+ assert( error() );
+ return false;
+ }
+ }
+
+ if ( !obtainToken() )
+ return false;
+
+ if ( atEnd() ) {
+ makeError( Error::PrematureEndOfBlock );
+ return false;
+ }
+
+ // whatever follows the commands has to be the closing brace
+ if ( token() != Lexer::Special || tokenValue() != "}" ) {
+ makeError( Error::NonCommandInCommandList );
+ return false;
+ }
+ if ( scriptBuilder() )
+ scriptBuilder()->blockEnd();
+ consumeToken();
+ return true;
+ }
+
+ // Parses a bracketed, comma-separated list of strings.
+ //
+ // The current token must be "["; otherwise false is returned without an
+ // error being recorded. The script builder (if set) is notified with
+ // stringListArgumentStart(), stringListEntry() for every string, and
+ // stringListArgumentEnd(). Returns true once the closing "]" has been
+ // consumed.
+ //
+ // Errors: a "," or "]" directly after "[" or after another ","
+ // (ConsecutiveCommasInStringList), two strings without a comma in
+ // between (MissingCommaInStringList), any other token or special
+ // character (NonStringInStringList), and running out of input
+ // (PrematureEndOfStringList).
+ bool Parser::Impl::parseStringList() {
+ // string-list := "[" string *("," string) "]" / string
+ // ;; if there is only a single string, the brackets are optional
+ //
+ // However, since strings are already handled separately from
+ // string lists in parseArgument(), our ABNF is modified to:
+ // string-list := "[" string *("," string) "]"
+
+ if ( !obtainToken() || atEnd() )
+ return false;
+
+ if ( token() != Lexer::Special || tokenValue() != "[" )
+ return false;
+
+ if ( scriptBuilder() )
+ scriptBuilder()->stringListArgumentStart();
+ consumeToken();
+
+ // generic while/switch construct for comma-separated lists. See
+ // parseTestList() for another one. Any fix here is likely to apply there, too.
+ // lastWasComma starts out true so that the first string needs no comma.
+ bool lastWasComma = true;
+ while ( !atEnd() ) {
+ if ( !obtainToken() )
+ return false;
+
+ switch ( token() ) {
+ case Lexer::None:
+ break;
+ case Lexer::Special:
+ assert( tokenValue().length() == 1 );
+ switch ( tokenValue()[0].latin1() ) {
+ case ']':
+ consumeToken();
+ // NOTE(review): because lastWasComma starts out true, an empty
+ // list "[]" ends up here as well - confirm that this is intended.
+ if ( lastWasComma ) {
+ makeError( Error::ConsecutiveCommasInStringList );
+ return false;
+ }
+ if ( scriptBuilder() )
+ scriptBuilder()->stringListArgumentEnd();
+ return true;
+ case ',':
+ consumeToken();
+ if ( lastWasComma ) {
+ makeError( Error::ConsecutiveCommasInStringList );
+ return false;
+ }
+ lastWasComma = true;
+ break;
+ default:
+ makeError( Error::NonStringInStringList );
+ return false;
+ }
+ break;
+
+ case Lexer::QuotedString:
+ case Lexer::MultiLineString:
+ if ( !lastWasComma ) {
+ makeError( Error::MissingCommaInStringList );
+ return false;
+ }
+ lastWasComma = false;
+ if ( scriptBuilder() )
+ scriptBuilder()->stringListEntry( tokenValue(), token() == Lexer::MultiLineString, QString::null );
+ consumeToken();
+ break;
+
+ default:
+ makeError( Error::NonStringInStringList );
+ return false;
+ }
+ }
+
+ // ran out of input before the closing bracket
+ makeError( Error::PrematureEndOfStringList );
+ return false;
+ }
+
+  // Converts the current Number token into an unsigned long and reports
+  // it, together with the quantifier character ('\0' if there is none),
+  // to the script builder (if set). The token is then consumed.
+  //
+  // Returns false if no Number token is available, or - with
+  // Error::NumberOutOfRange - if the digits alone or the value multiplied
+  // by the quantifier's factor do not fit into an unsigned long.
+  bool Parser::Impl::parseNumber() {
+    // The lexer returns the number including the quantifier as a
+    // single token value. Here, we split it and check that the number
+    // is not out of range:
+
+    if ( !obtainToken() || atEnd() )
+      return false;
+
+    if ( token() != Lexer::Number )
+      return false;
+
+    // number:
+    unsigned long result = 0;
+    unsigned int i = 0;
+    const QCString s = tokenValue().latin1();
+    for ( const unsigned int len = s.length() ; i < len && isdigit( static_cast<unsigned char>( s[i] ) ) ; ++i ) {
+      const unsigned long digitValue = s[i] - '0' ;
+      if ( willOverflowULong( result, digitValue ) ) {
+        makeError( Error::NumberOutOfRange );
+        return false;
+      } else {
+        result *= 10 ; result += digitValue ;
+      }
+    }
+
+    // optional quantifier:
+    char quantifier = '\0';
+    if ( i < s.length() ) {
+      assert( i + 1 == s.length() );
+      quantifier = s[i];
+      const unsigned long factor = factorForQuantifier( quantifier );
+      // Integer division gives the exact limit. The same test done in
+      // double rounds ULONG_MAX up to the next power of two and therefore
+      // lets result == (ULONG_MAX + 1) / factor through, which then wraps
+      // around to 0 in the multiplication below.
+      if ( result > ULONG_MAX / factor ) {
+        makeError( Error::NumberOutOfRange );
+        return false;
+      }
+      result *= factor;
+    }
+
+    if ( scriptBuilder() )
+      scriptBuilder()->numberArgument( result, quantifier );
+    consumeToken();
+    return true;
+  }
+
+} // namespace KSieve
diff --git a/libksieve/parser/utf8validator.cpp b/libksieve/parser/utf8validator.cpp
new file mode 100644
index 000000000..248a1f5e9
--- /dev/null
+++ b/libksieve/parser/utf8validator.cpp
@@ -0,0 +1,141 @@
+/* -*- c++ -*-
+ utf8validator.cpp
+
+ This file is part of KSieve,
+ the KDE internet mail/usenet news message filtering library.
+ Copyright (c) 2003 Marc Mutz <[email protected]>
+
+ KSieve is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License, version 2, as
+ published by the Free Software Foundation.
+
+ KSieve is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+ In addition, as a special exception, the copyright holders give
+ permission to link the code of this program with any edition of
+ the Qt library by Trolltech AS, Norway (or with modified versions
+ of Qt that use the same license as Qt), and distribute linked
+ combinations including the two. You must obey the GNU General
+ Public License in all respects for all of the code used other than
+ Qt. If you modify this file, you may extend this exception to
+ your version of the file, but you are not obligated to do so. If
+ you do not wish to do so, delete this exception statement from
+ your version.
+*/
+
+#include <impl/utf8validator.h>
+
+#include <qglobal.h>
+#include <qcstring.h>
+
+// True if the most significant bit of ch is set, i.e. ch is not a
+// plain 7-bit character.
+static inline bool is8Bit( signed char ch ) {
+  return ( ch & 0x80 ) != 0;
+}
+
+// True if ch is the lead octet of a two-octet sequence: 110x xxxx.
+static inline bool isUtf8TupelIndicator( unsigned char ch ) {
+  return ( ch >> 5 ) == 0x06; // top three bits are 110
+}
+
+// True if ch is a two-octet lead octet that can only start an
+// overlong sequence: 0xC0 and 0xC1 encode values that would fit into
+// a single octet.
+static inline bool isUtf8OverlongTupel( unsigned char ch ) {
+  return ch == 0xC0 || ch == 0xC1;
+}
+
+// True if ch is the lead octet of a three-octet sequence: 1110 xxxx.
+static inline bool isUtf8TripleIndicator( unsigned char ch ) {
+  return ( ch >> 4 ) == 0x0E; // top four bits are 1110
+}
+
+// True if ch1/ch2 start an overlong three-octet sequence: lead octet
+// 0xE0 followed by 100x xxxx (0x80..0x9F) encodes a value that would
+// fit into two octets.
+static inline bool isUtf8OverlongTriple( unsigned char ch1, unsigned char ch2 ) {
+  if ( ch1 != 0xE0 )
+    return false;
+  return ch2 >= 0x80 && ch2 <= 0x9F;
+}
+
+// True if ch is the lead octet of a four-octet sequence: 1111 0xxx.
+static inline bool isUtf8QuartetIndicator( unsigned char ch ) {
+  return ( ch >> 3 ) == 0x1E; // top five bits are 11110
+}
+
+// True if ch1/ch2 start an overlong four-octet sequence: lead octet
+// 0xF0 followed by 1000 xxxx (0x80..0x8F) encodes a value that would
+// fit into three octets.
+static inline bool isUtf8OverlongQuartet( unsigned char ch1, unsigned char ch2 ) {
+  if ( ch1 != 0xF0 )
+    return false;
+  return ch2 >= 0x80 && ch2 <= 0x8F;
+}
+
+// True if ch is the lead octet of a five-octet sequence: 1111 10xx.
+static inline bool isUtf8QuintetIndicator( unsigned char ch ) {
+  return ( ch >> 2 ) == 0x3E; // top six bits are 111110
+}
+
+// True if ch1/ch2 start an overlong five-octet sequence: lead octet
+// 0xF8 followed by 1000 0xxx (0x80..0x87) encodes a value that would
+// fit into four octets.
+static inline bool isUtf8OverlongQuintet( unsigned char ch1, unsigned char ch2 ) {
+  if ( ch1 != 0xF8 )
+    return false;
+  return ch2 >= 0x80 && ch2 <= 0x87;
+}
+
+// True if ch is the lead octet of a six-octet sequence: 1111 110x,
+// i.e. 0xFC or 0xFD.
+static inline bool isUtf8SextetIndicator( unsigned char ch ) {
+  return ch == 0xFC || ch == 0xFD;
+}
+
+// True if ch1/ch2 start an overlong six-octet sequence: lead octet
+// 0xFC followed by 1000 00xx (0x80..0x83) encodes a value that would
+// fit into five octets.
+static inline bool isUtf8OverlongSextet( unsigned char ch1, unsigned char ch2 ) {
+  if ( ch1 != 0xFC )
+    return false;
+  return ch2 >= 0x80 && ch2 <= 0x83;
+}
+
+// True if ch is a continuation octet: 10xx xxxx.
+static inline bool isUtf8Continuation( unsigned char ch ) {
+  return ( ch >> 6 ) == 0x02; // top two bits are 10
+}
+
+// Returns true if the len bytes starting at s consist solely of 7-bit
+// characters and well-formed multi-octet sequences of two to six
+// octets. A multi-octet sequence is rejected if it is cut off by the
+// end of the buffer, if it is one of the overlong (not minimally
+// encoded) forms detected by the isUtf8Overlong*() helpers, or if any
+// octet following the lead octet is not a continuation octet
+// (10xx xxxx). An 8-bit octet that is not a valid lead octet is
+// rejected, too.
+//
+// s need not be NUL-terminated: no byte at or beyond s[len] is read.
+bool KSieve::isValidUtf8( const char * s, unsigned int len ) {
+  for ( unsigned int i = 0 ; i < len ; ++i ) {
+    const unsigned char ch = s[i];
+    if ( !is8Bit( ch ) )
+      continue; // plain 7-bit character
+
+    // number of continuation octets that have to follow the lead octet:
+    unsigned int trailing = 0;
+    if ( isUtf8TupelIndicator( ch ) )
+      trailing = 1;
+    else if ( isUtf8TripleIndicator( ch ) )
+      trailing = 2;
+    else if ( isUtf8QuartetIndicator( ch ) )
+      trailing = 3;
+    else if ( isUtf8QuintetIndicator( ch ) )
+      trailing = 4;
+    else if ( isUtf8SextetIndicator( ch ) )
+      trailing = 5;
+    else
+      return false; // stray continuation octet, or 0xFE / 0xFF
+
+    // i < len holds here, so len - i counts the octets left including
+    // the lead octet; we need trailing + 1 of them:
+    if ( len - i <= trailing ) // too short
+      return false;
+
+    // not minimally encoded?
+    const unsigned char next = s[i+1];
+    bool overlong = false;
+    switch ( trailing ) {
+    case 1: overlong = isUtf8OverlongTupel( ch );         break;
+    case 2: overlong = isUtf8OverlongTriple( ch, next );  break;
+    case 3: overlong = isUtf8OverlongQuartet( ch, next ); break;
+    case 4: overlong = isUtf8OverlongQuintet( ch, next ); break;
+    case 5: overlong = isUtf8OverlongSextet( ch, next );  break;
+    }
+    if ( overlong )
+      return false;
+
+    // every octet after the lead octet, including the first one, must
+    // be of the form 10xx xxxx:
+    for ( unsigned int k = 1 ; k <= trailing ; ++k )
+      if ( !isUtf8Continuation( s[i+k] ) )
+        return false;
+
+    i += trailing; // the loop increment steps over the lead octet
+  }
+  return true;
+}