diff options
Diffstat (limited to 'libksieve/parser')
-rw-r--r-- | libksieve/parser/Makefile.am | 12 | ||||
-rw-r--r-- | libksieve/parser/lexer.cpp | 666 | ||||
-rw-r--r-- | libksieve/parser/parser.cpp | 651 | ||||
-rw-r--r-- | libksieve/parser/utf8validator.cpp | 141 |
4 files changed, 1470 insertions, 0 deletions
diff --git a/libksieve/parser/Makefile.am b/libksieve/parser/Makefile.am new file mode 100644 index 000000000..044d045cf --- /dev/null +++ b/libksieve/parser/Makefile.am @@ -0,0 +1,12 @@ +# final breaks static use: +# If you feel like "fixing" it, better talk to [email protected] first :) +KDE_OPTIONS = nofinal + +INCLUDES = -I$(top_srcdir)/libksieve $(all_includes) + +noinst_LTLIBRARIES = libksieve_parser.la + +libksieve_parser_la_SOURCES = utf8validator.cpp lexer.cpp parser.cpp +libksieve_parser_la_LIBADD = ../shared/libksieve_shared.la +libksieve_parser_la_LDFLAGS = $(all_libraries) -no-undefined + diff --git a/libksieve/parser/lexer.cpp b/libksieve/parser/lexer.cpp new file mode 100644 index 000000000..d8b76da71 --- /dev/null +++ b/libksieve/parser/lexer.cpp @@ -0,0 +1,666 @@ +/* -*- c++ -*- + parser/lexer.cpp + + This file is part of KSieve, + the KDE internet mail/usenet news message filtering library. + Copyright (c) 2002-2003 Marc Mutz <[email protected]> + + KSieve is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License, version 2, as + published by the Free Software Foundation. + + KSieve is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + + In addition, as a special exception, the copyright holders give + permission to link the code of this program with any edition of + the Qt library by Trolltech AS, Norway (or with modified versions + of Qt that use the same license as Qt), and distribute linked + combinations including the two. You must obey the GNU General + Public License in all respects for all of the code used other than + Qt. If you modify this file, you may extend this exception to + your version of the file, but you are not obligated to do so. If + you do not wish to do so, delete this exception statement from + your version. +*/ + +#include <config.h> + +#include <ksieve/lexer.h> +#include <impl/lexer.h> + +#include <impl/utf8validator.h> +#include <ksieve/error.h> + +#include <qstring.h> +#include <qstringlist.h> +#include <qtextcodec.h> + +#include <memory> // std::auto_ptr + +#include <assert.h> +#include <ctype.h> // isdigit + +#ifdef STR_DIM +# undef STR_DIM +#endif +#define STR_DIM(x) (sizeof(x) - 1) + +namespace KSieve { + + // + // + // Lexer Bridge implementation + // + // + + Lexer::Lexer( const char * scursor, const char * send, int options ) + : i( 0 ) + { + i = new Impl( scursor, send, options ); + } + + Lexer::~Lexer() { + delete i; i = 0; + } + + bool Lexer::ignoreComments() const { + assert( i ); + return i->ignoreComments(); + } + + const Error & Lexer::error() const { + assert( i ); + return i->error(); + } + + bool Lexer::atEnd() const { + assert( i ); + return i->atEnd(); + } + + int Lexer::column() const { + assert( i ); + return i->column(); + } + + int Lexer::line() const { + assert( i ); + return i->line(); + } + + void Lexer::save() { + assert( i ); + i->save(); + } + + void Lexer::restore() { + assert( i ); + i->restore(); + } + + Lexer::Token Lexer::nextToken( QString & result ) { + assert( i ); + return i->nextToken( result ); + } + +} // namespace KSieve + + +// none except a-zA-Z0-9_ +static const unsigned char iTextMap[16] = { + 0x00, 0x00, 0x00, 0x00, // CTLs: none + 0x00, 0x00, 0xFF, 0xC0, // SP ... '?': 0-9 + 0x7F, 0xFF, 0xFF, 0xE1, // '@' ... '_': A-Z_ + 0x7F, 0xFF, 0xFF, 0xE0 // '`' ... DEL: a-z +}; + +// SP, HT, CR, LF, {}[]();,#/ +// ### exclude '['? Why would one want to write identifier["foo"]? +static const unsigned char delimMap[16] = { + 0x00, 0x64, 0x00, 0x00, // CTLs: CR, HT, LF + 0x90, 0xC9, 0x00, 0x10, // SP ... '?': SP, #(),; + 0x00, 0x00, 0x00, 0x16, // '@' ... '_': [] + 0x00, 0x00, 0x00, 0x16 // '`' ... DEL: {} +}; + +// All except iText, delim, "*: +static const unsigned char illegalMap[16] = { + 0xFF, 0x9B, 0xFF, 0xFF, + 0x4F, 0x16, 0x00, 0x0F, + 0x80, 0x00, 0x00, 0x0A, + 0x80, 0x00, 0x00, 0x0A +}; + +static inline bool isOfSet( const unsigned char map[16], unsigned char ch ) { + assert( ch < 128 ); + return ( map[ ch/8 ] & 0x80 >> ch%8 ); +} + +static inline bool isIText( unsigned char ch ) { + return ch <= 'z' && isOfSet( iTextMap, ch ); +} + +static inline bool isDelim( unsigned char ch ) { + return ch <= '}' && isOfSet( delimMap, ch ); +} + +static inline bool isIllegal( unsigned char ch ) { + return ch >= '~' || isOfSet( illegalMap, ch ); +} + +static inline bool is8Bit( signed char ch ) { + return ch < 0; +} + +static QString removeCRLF( const QString & s ) { + const bool CRLF = s.endsWith( "\r\n" ); + const bool LF = !CRLF && s.endsWith( "\n" ); + + const int e = CRLF ? 2 : LF ? 1 : 0 ; // what to chop off at the end + + return s.left( s.length() - e ); +} + +static QString removeDotStuff( const QString & s ) { + return s.startsWith( ".." ) ? s.mid( 1 ) : s ; +} + +namespace KSieve { + + // + // + // Lexer Implementation + // + // + + Lexer::Impl::Impl( const char * scursor, const char * send, int options ) + : mState( scursor ? scursor : send ), + mEnd( send ? send : scursor ), + mIgnoreComments( options & IgnoreComments ), + mIgnoreLF( options & IgnoreLineFeeds ) + { + if ( !scursor || !send ) + assert( atEnd() ); + } + + Lexer::Token Lexer::Impl::nextToken( QString & result ) { + assert( !atEnd() ); + result = QString::null; + //clearErrors(); + + const int oldLine = line(); + + const bool eatingWSSucceeded = ignoreComments() ? eatCWS() : eatWS() ; + + if ( !ignoreLineFeeds() && oldLine != line() ) { + result.setNum( line() - oldLine ); // return number of linefeeds encountered + return LineFeeds; + } + + if ( !eatingWSSucceeded ) + return None; + + if ( atEnd() ) + return None; + + switch ( *mState.cursor ) { + case '#': // HashComment + assert( !ignoreComments() ); + ++mState.cursor; + if ( !atEnd() ) + parseHashComment( result, true ); + return HashComment; + case '/': // BracketComment + assert( !ignoreComments() ); + ++mState.cursor; // eat slash + if ( atEnd() || *mState.cursor != '*' ) { + makeError( Error::SlashWithoutAsterisk ); + return BracketComment; + } + ++mState.cursor; // eat asterisk + if ( atEnd() ) { + makeError( Error::UnfinishedBracketComment ); + return BracketComment; + } + parseBracketComment( result, true ); + return BracketComment; + case ':': // Tag + ++mState.cursor; + if ( atEnd() ) { + makeError( Error::UnexpectedCharacter, line(), column() - 1 ); + return Tag; + } + if ( !isIText( *mState.cursor ) ) { + makeIllegalCharError( *mState.cursor ); + return Tag; + } + parseTag( result ); + return Tag; + case '"': // QuotedString + ++mState.cursor; + parseQuotedString( result ); + return QuotedString; + case '{': + case '}': + case '[': + case ']': + case '(': + case ')': + case ';': + case ',': // Special + result = *mState.cursor++; + return Special; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': // Number + parseNumber( result ); + return Number; + case 't': // maybe MultiLineString, else Identifier + if ( _strnicmp( mState.cursor, "text:", STR_DIM("text:") ) == 0 ) { + // MultiLineString + mState.cursor += STR_DIM("text:"); + parseMultiLine( result ); + // ### FIXME: There can be a hash-comment between "text:" + // and CRLF! That should be preserved somehow... + return MultiLineString; + } + // else fall through: + default: // Identifier (first must not be 0-9, and can't (caught by Number above)) + if ( !isIText( *mState.cursor ) ) { + makeError( Error::IllegalCharacter ); + return None; + } + parseIdentifier( result ); + return Identifier; + } + } + + bool Lexer::Impl::eatWS() { + while ( !atEnd() ) + switch ( *mState.cursor ) { + case '\r': + case '\n': + if ( !eatCRLF() ) + return false; + break; + case ' ': + case '\t': + ++mState.cursor; + break; + default: + return true; + } + + // at end: + return true; + } + + bool Lexer::Impl::eatCRLF() { + assert( !atEnd() ); + assert( *mState.cursor == '\n' || *mState.cursor == '\r' ); + + if ( *mState.cursor == '\r' ) { + ++mState.cursor; + if ( atEnd() || *mState.cursor != '\n' ) { + // CR w/o LF -> error + makeError( Error::CRWithoutLF ); + return false; + } else { + // good CRLF + newLine(); + return true; + } + } else /* *mState.cursor == '\n' */ { + // good, LF only + newLine(); + return true; + } + } + + + bool Lexer::Impl::parseHashComment( QString & result, bool reallySave ) { + // hash-comment := "#" *CHAR-NOT-CRLF CRLF + + // check that the caller plays by the rules: + assert( *(mState.cursor-1) == '#' ); + + const char * const commentStart = mState.cursor; + + // find next CRLF: + while ( !atEnd() ) { + if ( *mState.cursor == '\n' || *mState.cursor == '\r' ) break; + ++mState.cursor; + } + + const char * const commentEnd = mState.cursor - 1; + + if ( commentEnd == commentStart ) return true; // # was last char in script... + + if ( atEnd() || eatCRLF() ) { + const int commentLength = commentEnd - commentStart + 1; + if ( commentLength > 0 ) { + if ( !isValidUtf8( commentStart, commentLength ) ) { + makeError( Error::InvalidUTF8 ); + return false; + } + if ( reallySave ) + result += QString::fromUtf8( commentStart, commentLength ); + } + return true; + } + + return false; + } + + bool Lexer::Impl::parseBracketComment( QString & result, bool reallySave ) { + // bracket-comment := "/*" *(CHAR-NOT-STAR / ("*" CHAR-NOT-SLASH )) "*/" + + // check that caller plays by the rules: + assert( *(mState.cursor-2) == '/' ); + assert( *(mState.cursor-1) == '*' ); + + const char * const commentStart = mState.cursor; + const int commentCol = column() - 2; + const int commentLine = line(); + + // find next asterisk: + do { + if ( !skipTo( '*' ) ) { + if ( !error() ) + makeError( Error::UnfinishedBracketComment, commentLine, commentCol ); + return false; + } + } while ( !atEnd() && *++mState.cursor != '/' ); + + if ( atEnd() ) { + makeError( Error::UnfinishedBracketComment, commentLine, commentCol ); + return false; + } + + assert( *mState.cursor == '/' ); + + const int commentLength = mState.cursor - commentStart - 1; + if ( commentLength > 0 ) { + if ( !isValidUtf8( commentStart, commentLength ) ) { + makeError( Error::InvalidUTF8 ); + return false; + } + if ( reallySave ) { + QString tmp = QString::fromUtf8( commentStart, commentLength ); + result += tmp.remove( '\r' ); // get rid of CR in CRLF pairs + } + } + + ++mState.cursor; // eat '/' + return true; + } + + bool Lexer::Impl::parseComment( QString & result, bool reallySave ) { + // comment := hash-comment / bracket-comment + + switch( *mState.cursor ) { + case '#': + ++mState.cursor; + return parseHashComment( result, reallySave ); + case '/': + if ( charsLeft() < 2 || mState.cursor[1] != '*' ) { + makeError( Error::IllegalCharacter ); + return false; + } else { + mState.cursor += 2; // eat "/*" + return parseBracketComment( result, reallySave ); + } + default: + return false; // don't set an error here - there was no comment + } + } + + bool Lexer::Impl::eatCWS() { + // white-space := 1*(SP / CRLF / HTAB / comment ) + + while ( !atEnd() ) { + switch( *mState.cursor ) { + case ' ': + case '\t': // SP / HTAB + ++mState.cursor; + break;; + case '\n': + case '\r': // CRLF + if ( !eatCRLF() ) + return false; + break; + case '#': + case '/': // comments + { + QString dummy; + if ( !parseComment( dummy ) ) + return false; + } + break; + default: + return true; + } + } + return true; + } + + bool Lexer::Impl::parseIdentifier( QString & result ) { + // identifier := (ALPHA / "_") *(ALPHA DIGIT "_") + + assert( isIText( *mState.cursor ) ); + + const char * const identifierStart = mState.cursor; + + // first char: + if ( isdigit( *mState.cursor ) ) { // no digits for the first + makeError( Error::NoLeadingDigits ); + return false; + } + + // rest of identifier chars ( now digits are allowed ): + for ( ++mState.cursor ; !atEnd() && isIText( *mState.cursor ) ; ++mState.cursor ); + + const int identifierLength = mState.cursor - identifierStart; + + // Can use the fast fromLatin1 here, since identifiers are always + // in the us-ascii subset: + result += QString::fromLatin1( identifierStart, identifierLength ); + + if ( atEnd() || isDelim( *mState.cursor ) ) + return true; + + makeIllegalCharError( *mState.cursor ); + return false; + } + + bool Lexer::Impl::parseTag( QString & result ) { + // tag := ":" identifier + + // check that the caller plays by the rules: + assert( *(mState.cursor-1) == ':' ); + assert( !atEnd() ); + assert( isIText( *mState.cursor ) ); + + return parseIdentifier( result ); + } + + bool Lexer::Impl::parseNumber( QString & result ) { + // number := 1*DIGIT [QUANTIFIER] + // QUANTIFIER := "K" / "M" / "G" + + assert( isdigit( *mState.cursor ) ); + + while ( !atEnd() && isdigit( *mState.cursor ) ) + result += *mState.cursor++; + + if ( atEnd() || isDelim( *mState.cursor ) ) + return true; + + switch ( *mState.cursor ) { + case 'G': + case 'g': + case 'M': + case 'm': + case 'K': + case 'k': + result += *mState.cursor++; + break; + default: + makeIllegalCharError(); + return false; + } + + // quantifier found. Check for delimiter: + if ( atEnd() || isDelim( *mState.cursor ) ) + return true; + makeIllegalCharError(); + return false; + } + + bool Lexer::Impl::parseMultiLine( QString & result ) { + // multi-line := "text:" *(SP / HTAB) (hash-comment / CRLF) + // *(multi-line-literal / multi-line-dotstuff) + // "." CRLF + // multi-line-literal := [CHAR-NOT-DOT *CHAR-NOT-CRLF] CRLF + // multi-line-dotstuff := "." 1*CHAR-NOT-CRLF CRLF + // ;; A line containing only "." ends the multi-line. + // ;; Remove a leading '.' if followed by another '.'. + + assert( _strnicmp( mState.cursor - 5, "text:", STR_DIM("text:") ) == 0 ); + + const int mlBeginLine = line(); + const int mlBeginCol = column() - 5; + + while ( !atEnd() ) { + switch ( *mState.cursor ) { + case ' ': + case '\t': + ++mState.cursor; + break; + case '#': + { + ++mState.cursor; + QString dummy; + if ( !parseHashComment( dummy ) ) + return false; + goto MultiLineStart; // break from switch _and_ while + } + case '\n': + case '\r': + if ( !eatCRLF() ) return false; + goto MultiLineStart; // break from switch _and_ while + default: + makeError( Error::NonCWSAfterTextColon ); + return false; + } + } + + MultiLineStart: + if ( atEnd() ) { + makeError( Error::PrematureEndOfMultiLine, mlBeginLine, mlBeginCol ); + return false; + } + + // Now, collect the single lines until one with only a single dot is found: + QStringList lines; + while ( !atEnd() ) { + const char * const oldBeginOfLine = beginOfLine(); + if ( !skipToCRLF() ) + return false; + const int lineLength = mState.cursor - oldBeginOfLine; + if ( lineLength > 0 ) { + if ( !isValidUtf8( oldBeginOfLine, lineLength ) ) { + makeError( Error::InvalidUTF8 ); + return false; + } + const QString line = removeCRLF( QString::fromUtf8( oldBeginOfLine, lineLength ) ); + lines.push_back( removeDotStuff( line ) ); + if ( line == "." ) + break; + } else { + lines.push_back( QString::null ); + } + } + + if ( lines.back() != "." ) { + makeError( Error::PrematureEndOfMultiLine, mlBeginLine, mlBeginCol ); + return false; + } + + assert( !lines.empty() ); + lines.erase( --lines.end() ); // don't include the lone dot. + result = lines.join("\n"); + return true; + } + + bool Lexer::Impl::parseQuotedString( QString & result ) { + // quoted-string := DQUOTE *CHAR DQUOTE + + // check that caller plays by the rules: + assert( *(mState.cursor-1) == '"' ); + + const int qsBeginCol = column() - 1; + const int qsBeginLine = line(); + + const QTextCodec * const codec = QTextCodec::codecForMib( 106 ); // UTF-8 + assert( codec ); + const std::auto_ptr<QTextDecoder> dec( codec->makeDecoder() ); + assert( dec.get() ); + + while ( !atEnd() ) + switch ( *mState.cursor ) { + case '"': + ++mState.cursor; + return true; + case '\r': + case '\n': + if ( !eatCRLF() ) + return false; + result += '\n'; + break; + case '\\': + ++mState.cursor; + if ( atEnd() ) + break; + // else fall through: + default: + if ( !is8Bit( *mState.cursor ) ) + result += *mState.cursor++; + else { // probably UTF-8 + const char * const eightBitBegin = mState.cursor; + skipTo8BitEnd(); + const int eightBitLen = mState.cursor - eightBitBegin; + assert( eightBitLen > 0 ); + if ( isValidUtf8( eightBitBegin, eightBitLen ) ) + result += dec->toUnicode( eightBitBegin, eightBitLen ); + else { + assert( column() >= eightBitLen ); + makeError( Error::InvalidUTF8, line(), column() - eightBitLen ); + return false; + } + } + } + + makeError( Error::PrematureEndOfQuotedString, qsBeginLine, qsBeginCol ); + return false; + } + + void Lexer::Impl::makeIllegalCharError( char ch ) { + makeError( isIllegal( ch ) ? Error::IllegalCharacter : Error::UnexpectedCharacter ); + } + +} // namespace KSieve diff --git a/libksieve/parser/parser.cpp b/libksieve/parser/parser.cpp new file mode 100644 index 000000000..8c2db050e --- /dev/null +++ b/libksieve/parser/parser.cpp @@ -0,0 +1,651 @@ +/* -*- c++ -*- + parser/parser.cpp + + This file is part of KSieve, + the KDE internet mail/usenet news message filtering library. + Copyright (c) 2002-2003 Marc Mutz <[email protected]> + + KSieve is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License, version 2, as + published by the Free Software Foundation. + + KSieve is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + + In addition, as a special exception, the copyright holders give + permission to link the code of this program with any edition of + the Qt library by Trolltech AS, Norway (or with modified versions + of Qt that use the same license as Qt), and distribute linked + combinations including the two. You must obey the GNU General + Public License in all respects for all of the code used other than + Qt. If you modify this file, you may extend this exception to + your version of the file, but you are not obligated to do so. If + you do not wish to do so, delete this exception statement from + your version. +*/ + +#include <config.h> + +#include <ksieve/parser.h> +#include <impl/parser.h> + +#include <ksieve/error.h> + +#include <qstring.h> + +#include <assert.h> +#include <limits.h> // ULONG_MAX +#include <ctype.h> // isdigit + +namespace KSieve { + + // + // + // Parser Bridge implementation + // + // + + Parser::Parser( const char * scursor, const char * const send, int options ) + : i( 0 ) + { + i = new Impl( scursor, send, options ); + } + + Parser::~Parser() { + delete i; i = 0; + } + + void Parser::setScriptBuilder( ScriptBuilder * builder ) { + assert( i ); + i->mBuilder = builder; + } + + ScriptBuilder * Parser::scriptBuilder() const { + assert( i ); + return i->mBuilder; + } + + const Error & Parser::error() const { + assert( i ); + return i->error(); + } + + bool Parser::parse() { + assert( i ); + return i->parse(); + } + +} + +static inline unsigned long factorForQuantifier( char ch ) { + switch ( ch ) { + case 'g': + case 'G': + return 1024*1024*1024; + case 'm': + case 'M': + return 1024*1024; + case 'k': + case 'K': + return 1024; + default: + assert( 0 ); // lexer should prohibit this + return 1; // make compiler happy + } +} + +static inline bool willOverflowULong( unsigned long result, unsigned long add ) { + static const unsigned long maxULongByTen = (unsigned long)(ULONG_MAX / 10.0) ; + return result > maxULongByTen || ULONG_MAX - 10 * result < add ; +} + +namespace KSieve { + + // + // + // Parser Implementation + // + // + + Parser::Impl::Impl( const char * scursor, const char * const send, int options ) + : mToken( Lexer::None ), + lexer( scursor, send, options ), + mBuilder( 0 ) + { + + } + + bool Parser::Impl::isStringToken() const { + return token() == Lexer::QuotedString || + token() == Lexer::MultiLineString ; + } + + + bool Parser::Impl::isArgumentToken() const { + return isStringToken() || + token() == Lexer::Number || + token() == Lexer::Tag || + token() == Lexer::Special && mTokenValue == "[" ; + } + + bool Parser::Impl::obtainToken() { + while ( !mToken && !lexer.atEnd() && !lexer.error() ) { + mToken = lexer.nextToken( mTokenValue ); + if ( lexer.error() ) + break; + // comments and line feeds are semantically invisible and may + // appear anywhere, so we handle them here centrally: + switch ( token() ) { + case Lexer::HashComment: + if ( scriptBuilder() ) + scriptBuilder()->hashComment( tokenValue() ); + consumeToken(); + break; + case Lexer::BracketComment: + if ( scriptBuilder() ) + scriptBuilder()->bracketComment( tokenValue() ); + consumeToken(); + break; + case Lexer::LineFeeds: + for ( unsigned int i = 0, end = tokenValue().toUInt() ; i < end ; ++i ) + if ( scriptBuilder() ) // better check every iteration, b/c + // we call out to ScriptBuilder, + // where nasty things might happen! + scriptBuilder()->lineFeed(); + consumeToken(); + break; + default: ; // make compiler happy + } + } + if ( lexer.error() && scriptBuilder() ) + scriptBuilder()->error( lexer.error() ); + return !lexer.error(); + } + + bool Parser::Impl::parse() { + // this is the entry point: START := command-list + if ( !parseCommandList() ) + return false; + if ( !atEnd() ) { + makeUnexpectedTokenError( Error::ExpectedCommand ); + return false; + } + if ( scriptBuilder() ) + scriptBuilder()->finished(); + return true; + } + + + bool Parser::Impl::parseCommandList() { + // our ABNF: + // command-list := *comand + + while ( !atEnd() ) { + if ( !obtainToken() ) + return false; + if ( token() == Lexer::None ) + continue; + if ( token() != Lexer::Identifier ) + return true; + if ( !parseCommand() ) { + assert( error() ); + return false; + } + } + return true; + } + + + bool Parser::Impl::parseCommand() { + // command := identifier arguments ( ";" / block ) + // arguments := *argument [ test / test-list ] + // block := "{" *command "}" + // our ABNF: + // block := "{" [ command-list ] "}" + + if ( atEnd() ) + return false; + + // + // identifier + // + + if ( !obtainToken() || token() != Lexer::Identifier ) + return false; + + if ( scriptBuilder() ) + scriptBuilder()->commandStart( tokenValue() ); + consumeToken(); + + // + // *argument + // + + if ( !obtainToken() ) + return false; + + if ( atEnd() ) { + makeError( Error::MissingSemicolonOrBlock ); + return false; + } + + if ( isArgumentToken() && !parseArgumentList() ) { + assert( error() ); + return false; + } + + // + // test / test-list + // + + if ( !obtainToken() ) + return false; + + if ( atEnd() ) { + makeError( Error::MissingSemicolonOrBlock ); + return false; + } + + if ( token() == Lexer::Special && tokenValue() == "(" ) { // test-list + if ( !parseTestList() ) { + assert( error() ); + return false; + } + } else if ( token() == Lexer::Identifier ) { // should be test: + if ( !parseTest() ) { + assert( error() ); + return false; + } + } + + // + // ";" / block + // + + if ( !obtainToken() ) + return false; + + if ( atEnd() ) { + makeError( Error::MissingSemicolonOrBlock ); + return false; + } + + if ( token() != Lexer::Special ) { + makeUnexpectedTokenError( Error::ExpectedBlockOrSemicolon ); + return false; + } + + if ( tokenValue() == ";" ) + consumeToken(); + else if ( tokenValue() == "{" ) { // block + if ( !parseBlock() ) + return false; // it's an error since we saw '{' + } else { + makeError( Error::MissingSemicolonOrBlock ); + return false; + } + + if ( scriptBuilder() ) + scriptBuilder()->commandEnd(); + return true; + } + + + bool Parser::Impl::parseArgumentList() { + // our ABNF: + // argument-list := *argument + + while ( !atEnd() ) { + if ( !obtainToken() ) + return false; + if ( !isArgumentToken() ) + return true; + if ( !parseArgument() ) + return !error(); + } + return true; + } + + + bool Parser::Impl::parseArgument() { + // argument := string-list / number / tag + + if ( !obtainToken() || atEnd() ) + return false; + + if ( token() == Lexer::Number ) { + if ( !parseNumber() ) { + assert( error() ); + return false; + } + return true; + } else if ( token() == Lexer::Tag ) { + if ( scriptBuilder() ) + scriptBuilder()->taggedArgument( tokenValue() ); + consumeToken(); + return true; + } else if ( isStringToken() ) { + if ( scriptBuilder() ) + scriptBuilder()->stringArgument( tokenValue(), token() == Lexer::MultiLineString, QString::null ); + consumeToken(); + return true; + } else if ( token() == Lexer::Special && tokenValue() == "[" ) { + if ( !parseStringList() ) { + assert( error() ); + return false; + } + return true; + } + + return false; + } + + + bool Parser::Impl::parseTestList() { + // test-list := "(" test *("," test) ")" + + if ( !obtainToken() || atEnd() ) + return false; + + if ( token() != Lexer::Special || tokenValue() != "(" ) + return false; + if ( scriptBuilder() ) + scriptBuilder()->testListStart(); + consumeToken(); + + // generic while/switch construct for comma-separated lists. See + // parseStringList() for another one. Any fix here is like to apply there, too. + bool lastWasComma = true; + while ( !atEnd() ) { + if ( !obtainToken() ) + return false; + + switch ( token() ) { + case Lexer::None: + break; + case Lexer::Special: + assert( tokenValue().length() == 1 ); + assert( tokenValue()[0].latin1() ); + switch ( tokenValue()[0].latin1() ) { + case ')': + consumeToken(); + if ( lastWasComma ) { + makeError( Error::ConsecutiveCommasInTestList ); + return false; + } + if ( scriptBuilder() ) + scriptBuilder()->testListEnd(); + return true; + case ',': + consumeToken(); + if( lastWasComma ) { + makeError( Error::ConsecutiveCommasInTestList ); + return false; + } + lastWasComma = true; + break; + default: + makeError( Error::NonStringInStringList ); + return false; + } + break; + + case Lexer::Identifier: + if ( !lastWasComma ) { + makeError( Error::MissingCommaInTestList ); + return false; + } else { + lastWasComma = false; + if ( !parseTest() ) { + assert( error() ); + return false; + } + } + break; + + default: + makeUnexpectedTokenError( Error::NonTestInTestList ); + return false; + } + } + + makeError( Error::PrematureEndOfTestList ); + return false; + } + + + bool Parser::Impl::parseTest() { + // test := identifier arguments + // arguments := *argument [ test / test-list ] + + // + // identifier + // + + if ( !obtainToken() || atEnd() ) + return false; + + if ( token() != Lexer::Identifier ) + return false; + + if ( scriptBuilder() ) + scriptBuilder()->testStart( tokenValue() ); + consumeToken(); + + // + // *argument + // + + if ( !obtainToken() ) + return false; + + if ( atEnd() ) // a test w/o args + goto TestEnd; + + if ( isArgumentToken() && !parseArgumentList() ) { + assert( error() ); + return false; + } + + // + // test / test-list + // + + if ( !obtainToken() ) + return false; + + if ( atEnd() ) // a test w/o nested tests + goto TestEnd; + + if ( token() == Lexer::Special && tokenValue() == "(" ) { // test-list + if ( !parseTestList() ) { + assert( error() ); + return false; + } + } else if ( token() == Lexer::Identifier ) { // should be test: + if ( !parseTest() ) { + assert( error() ); + return false; + } + } + + TestEnd: + if ( scriptBuilder() ) + scriptBuilder()->testEnd(); + return true; + } + + + bool Parser::Impl::parseBlock() { + // our ABNF: + // block := "{" [ command-list ] "}" + + if ( !obtainToken() || atEnd() ) + return false; + + if ( token() != Lexer::Special || tokenValue() != "{" ) + return false; + if ( scriptBuilder() ) + scriptBuilder()->blockStart(); + consumeToken(); + + if ( !obtainToken() ) + return false; + + if ( atEnd() ) { + makeError( Error::PrematureEndOfBlock ); + return false; + } + + if ( token() == Lexer::Identifier ) { + if ( !parseCommandList() ) { + assert( error() ); + return false; + } + } + + if ( !obtainToken() ) + return false; + + if ( atEnd() ) { + makeError( Error::PrematureEndOfBlock ); + return false; + } + + if ( token() != Lexer::Special || tokenValue() != "}" ) { + makeError( Error::NonCommandInCommandList ); + return false; + } + if ( scriptBuilder() ) + scriptBuilder()->blockEnd(); + consumeToken(); + return true; + } + + bool Parser::Impl::parseStringList() { + // string-list := "[" string *("," string) "]" / string + // ;; if there is only a single string, the brackets are optional + // + // However, since strings are already handled separately from + // string lists in parseArgument(), our ABNF is modified to: + // string-list := "[" string *("," string) "]" + + if ( !obtainToken() || atEnd() ) + return false; + + if ( token() != Lexer::Special || tokenValue() != "[" ) + return false; + + if ( scriptBuilder() ) + scriptBuilder()->stringListArgumentStart(); + consumeToken(); + + // generic while/switch construct for comma-separated lists. See + // parseTestList() for another one. Any fix here is like to apply there, too. + bool lastWasComma = true; + while ( !atEnd() ) { + if ( !obtainToken() ) + return false; + + switch ( token() ) { + case Lexer::None: + break; + case Lexer::Special: + assert( tokenValue().length() == 1 ); + switch ( tokenValue()[0].latin1() ) { + case ']': + consumeToken(); + if ( lastWasComma ) { + makeError( Error::ConsecutiveCommasInStringList ); + return false; + } + if ( scriptBuilder() ) + scriptBuilder()->stringListArgumentEnd(); + return true; + case ',': + consumeToken(); + if ( lastWasComma ) { + makeError( Error::ConsecutiveCommasInStringList ); + return false; + } + lastWasComma = true; + break; + default: + makeError( Error::NonStringInStringList ); + return false; + } + break; + + case Lexer::QuotedString: + case Lexer::MultiLineString: + if ( !lastWasComma ) { + makeError( Error::MissingCommaInStringList ); + return false; + } + lastWasComma = false; + if ( scriptBuilder() ) + scriptBuilder()->stringListEntry( tokenValue(), token() == Lexer::MultiLineString, QString::null ); + consumeToken(); + break; + + default: + makeError( Error::NonStringInStringList ); + return false; + } + } + + makeError( Error::PrematureEndOfStringList ); + return false; + } + + bool Parser::Impl::parseNumber() { + // The lexer returns the number including the quantifier as a + // single token value. Here, we split is an check that the number + // is not out of range: + + if ( !obtainToken() || atEnd() ) + return false; + + if ( token() != Lexer::Number ) + return false; + + // number: + unsigned long result = 0; + unsigned int i = 0; + const QCString s = tokenValue().latin1(); + for ( const unsigned int len = s.length() ; i < len && isdigit( s[i] ) ; ++i ) { + const unsigned long digitValue = s[i] - '0' ; + if ( willOverflowULong( result, digitValue ) ) { + makeError( Error::NumberOutOfRange ); + return false; + } else { + result *= 10 ; result += digitValue ; + } + } + + // optional quantifier: + char quantifier = '\0'; + if ( i < s.length() ) { + assert( i + 1 == s.length() ); + quantifier = s[i]; + const unsigned long factor = factorForQuantifier( quantifier ); + if ( result > double(ULONG_MAX) / double(factor) ) { + makeError( Error::NumberOutOfRange ); + return false; + } + result *= factor; + } + + if ( scriptBuilder() ) + scriptBuilder()->numberArgument( result, quantifier ); + consumeToken(); + return true; + } + +} // namespace KSieve diff --git a/libksieve/parser/utf8validator.cpp b/libksieve/parser/utf8validator.cpp new file mode 100644 index 000000000..248a1f5e9 --- /dev/null +++ b/libksieve/parser/utf8validator.cpp @@ -0,0 +1,141 @@ +/* -*- c++ -*- + utf8validator.cpp + + This file is part of KSieve, + the KDE internet mail/usenet news message filtering library. + Copyright (c) 2003 Marc Mutz <[email protected]> + + KSieve is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License, version 2, as + published by the Free Software Foundation. + + KSieve is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + + In addition, as a special exception, the copyright holders give + permission to link the code of this program with any edition of + the Qt library by Trolltech AS, Norway (or with modified versions + of Qt that use the same license as Qt), and distribute linked + combinations including the two. You must obey the GNU General + Public License in all respects for all of the code used other than + Qt. If you modify this file, you may extend this exception to + your version of the file, but you are not obligated to do so. If + you do not wish to do so, delete this exception statement from + your version. +*/ + +#include <impl/utf8validator.h> + +#include <qglobal.h> +#include <qcstring.h> + +static inline bool is8Bit( signed char ch ) { + return ch < 0; +} + +static inline bool isUtf8TupelIndicator( unsigned char ch ) { + return (ch & 0xE0) == 0xC0; // 110x xxxx +} + +static inline bool isUtf8OverlongTupel( unsigned char ch ) { + return (ch & 0xFE) == 0xC0; +} + +static inline bool isUtf8TripleIndicator( unsigned char ch ) { + return (ch & 0xF0) == 0xE0; // 1110 xxxx +} + +static inline bool isUtf8OverlongTriple( unsigned char ch1, unsigned char ch2 ) { + return (ch1 & 0xFF) == 0xE0 && (ch2 & 0xE0) == 0x80 ; +} + +static inline bool isUtf8QuartetIndicator( unsigned char ch ) { + return (ch & 0xF8) == 0xF0; // 1111 0xxx +} + +static inline bool isUtf8OverlongQuartet( unsigned char ch1, unsigned char ch2 ) { + return (ch1 & 0xFF) == 0xF0 && (ch2 & 0xF0) == 0x80 ; +} + +static inline bool isUtf8QuintetIndicator( unsigned char ch ) { + return (ch & 0xFC) == 0xF8; // 1111 10xx +} + +static inline bool isUtf8OverlongQuintet( unsigned char ch1, unsigned char ch2 ) { + return (ch1 & 0xFF) == 0xF8 && (ch2 & 0xF8) == 0x80 ; +} + +static inline bool isUtf8SextetIndicator( unsigned char ch ) { + return (ch & 0xFE) == 0xFC; // 1111 110x +} + +static inline bool isUtf8OverlongSextet( unsigned char ch1, unsigned char ch2 ) { + return (ch1 & 0xFF) == 0xFC && (ch2 & 0xFC) == 0x80 ; +} + +static inline bool isUtf8Continuation( unsigned char ch ) { + return (ch & 0xC0) == 0x80; +} + +bool KSieve::isValidUtf8( const char * s, unsigned int len ) { + for ( unsigned int i = 0 ; i < len ; ++i ) { + const unsigned char ch = s[i]; + if ( !is8Bit( ch ) ) + continue; + if ( isUtf8TupelIndicator( ch ) ) { + if ( len - i < 1 ) // too short + return false; + if ( isUtf8OverlongTupel( ch ) ) // not minimally encoded + return false; + if ( !isUtf8Continuation( s[i+1] ) ) // not followed by 10xx xxxx + return false; + i += 1; + } else if ( isUtf8TripleIndicator( ch ) ) { + if ( len - i < 2 ) // too short + return false; + if ( isUtf8OverlongTriple( ch, s[i+1] ) ) // not minimally encoded + return false; + if ( !isUtf8Continuation( s[i+2] ) ) // not followed by 10xx xxxx + return false; + i += 2; + } else if ( isUtf8QuartetIndicator( ch ) ) { + if ( len - i < 3 ) // too short + return false; + if ( isUtf8OverlongQuartet( ch, s[i+1] ) ) // not minimally encoded + return false; + if ( !isUtf8Continuation( s[i+2] ) || + !isUtf8Continuation( s[i+3] ) ) // not followed by 2x 10xx xxxx + return false; + i += 3; + } else if ( isUtf8QuintetIndicator( ch ) ) { + if ( len - i < 4 ) // too short + return false; + if ( isUtf8OverlongQuintet( ch, s[i+1] ) ) // not minimally encoded + return false; + if ( !isUtf8Continuation( s[i+2] ) || + !isUtf8Continuation( s[i+3] ) || + !isUtf8Continuation( s[i+4] ) ) // not followed by 3x 10xx xxxx + return false; + i += 4; + } else if ( isUtf8SextetIndicator( ch ) ) { + if ( len - i < 5 ) // too short + return false; + if ( isUtf8OverlongSextet( ch, s[i+1] ) ) // not minimally encoded + return false; + if ( !isUtf8Continuation( s[i+2] ) || + !isUtf8Continuation( s[i+3] ) || + !isUtf8Continuation( s[i+4] ) || + !isUtf8Continuation( s[i+5] ) ) // not followed by 4x 10xx xxxx + return false; + i += 5; + } else + return false; + } + return true; +} |