/* This file is part of the KDE libraries Copyright (C) 1997 Martin Jones (mjones@kde.org) (C) 1997 Torben Weis (weis@kde.org) (C) 1998 Waldo Bastian (bastian@kde.org) (C) 1999 Lars Knoll (knoll@kde.org) (C) 1999 Antti Koivisto (koivisto@kde.org) (C) 2001-2003 Dirk Mueller (mueller@kde.org) (C) 2004 Apple Computer, Inc. (C) 2006 Germain Garand (germain@ebooksfrance.org) This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ //---------------------------------------------------------------------------- // // KDE HTML Widget - Tokenizers //#define TOKEN_DEBUG 1 //#define TOKEN_DEBUG 2 #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "html/htmltokenizer.h" #include "html/html_documentimpl.h" #include "html/htmlparser.h" #include "html/dtd.h" #include "misc/loader.h" #include "misc/htmlhashes.h" #include "tdehtmlview.h" #include "tdehtml_part.h" #include "xml/dom_docimpl.h" #include "css/csshelper.h" #include "ecma/kjs_proxy.h" #include <kcharsets.h> #include <kglobal.h> #include <ctype.h> #include <assert.h> #include <tqvariant.h> #include <kdebug.h> #include <stdlib.h> #include "kentities.c" using namespace tdehtml; static const TQChar commentStart [] = { '<','!','-','-', TQChar::null }; static const char scriptEnd [] = "</script"; static const char xmpEnd [] = "</xmp"; static const char styleEnd [] = "</style"; static const char textareaEnd [] = "</textarea"; static const char titleEnd [] = "</title"; #define TDEHTML_ALLOC_QCHAR_VEC( N ) (TQChar*) malloc( sizeof(TQChar)*( N ) ) #define TDEHTML_REALLOC_QCHAR_VEC(P, N ) (TQChar*) realloc(P, sizeof(TQChar)*( N )) #define TDEHTML_DELETE_QCHAR_VEC( P ) free((char*)( P )) // Full support for MS Windows extensions to Latin-1. // Technically these extensions should only be activated for pages // marked "windows-1252" or "cp1252", but // in the standard Microsoft way, these extensions infect hundreds of thousands // of web pages. Note that people with non-latin-1 Microsoft extensions // are SOL. 
//
// See: http://www.microsoft.com/globaldev/reference/WinCP.asp
//      http://www.bbsinc.com/iso8859.html
//      http://www.obviously.com/
//
// There may be better equivalents

// fixUpChar(x): if x holds one of the code points 0x80-0x9F listed below,
// replace it in place with the mapped Unicode code point; any other value is
// left untouched.  x is evaluated more than once and must be assignable.
// The macro expands to a bare switch statement (no do/while wrapper).
#if 0
#define fixUpChar(x)
#else
#define fixUpChar(x) \
            switch ((x).unicode()) \
            { \
            case 0x80: (x) = 0x20ac; break; \
            case 0x82: (x) = 0x201a; break; \
            case 0x83: (x) = 0x0192; break; \
            case 0x84: (x) = 0x201e; break; \
            case 0x85: (x) = 0x2026; break; \
            case 0x86: (x) = 0x2020; break; \
            case 0x87: (x) = 0x2021; break; \
            case 0x88: (x) = 0x02C6; break; \
            case 0x89: (x) = 0x2030; break; \
            case 0x8A: (x) = 0x0160; break; \
            case 0x8b: (x) = 0x2039; break; \
            case 0x8C: (x) = 0x0152; break; \
            case 0x8E: (x) = 0x017D; break; \
            case 0x91: (x) = 0x2018; break; \
            case 0x92: (x) = 0x2019; break; \
            case 0x93: (x) = 0x201C; break; \
            case 0x94: (x) = 0X201D; break; \
            case 0x95: (x) = 0x2022; break; \
            case 0x96: (x) = 0x2013; break; \
            case 0x97: (x) = 0x2014; break; \
            case 0x98: (x) = 0x02DC; break; \
            case 0x99: (x) = 0x2122; break; \
            case 0x9A: (x) = 0x0161; break; \
            case 0x9b: (x) = 0x203A; break; \
            case 0x9C: (x) = 0x0153; break; \
            case 0x9E: (x) = 0x017E; break; \
            case 0x9F: (x) = 0x0178; break; \
            default: break; \
            }
#endif

// ----------------------------------------------------------------------------

// Creates a tokenizer attached to a view.
// Zeroes the token and script buffers, creates a TDEHTMLParser for
// (_view, _doc) and calls reset().  No token buffer is allocated here;
// that happens in begin().
HTMLTokenizer::HTMLTokenizer(DOM::DocumentImpl *_doc, TDEHTMLView *_view)
{
    view = _view;
    buffer = 0;
    scriptCode = 0;
    scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
    charsets = TDEGlobal::charsets();
    parser = new TDEHTMLParser(_view, _doc);
    m_executingScript = 0;
    m_autoCloseTimer = 0;
    onHold = false;

    reset();
}

// Creates a tokenizer that parses into the document fragment i.
// Same initialisation as the view constructor, except that view is 0 and
// the TDEHTMLParser is created for (i, _doc).
HTMLTokenizer::HTMLTokenizer(DOM::DocumentImpl *_doc, DOM::DocumentFragmentImpl *i)
{
    view = 0;
    buffer = 0;
    scriptCode = 0;
    scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
    charsets = TDEGlobal::charsets();
    parser = new TDEHTMLParser( i, _doc );
    m_executingScript = 0;
    m_autoCloseTimer = 0;
    onHold = false;

    reset();
}

// Releases everything the tokenizer holds: dereferences and drops all queued
// cached scripts, frees the token buffer and the script buffer, stops the
// auto-close timer if one is running and resets currToken.
// Must not be called while a script is executing or while on hold (asserted).
void HTMLTokenizer::reset()
{
    assert(m_executingScript == 0);
    Q_ASSERT(onHold == false);
    m_abort = false;

    while (!cachedScript.isEmpty())
        cachedScript.dequeue()->deref(this);

    if ( buffer )
        TDEHTML_DELETE_QCHAR_VEC(buffer);
    buffer = dest = 0;
    size = 0;

    if ( scriptCode )
        TDEHTML_DELETE_QCHAR_VEC(scriptCode);
    scriptCode = 0;
    scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;

    if (m_autoCloseTimer) {
        killTimer(m_autoCloseTimer);
        m_autoCloseTimer = 0;
    }

    currToken.reset();
}

// Prepares the tokenizer for a new document: clears the script/hold state,
// calls reset(), allocates a fresh 255-element token buffer (size is recorded
// as 254) and puts every parsing flag and counter back to its initial value.
void HTMLTokenizer::begin()
{
    // cleared first so that the assertions in reset() hold
    m_executingScript = 0;
    onHold = false;
    reset();
    size = 254;
    buffer = TDEHTML_ALLOC_QCHAR_VEC( 255 );
    dest = buffer;
    tag = NoTag;
    pending = NonePending;
    discard = NoneDiscard;
    pre = false;
    prePos = 0;
    plaintext = false;
    xmp = false;
    processingInstruction = false;
    script = false;
    escaped = false;
    style = false;
    skipLF = false;
    select = false;
    comment = false;
    server = false;
    textarea = false;
    title = false;
    startTag = false;
    tquote = NoQuote;
    searchCount = 0;
    Entity = NoEntity;
    noMoreData = false;
    brokenComments = false;
    brokenServer = false;
    brokenScript = false;
    lineno = 0;
    scriptStartLineno = 0;
    tagStartLineno = 0;
}

// Copies 'list' into the token buffer as preformatted text.
// 'list' is taken by value and consumed.  'pre' is forced on for the duration
// of the call (unless we are inside <style>) and restored on exit; prePos is
// reset to 0 on entry and on exit.
void HTMLTokenizer::processListing(TokenizerString list)
{
    bool old_pre = pre;

    // This function adds the listing 'list' as
    // preformatted text-tokens to the token-collection
    // thereby converting TABs.
    if(!style) pre = true;
    prePos = 0;

    while ( !list.isEmpty() )
    {
        // make room for a fully expanded tab
        checkBuffer(3*TAB_SIZE);

        if (skipLF && ( *list != '\n' ))
        {
            skipLF = false;
        }

        if (skipLF)
        {
            // second half of a CRLF pair: drop it
            skipLF = false;
            ++list;
        }
        else if (( *list == '\n' ) || ( *list == '\r' ))
        {
            if (discard == LFDiscard)
            {
                // Ignore this LF
                discard = NoneDiscard; // We have discarded 1 LF
            }
            else
            {
                // Process this LF
                if (pending)
                    addPending();

                // we used to do it not at all and we want to have
                // it fixed for textarea. So here we are
                if ( textarea ) {
                    prePos++;
                    *dest++ = *list;
                } else
                    pending = LFPending;
            }
            /* Check for MS-DOS CRLF sequence */
            if (*list == '\r')
            {
                skipLF = true;
            }
            ++list;
        }
        else if (( *list == ' ' ) || ( *list == '\t'))
        {
            if (pending)
                addPending();
            if (*list == ' ')
                pending = SpacePending;
            else
                pending = TabPending;

            ++list;
        }
        else
        {
            discard = NoneDiscard;
            if (pending)
                addPending();

            prePos++;
            *dest++ = *list;
            ++list;
        }

    }

    // trailing space/tab is emitted, a trailing line feed is dropped
    if ((pending == SpacePending) || (pending == TabPending))
        addPending();
    else
        pending = NonePending;

    prePos = 0;
    pre = old_pre;
}

// Collects the raw content of the currently open <script>, <style>,
// <textarea>, <title> or <xmp> element into scriptCode.  Exactly one of the
// corresponding flags must be set (asserted).
//
// Once the collected text ends with searchStopper and is followed by '>', '/'
// or a character <= ' ', scriptCodeResync records where the end tag started.
// The next unquoted '>' then closes the element: for <script> via
// scriptHandler(), otherwise by emitting the text through processListing()
// followed by the matching close-tag token.  Returns when the element has been
// closed or src is exhausted.
void HTMLTokenizer::parseSpecial(TokenizerString &src)
{
    assert( textarea || title || !Entity );
    assert( !tag );
    assert( xmp+textarea+title+style+script == 1 );
    if (script)
        scriptStartLineno = lineno+src.lineCount();

    // resume a comment that was interrupted by the end of the previous chunk
    if ( comment ) parseComment( src );

    while ( !src.isEmpty() ) {
        checkScriptBuffer();
        unsigned char ch = src->latin1();
        // "<!-" already collected and now a second '-': a comment starts here
        if ( !scriptCodeResync && !brokenComments && !textarea && !xmp && ch == '-' && scriptCodeSize >= 3 && !src.escaped() && TQConstString( scriptCode+scriptCodeSize-3, 3 ).string() == "<!-" ) {
            comment = true;
            scriptCode[ scriptCodeSize++ ] = ch;
            ++src;
            parseComment( src );
            continue;
        }
        // end tag was seen earlier and this is its closing, unquoted '>'
        if ( scriptCodeResync && !tquote && ( ch == '>' ) ) {
            ++src;
            // cut the end tag itself off the collected text
            scriptCodeSize = scriptCodeResync-1;
            scriptCodeResync = 0;
            scriptCode[ scriptCodeSize ] = scriptCode[ scriptCodeSize + 1 ] = 0;
            if ( script )
                scriptHandler();
            else {
                processListing(TokenizerString(scriptCode, scriptCodeSize));
                processToken();
                if ( style )         { currToken.tid = ID_STYLE + ID_CLOSE_TAG; }
                else if ( textarea ) { currToken.tid = ID_TEXTAREA + ID_CLOSE_TAG; }
                else if ( title ) { currToken.tid = ID_TITLE + ID_CLOSE_TAG; }
                else if ( xmp )  { currToken.tid = ID_XMP + ID_CLOSE_TAG; }
                processToken();
                script = style = textarea = title = xmp = false;
                tquote = NoQuote;
                scriptCodeSize = scriptCodeResync = 0;
            }
            return;
        }
        // possible end of tagname, lets check.
        if ( !scriptCodeResync && !escaped && !src.escaped() && ( ch == '>' || ch == '/' || ch <= ' ' ) && ch &&
             scriptCodeSize >= searchStopperLen &&
             !TQConstString( scriptCode+scriptCodeSize-searchStopperLen, searchStopperLen ).string().find( searchStopper, 0, false )) {
            // remember where the end tag starts (stored 1-based so 0 means "none");
            // the current character is re-examined on the next iteration
            scriptCodeResync = scriptCodeSize-searchStopperLen+1;
            tquote = NoQuote;
            continue;
        }
        // inside the end tag: track quotes so a quoted '>' does not close it
        if ( scriptCodeResync && !escaped ) {
            if(ch == '\"')
                tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
            else if(ch == '\'')
                tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
            else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
                tquote = NoQuote;
        }
        escaped = ( !escaped && ch == '\\' );
        // <textarea> and <title> content has its entities decoded in place
        if (!scriptCodeResync && (textarea||title) && !src.escaped() && ch == '&') {
            TQChar *scriptCodeDest = scriptCode+scriptCodeSize;
            ++src;
            parseEntity(src,scriptCodeDest,true);
            scriptCodeSize = scriptCodeDest-scriptCode;
        }
        else {
            scriptCode[ scriptCodeSize++ ] = *src;
            ++src;
        }
    }
}

// Finishes a <script> element.
// Emits the collected script text and the closing SCRIPT token, then -- unless
// the parser is in skip mode or the document body is a FRAMESET -- either
// requests the external script named by scriptSrc or executes the inline text.
// While that happens the not-yet-parsed input (src) is parked on pendingQueue;
// the tail of this function decides how that input is resumed.
void HTMLTokenizer::scriptHandler()
{
    TQString currentScriptSrc = scriptSrc;
    scriptSrc = TQString::null;

    processListing(TokenizerString(scriptCode, scriptCodeSize));
    // inline script text as it was written into the token buffer
    TQString exScript( buffer, dest-buffer );

    processToken();
    currToken.tid = ID_SCRIPT + ID_CLOSE_TAG;
    processToken();

    // Scripts following a frameset element should not be executed or even loaded in the case of extern scripts.
    bool followingFrameset = (parser->doc()->body() && parser->doc()->body()->id() == ID_FRAMESET);
    bool effectiveScript = !parser->skipMode() && !followingFrameset;
    bool deferredScript = false;

    if ( effectiveScript ) {
        CachedScript* cs = 0;

        // forget what we just got, load from src url instead
        if ( !currentScriptSrc.isEmpty() && javascript &&
             (cs = parser->doc()->docLoader()->requestScript(currentScriptSrc, scriptSrcCharset) )) {
            cachedScript.enqueue(cs);
        }

        if (cs) {
            pendingQueue.push(src);
            uint scriptCount = cachedScript.count();
            setSrc(TokenizerString());
            scriptCodeSize = scriptCodeResync = 0;
            cs->ref(this);
            // an unchanged queue length after ref() means the script has not been
            // handled yet, so it is treated as deferred
            if (cachedScript.count() == scriptCount)
                deferredScript = true;
        }
        else if (currentScriptSrc.isEmpty() && view && javascript ) {
            pendingQueue.push(src);
            setSrc(TokenizerString());
            scriptCodeSize = scriptCodeResync = 0;
            scriptExecution( exScript, TQString::null, tagStartLineno /*scriptStartLineno*/ );
        } else {
            // script was filtered or disallowed
            effectiveScript = false;
        }
    }

    script = false;
    scriptCodeSize = scriptCodeResync = 0;

    if ( !effectiveScript )
        return;

    if ( !m_executingScript && cachedScript.isEmpty() ) {
        // nothing outstanding: continue with the parked input
        src.append(pendingQueue.pop());
    } else if ( cachedScript.isEmpty() ) {
        // still inside a script: feed the parked input back through write()
        write( pendingQueue.pop(), false );
    } else if ( !deferredScript && pendingQueue.count() > 1) {
        // merge the top entry into the one below it
        TokenizerString t = pendingQueue.pop();
        pendingQueue.top().prepend( t );
    }
}

// Executes the script text 'str' through the part attached to the view.
//   str       - script source to run
//   scriptURL - URL reported for the script; when null (and a view exists) the
//               URL of the view's document is used instead
//   baseLine  - line offset; baseLine+1 is passed on to executeScript()
// m_executingScript is incremented for the duration of the call and the
// 'script' flag is cleared meanwhile and restored afterwards.  Without a view
// nothing is executed.
void HTMLTokenizer::scriptExecution( const TQString& str, const TQString& scriptURL,
                                     int baseLine)
{
    bool oldscript = script;
    m_executingScript++;
    script = false;
    TQString url;
    if (scriptURL.isNull() && view)
        url = static_cast<DocumentImpl*>(view->part()->document().handle())->URL().url();
    else
        url = scriptURL;

    if (view)
        view->part()->executeScript(url,baseLine+1,Node(),str);
    m_executingScript--;
    script = oldscript;
}

// Consumes comment text from src, copying it into scriptCode, until a '>' that
// is accepted as the end of the comment.
//   - strict mode: every "--" pair toggles canClose; '>' ends the comment only
//     while canClose is set.
//   - otherwise: '>' ends the comment when the two characters before it are
//     "--", or unconditionally when brokenComments is set and we are not inside
//     <script>/<style>.
// On return either the comment is finished (comment == false) or src is empty.
void HTMLTokenizer::parseComment(TokenizerString &src)
{
    // SGML strict
    bool strict = parser->doc()->inStrictMode() && parser->doc()->htmlMode() != DocumentImpl::XHtml && !script && !style;
    int delimiterCount = 0;
    bool canClose = false;

    checkScriptBuffer(src.length());
    while ( src.length() ) {
        scriptCode[ scriptCodeSize++ ] = *src;

#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
        tqDebug("comment is now: *%s*", src.toString().left(16).latin1());
#endif

        if (strict)
        {
            if (src->unicode() == '-') {
                delimiterCount++;
                if (delimiterCount == 2) {
                    delimiterCount = 0;
                    canClose = !canClose;
                }
            }
            else
                delimiterCount = 0;
        }

        if ((!strict || canClose) && src->unicode() == '>')
        {
            bool handleBrokenComments =  brokenComments && !( script || style );
            // note: this local shadows the file-level scriptEnd[] string
            bool scriptEnd=false;
            if (!strict)
            {
                // scriptCode[scriptCodeSize-1] is the '>' just stored
                if ( scriptCodeSize > 2 && scriptCode[scriptCodeSize-3] == '-' &&
                     scriptCode[scriptCodeSize-2] == '-' )
                    scriptEnd=true;
            }

            if (canClose || handleBrokenComments || scriptEnd ){
                ++src;
                // outside raw-text elements the comment text is thrown away
                if ( !( title || script || xmp || textarea || style) ) {
#ifdef COMMENTS_IN_DOM
                    checkScriptBuffer();
                    scriptCode[ scriptCodeSize ] = 0;
                    scriptCode[ scriptCodeSize + 1 ] = 0;
                    currToken.tid = ID_COMMENT;
                    processListing(DOMStringIt(scriptCode, scriptCodeSize - 2));
                    processToken();
                    currToken.tid = ID_COMMENT + ID_CLOSE_TAG;
                    processToken();
#endif
                    scriptCodeSize = 0;
                }
                comment = false;
                return; // Finished parsing comment
            }
        }
        ++src;
    }
}

// Skips a server-side block ("<% ... %>").  Characters are copied into
// scriptCode until "%>" is seen; the collected text is then discarded and the
// 'server' flag cleared.
void HTMLTokenizer::parseServer(TokenizerString &src)
{
    checkScriptBuffer(src.length());
    while ( !src.isEmpty() ) {
        scriptCode[ scriptCodeSize++ ] = *src;
        if (src->unicode() == '>' &&
            scriptCodeSize > 1 && scriptCode[scriptCodeSize-2] == '%') {
            ++src;
            server = false;
            scriptCodeSize = 0;
            return; // Finished parsing server include
        }
        ++src;
    }
}

// Skips an XML processing instruction ("<? ... ?>").  Nothing is written to any
// buffer.  Quote state is tracked in tquote so that a '>' inside quotes does
// not end the instruction, unless it is directly preceded by '?'.
// On completion discard is set to LFDiscard.
void HTMLTokenizer::parseProcessingInstruction(TokenizerString &src)
{
    // previous character seen in this call
    char oldchar = 0;
    while ( !src.isEmpty() )
    {
        unsigned char chbegin = src->latin1();
        if(chbegin == '\'') {
            tquote = tquote == SingleQuote ? NoQuote : SingleQuote;
        }
        else if(chbegin == '\"') {
            tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;
        }
        // Look for '?>'
        // some crappy sites omit the "?" before it, so
        // we look for an unquoted '>' instead. (IE compatible)
        else if ( chbegin == '>' && ( !tquote || oldchar == '?' ) )
        {
            // We got a '?>' sequence
            processingInstruction = false;
            ++src;
            discard=LFDiscard;
            return; // Finished parsing comment!
        }
        ++src;
        oldchar = chbegin;
    }
}

// Plaintext mode: copies src into the token buffer unchanged, except that CR
// and CRLF are both written as a single '\n'.
void HTMLTokenizer::parseText(TokenizerString &src)
{
    while ( !src.isEmpty() )
    {
        // do we need to enlarge the buffer?
        checkBuffer();

        // ascii is okay because we only do ascii comparisons
        unsigned char chbegin = src->latin1();

        if (skipLF && ( chbegin != '\n' ))
        {
            skipLF = false;
        }

        if (skipLF)
        {
            // LF directly after a CR: already emitted as '\n'
            skipLF = false;
            ++src;
        }
        else if (( chbegin == '\n' ) || ( chbegin == '\r' ))
        {
            if (chbegin == '\r')
                skipLF = true;

            *dest++ = '\n';
            ++src;
        }
        else {
            *dest++ = *src;
            ++src;
        }
    }
}

// Parses a character entity; src is positioned just after the '&'.
//   src   - input; a recognised entity is pushed back onto src as a single
//           character (src.push) so that the caller processes it like input
//   dest  - output pointer; only advanced when the entity is unknown, in which
//           case '&' plus the collected characters are written as plain text
//   start - true to begin a new entity, false to resume one that was
//           interrupted by the end of the previous chunk
// The state lives in Entity, EntityChar, cBuffer/cBufferPos and entityLen, so
// the function can return with src empty and be re-entered later.
void HTMLTokenizer::parseEntity(TokenizerString &src, TQChar *&dest, bool start)
{
    if( start )
    {
        cBufferPos = 0;
        entityLen = 0;
        Entity = SearchEntity;
    }

    while( !src.isEmpty() )
    {
        ushort cc = src->unicode();
        switch(Entity) {
        case NoEntity:
            return;

            break;
        case SearchEntity:
            // '#' introduces a numeric reference, anything else a named one
            if(cc == '#') {
                cBuffer[cBufferPos++] = cc;
                ++src;
                Entity = NumericSearch;
            }
            else
                Entity = EntityName;

            break;

        case NumericSearch:
            if(cc == 'x' || cc == 'X') {
                cBuffer[cBufferPos++] = cc;
                ++src;
                Entity = Hexadecimal;
            }
            else if(cc >= '0' && cc <= '9')
                Entity = Decimal;
            else
                Entity = SearchSemicolon;

            break;

        case Hexadecimal:
        {
            int uc = EntityChar.unicode();
            // at most 8 hex digits are consumed
            int ll = kMin<uint>(src.length(), 8);
            while(ll--) {
                TQChar csrc(src->lower());
                cc = csrc.cell();

                if(csrc.row() || !((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f'))) {
                    break;
                }
                uc = uc*16 + (cc - ( cc < 'a' ? '0' : 'a' - 10));
                cBuffer[cBufferPos++] = cc;
                ++src;
            }
            EntityChar = TQChar(uc);
            Entity = SearchSemicolon;
            break;
        }
        case Decimal:
        {
            int uc = EntityChar.unicode();
            // collected characters (including the leading '#') are capped at 9
            int ll = kMin(src.length(), 9-cBufferPos);
            while(ll--) {
                cc = src->cell();

                if(src->row() || !(cc >= '0' && cc <= '9')) {
                    Entity = SearchSemicolon;
                    break;
                }

                uc = uc * 10 + (cc - '0');
                cBuffer[cBufferPos++] = cc;
                ++src;
            }
            EntityChar = TQChar(uc);
            if(cBufferPos == 9)  Entity = SearchSemicolon;
            break;
        }
        case EntityName:
        {
            // entity names longer than 9 characters are not considered
            int ll = kMin(src.length(), 9-cBufferPos);
            while(ll--) {
                TQChar csrc = *src;
                cc = csrc.cell();

                if(csrc.row() || !((cc >= 'a' && cc <= 'z') ||
                                   (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
                    Entity = SearchSemicolon;
                    break;
                }

                cBuffer[cBufferPos++] = cc;
                ++src;

                // be IE compatible and interpret even unterminated entities
                // outside tags. like "foo &nbspstuff bla".
                if ( tag == NoTag ) {
                    const entity* e = kde_findEntity(cBuffer, cBufferPos);
                    if ( e && e->code < 256 ) {
                        // remember the longest prefix that is a known entity
                        EntityChar = e->code;
                        entityLen = cBufferPos;
                    }
                }
            }
            if(cBufferPos == 9) Entity = SearchSemicolon;
            if(Entity == SearchSemicolon) {
                if(cBufferPos > 1) {
                    const entity *e = kde_findEntity(cBuffer, cBufferPos);
                    // IE only accepts unterminated entities < 256,
                    // Gecko accepts them all, but only outside tags
                    if(e && ( tag == NoTag || e->code < 256 || *src == ';' )) {
                        EntityChar = e->code;
                        entityLen = cBufferPos;
                    }
                }
            }
            break;
        }
        case SearchSemicolon:
#ifdef TOKEN_DEBUG
            kdDebug( 6036 ) << "ENTITY " << EntityChar.unicode() << endl;
#endif
            fixUpChar(EntityChar);

            if (*src == ';')
                ++src;

            if ( !EntityChar.isNull() ) {
                checkBuffer();
                // characters collected after the matched prefix go back to the input
                if (entityLen > 0 && entityLen < cBufferPos) {
                    int rem = cBufferPos - entityLen;
                    src.prepend( TokenizerString(TQString::fromAscii(cBuffer+entityLen, rem)) );
                }
                src.push( EntityChar );
            } else {
#ifdef TOKEN_DEBUG
                kdDebug( 6036 ) << "unknown entity!" << endl;
#endif
                checkBuffer(11);
                // ignore the sequence, add it to the buffer as plaintext
                *dest++ = '&';
                for(unsigned int i = 0; i < cBufferPos; i++)
                    dest[i] = cBuffer[i];
                dest += cBufferPos;
                if (pre)
                    prePos += cBufferPos+1;
            }

            Entity = NoEntity;
            EntityChar = TQChar::null;
            return;
        };
    }
}

// Parses the inside of a tag, i.e. everything between '<' and '>'.
// This is a resumable state machine driven by the member 'tag':
//   TagName -> SearchAttribute -> AttributeName -> SearchEqual -> SearchValue
//   -> QuotedValue / Value -> SearchAttribute ... -> SearchEnd -> NoTag
// The token buffer is reused here: buffer[0] holds the attribute id (0 when the
// name is unknown and kept in attrName) and buffer[1..] the attribute value.
// When the closing '>' is reached the token is handed to processToken() and,
// for raw-text elements, parseSpecial() is started on the remaining input.
void HTMLTokenizer::parseTag(TokenizerString &src)
{
    assert(!Entity );
    checkScriptBuffer( src.length() );

    while ( !src.isEmpty() )
    {
        checkBuffer();
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
        uint l = 0;
        while(l < src.length() && (src.toString()[l]).latin1() != '>')
            l++;
        tqDebug("src is now: *%s*, tquote: %d", src.toString().left(l).latin1(), tquote);
#endif
        switch(tag) {
        case NoTag:
            return;
        case TagName:
        {
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
            tqDebug("TagName");
#endif
            // searchCount > 0 means a "<!" was seen and we are matching "<!--"
            if (searchCount > 0)
            {
                if (*src == commentStart[searchCount])
                {
                    searchCount++;
                    if (searchCount == 4)
                    {
#ifdef TOKEN_DEBUG
                        kdDebug( 6036 ) << "Found comment" << endl;
#endif
                        // Found '<!--' sequence
                        ++src;
                        dest = buffer; // ignore the previous part of this tag
                        tag = NoTag;

                        comment = true;
                        parseComment(src);
                        return; // Finished parsing tag!
                    }
                    // cuts of high part, is okay
                    cBuffer[cBufferPos++] = src->cell();
                    ++src;
                    break;
                }
                else
                    searchCount = 0; // Stop looking for '<!--' sequence
            }

            bool finish = false;
            unsigned int ll = kMin(src.length(), CBUFLEN-cBufferPos);
            while(ll--) {
                ushort curchar = *src;
                if(curchar <= ' ' || curchar == '>' ) {
                    finish = true;
                    break;
                }
                // this is a nasty performance trick. will work for the A-Z
                // characters, but not for others. if it contains one,
                // we fail anyway
                char cc = curchar;
                cBuffer[cBufferPos++] = cc | 0x20;
                ++src;
            }

            // Disadvantage: we add the possible rest of the tag
            // as attribute names. ### judge if this causes problems
            if(finish || CBUFLEN == cBufferPos) {
                bool beginTag;
                char* ptr = cBuffer;
                unsigned int len = cBufferPos;
                cBuffer[cBufferPos] = '\0';
                if ((cBufferPos > 0) && (*ptr == '/'))
                {
                    // End Tag
                    beginTag = false;
                    ptr++;
                    len--;
                }
                else
                    // Start Tag
                    beginTag = true;

                // Accept empty xml tags like <br/>
                if(len > 1 && ptr[len-1] == '/' ) {
                    ptr[--len] = '\0';
                    // if its like <br/> and not like <input/ value=foo>, take it as flat
                    if (*src == '>')
                        currToken.flat = true;
                }

                uint tagID = tdehtml::getTagID(ptr, len);
                if (!tagID) {
#ifdef TOKEN_DEBUG
                    TQCString tmp(ptr, len+1);
                    kdDebug( 6036 ) << "Unknown tag: \"" << tmp.data() << "\"" << endl;
#endif
                    dest = buffer;
                }
                else
                {
#ifdef TOKEN_DEBUG
                    TQCString tmp(ptr, len+1);
                    kdDebug( 6036 ) << "found tag id=" << tagID << ": " << tmp.data() << endl;
#endif
                    currToken.tid = beginTag ? tagID : tagID + ID_CLOSE_TAG;
                    dest = buffer;
                }
                tag = SearchAttribute;
                cBufferPos = 0;
            }
            break;
        }
        case SearchAttribute:
        {
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
            tqDebug("SearchAttribute");
#endif
            bool atespace = false;
            ushort curchar;
            while(!src.isEmpty()) {
                curchar = *src;
                if(curchar > ' ') {
                    if(curchar == '<' || curchar == '>')
                        tag = SearchEnd;
                    else if(atespace && (curchar == '\'' || curchar == '"'))
                    {
                        // a quote after whitespace: value without attribute name
                        tag = SearchValue;
                        *dest++ = 0;
                        attrName = TQString::null;
                    }
                    else
                        tag = AttributeName;

                    cBufferPos = 0;
                    break;
                }
                atespace = true;
                ++src;
            }
            break;
        }
        case AttributeName:
        {
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
            tqDebug("AttributeName");
#endif
            ushort curchar;
            int ll = kMin(src.length(), CBUFLEN-cBufferPos);

            while(ll--) {
                curchar = *src;
                if(curchar <= '>') {
                    if(curchar <= ' ' || curchar == '=' || curchar == '>') {
                        unsigned int a;
                        cBuffer[cBufferPos] = '\0';
                        a = tdehtml::getAttrID(cBuffer, cBufferPos);
                        if ( !a ) {
                            // did we just get /> or e.g checked/>
                            if (curchar == '>' && cBufferPos >=1 && cBuffer[cBufferPos-1] == '/') {
                                currToken.flat = true;
                                if (cBufferPos>1)
                                    a = tdehtml::getAttrID(cBuffer, cBufferPos-1);
                            }
                            if (!a)
                                attrName = TQString::fromLatin1(TQCString(cBuffer, cBufferPos+1).data());
                        }

                        // buffer[0] carries the attribute id for addAttribute()
                        dest = buffer;
                        *dest++ = a;
#ifdef TOKEN_DEBUG
                        if (!a || (cBufferPos && *cBuffer == '!'))
                            kdDebug( 6036 ) << "Unknown attribute: *" << TQCString(cBuffer, cBufferPos+1).data() << "*" << endl;
                        else
                            kdDebug( 6036 ) << "Known attribute: " << TQCString(cBuffer, cBufferPos+1).data() << endl;
#endif

                        tag = SearchEqual;
                        break;
                    }
                }
                // lower-case A-Z only; other characters are stored unchanged
                cBuffer[cBufferPos++] =
                    ( curchar >= 'A' && curchar <= 'Z' ) ? curchar | 0x20 : curchar;
                ++src;
            }
            // name filled the whole buffer: treat what we have as an unknown attribute
            if ( cBufferPos == CBUFLEN ) {
                cBuffer[cBufferPos] = '\0';
                attrName = TQString::fromLatin1(TQCString(cBuffer, cBufferPos+1).data());
                dest = buffer;
                *dest++ = 0;
                tag = SearchEqual;
            }
            break;
        }
        case SearchEqual:
        {
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
            tqDebug("SearchEqual");
#endif
            ushort curchar;
            bool atespace = false;
            while(!src.isEmpty()) {
                curchar = src->unicode();
                if(curchar > ' ') {
                    if(curchar == '=') {
#ifdef TOKEN_DEBUG
                        kdDebug(6036) << "found equal" << endl;
#endif
                        tag = SearchValue;
                        ++src;
                    }
                    else if(atespace && (curchar == '\'' || curchar == '"'))
                    {
                        tag = SearchValue;
                        *dest++ = 0;
                        attrName = TQString::null;
                    }
                    else {
                        // attribute without a value: store it with an empty string
                        DOMString v("");
                        currToken.addAttribute(parser->docPtr(), buffer, attrName, v);
                        dest = buffer;
                        tag = SearchAttribute;
                    }
                    break;
                }
                atespace = true;
                ++src;
            }
            break;
        }
        case SearchValue:
        {
            ushort curchar;
            while(!src.isEmpty()) {
                curchar = src->unicode();
                if(curchar > ' ') {
                    if(( curchar == '\'' || curchar == '\"' )) {
                        tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
                        tag = QuotedValue;
                        ++src;
                    } else
                        tag = Value;

                    break;
                }
                ++src;
            }
            break;
        }
        case QuotedValue:
        {
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
            tqDebug("QuotedValue");
#endif
            ushort curchar;
            while(!src.isEmpty()) {
                checkBuffer();

                curchar = src->unicode();
                // cheap pre-filter: '&', '"' and '\'' are all <= '\''
                if(curchar <= '\'' && !src.escaped()) {
                    // ### attributes like '&{blaa....};' are supposed to be treated as jscript.
                    if ( curchar == '&' )
                    {
                        ++src;
                        parseEntity(src, dest, true);
                        break;
                    }
                    else if ( (tquote == SingleQuote && curchar == '\'') ||
                              (tquote == DoubleQuote && curchar == '\"') )
                    {
                        // some <input type=hidden> rely on trailing spaces. argh
                        while(dest > buffer+1 && (*(dest-1) == '\n' || *(dest-1) == '\r'))
                            dest--; // remove trailing newlines
                        DOMString v(buffer+1, dest-buffer-1);
                        currToken.addAttribute(parser->docPtr(), buffer, attrName, v);

                        dest = buffer;
                        tag = SearchAttribute;
                        tquote = NoQuote;
                        ++src;
                        break;
                    }
                }
                *dest++ = *src;
                ++src;
            }
            break;
        }
        case Value:
        {
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
            tqDebug("Value");
#endif
            ushort curchar;
            while(!src.isEmpty()) {
                checkBuffer();
                curchar = src->unicode();
                if(curchar <= '>' && !src.escaped()) {
                    // parse Entities
                    if ( curchar == '&' )
                    {
                        ++src;
                        parseEntity(src, dest, true);
                        break;
                    }
                    // no quotes. Every space means end of value
                    // '/' does not delimit in IE!
                    if ( curchar <= ' ' || curchar == '>' )
                    {
                        DOMString v(buffer+1, dest-buffer-1);
                        currToken.addAttribute(parser->docPtr(), buffer, attrName, v);
                        dest = buffer;
                        tag = SearchAttribute;
                        break;
                    }
                }

                *dest++ = *src;
                ++src;
            }
            break;
        }
        case SearchEnd:
        {
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
            tqDebug("SearchEnd");
#endif
            while(!src.isEmpty()) {
                if(*src == '<' || *src == '>')
                    break;

                if (*src == '/')
                    currToken.flat = true;

                ++src;
            }
            // NOTE(review): when src is empty this still evaluates *src; confirm
            // that TokenizerString tolerates dereferencing an empty string.
            if(src.isEmpty() && *src != '<' && *src != '>') break;

            searchCount = 0; // Stop looking for '<!--' sequence
            tag = NoTag;
            tquote = NoQuote;
            if ( *src == '>' )
                ++src;

            if ( !currToken.tid ) //stop if tag is unknown
                return;

            uint tagID = currToken.tid;
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 0
            kdDebug( 6036 ) << "appending Tag: " << tagID << endl;
#endif
            // If the tag requires an end tag it cannot be flat,
            // unless we are using the HTML parser to parse XHTML
            // The only exception is SCRIPT and priority 0 tokens.
            if (tagID < ID_CLOSE_TAG && tagID != ID_SCRIPT &&
                DOM::endTag[tagID] == DOM::REQUIRED &&
                parser->doc()->htmlMode() != DocumentImpl::XHtml)
                currToken.flat = false;

            bool beginTag = !currToken.flat && (tagID < ID_CLOSE_TAG);

            // from here on tagID is the plain element id, also for close tags
            if(tagID >= ID_CLOSE_TAG)
                tagID -= ID_CLOSE_TAG;
            else if ( !brokenScript && tagID == ID_SCRIPT ) {
                // opening <script>: pick up src/charset and decide whether the
                // declared type/language is one we treat as JavaScript
                DOMStringImpl* a = 0;
                bool foundTypeAttribute = false;
                scriptSrc = scriptSrcCharset = TQString::null;
                if ( currToken.attrs && /* potentially have a ATTR_SRC ? */
                     view && /* are we a regular tokenizer or just for innerHTML ? */
                     parser->doc()->view()->part()->jScriptEnabled() /* jscript allowed at all? */
                    ) {
                    if ( ( a = currToken.attrs->getValue( ATTR_SRC ) ) )
                        scriptSrc = parser->doc()->completeURL(tdehtml::parseURL( DOMString(a) ).string() );
                    if ( ( a = currToken.attrs->getValue( ATTR_CHARSET ) ) )
                        scriptSrcCharset = DOMString(a).string().stripWhiteSpace();
                    if ( scriptSrcCharset.isEmpty() && view)
                        scriptSrcCharset = parser->doc()->view()->part()->encoding();
                    /* Check type before language, since language is deprecated */
                    if ((a = currToken.attrs->getValue(ATTR_TYPE)) != 0 && !DOMString(a).string().isEmpty())
                        foundTypeAttribute = true;
                    else
                        a = currToken.attrs->getValue(ATTR_LANGUAGE);
                }
                javascript = true;

                if( foundTypeAttribute ) {
                    /*
                        Mozilla 1.5 doesn't accept the text/javascript1.x formats, but WinIE 6 does.
                        Mozilla 1.5 doesn't accept text/jscript, text/ecmascript, and text/livescript, but WinIE 6 does.
                        Mozilla 1.5 accepts application/x-javascript, WinIE 6 doesn't.
                        Mozilla 1.5 allows leading and trailing whitespace, but WinIE 6 doesn't.
                        Mozilla 1.5 and WinIE 6 both accept the empty string, but neither accept a whitespace-only string.
                        We want to accept all the values that either of these browsers accept, but not other values.
                    */
                    TQString type = DOMString(a).string().stripWhiteSpace().lower();
                    if( type.compare("text/javascript") != 0 &&
                        type.compare("text/javascript1.0") != 0 &&
                        type.compare("text/javascript1.1") != 0 &&
                        type.compare("text/javascript1.2") != 0 &&
                        type.compare("text/javascript1.3") != 0 &&
                        type.compare("text/javascript1.4") != 0 &&
                        type.compare("text/javascript1.5") != 0 &&
                        type.compare("text/jscript") != 0 &&
                        type.compare("text/ecmascript") != 0 &&
                        type.compare("text/livescript") != 0 &&
                        type.compare("application/x-javascript") != 0 &&
                        type.compare("application/x-ecmascript") != 0 &&
                        type.compare("application/javascript") != 0 &&
                        type.compare("application/ecmascript") != 0 )
                        javascript = false;
                } else if( a ) {
                    /*
                        Mozilla 1.5 doesn't accept jscript or ecmascript, but WinIE 6 does.
                        Mozilla 1.5 accepts javascript1.0, javascript1.4, and javascript1.5, but WinIE 6 accepts only 1.1 - 1.3.
                        Neither Mozilla 1.5 nor WinIE 6 accept leading or trailing whitespace.
                        We want to accept all the values that either of these browsers accept, but not other values.
                    */
                    TQString lang = DOMString(a).string();
                    lang = lang.lower();
                    if( lang.compare("") != 0 &&
                        lang.compare("javascript") != 0 &&
                        lang.compare("javascript1.0") != 0 &&
                        lang.compare("javascript1.1") != 0 &&
                        lang.compare("javascript1.2") != 0 &&
                        lang.compare("javascript1.3") != 0 &&
                        lang.compare("javascript1.4") != 0 &&
                        lang.compare("javascript1.5") != 0 &&
                        lang.compare("ecmascript") != 0 &&
                        lang.compare("livescript") != 0 &&
                        lang.compare("jscript") )
                        javascript = false;
                }
            }

            processToken();

            if ( parser->selectMode() && beginTag)
                discard = AllDiscard;

            switch( tagID ) {
            case ID_PRE:
                pre = beginTag;
                if (beginTag)
                    discard = LFDiscard;
                prePos = 0;
                break;
            case ID_BR:
                prePos = 0;
                break;
            case ID_SCRIPT:
                if (beginTag) {
                    searchStopper = scriptEnd;
                    searchStopperLen = 8;
                    script = true;
                    parseSpecial(src);
                }
                // NOTE(review): tagID has already had ID_CLOSE_TAG subtracted above,
                // so this test looks always true here -- confirm whether a plain
                // </script> is meant to reach scriptHandler() as well.
                else if (tagID < ID_CLOSE_TAG) // Handle <script src="foo"/>
                    scriptHandler();
                break;
            case ID_STYLE:
                if (beginTag) {
                    searchStopper = styleEnd;
                    searchStopperLen = 7;
                    style = true;
                    parseSpecial(src);
                }
                break;
            case ID_TEXTAREA:
                if(beginTag) {
                    searchStopper = textareaEnd;
                    searchStopperLen = 10;
                    textarea = true;
                    discard = NoneDiscard;
                    parseSpecial(src);
                }
                break;
            case ID_TITLE:
                if (beginTag) {
                    searchStopper = titleEnd;
                    searchStopperLen = 7;
                    title = true;
                    parseSpecial(src);
                }
                break;
            case ID_XMP:
                if (beginTag) {
                    searchStopper = xmpEnd;
                    searchStopperLen = 5;
                    xmp = true;
                    parseSpecial(src);
                }
                break;
            case ID_SELECT:
                select = beginTag;
                break;
            case ID_PLAINTEXT:
                plaintext = beginTag;
                break;
            }
            return; // Finished parsing tag!
        }
        } // end switch
    }
    return;
}

// Writes the whitespace recorded in 'pending' to the token buffer and clears it.
//   - inside <select> (and not in a comment or script): always a single space
//   - inside <textarea>: the literal character ('\n', ' ' or '\t')
//   - otherwise: space and LF literally, a tab expanded to spaces up to the
//     next multiple of TAB_SIZE
// prePos is kept in step in the latter two cases.  Must not be called with
// pending == NonePending in those cases (asserted).
void HTMLTokenizer::addPending()
{
    if ( select && !(comment || script))
    {
        *dest++ = ' ';
    }
    else if ( textarea )
    {
        switch(pending) {
        case LFPending:  *dest++ = '\n'; prePos = 0; break;
        case SpacePending: *dest++ = ' '; ++prePos; break;
        case TabPending: *dest++ = '\t'; prePos += TAB_SIZE - (prePos % TAB_SIZE); break;
        case NonePending:
            assert(0);
        }
    }
    else
    {
        int p;

        switch (pending)
        {
        case SpacePending:
            // Insert a breaking space
            *dest++ = TQChar(' ');
            prePos++;
            break;

        case LFPending:
            *dest = '\n';
            dest++;
            prePos = 0;
            break;

        case TabPending:
            p = TAB_SIZE - ( prePos % TAB_SIZE );
            for ( int x = 0; x < p; x++ )
                *dest++ = TQChar(' ');
            prePos += p;
            break;

        case NonePending:
            assert(0);
            break;
        }
    }

    pending = NonePending;
}

// Main entry point: feeds 'str' to the tokenizer.
//   str        - the data to tokenize
//   appendData - decides where the data is queued when parsing has to be
//                postponed: true appends to the bottom of pendingQueue, false
//                to the top
// Does nothing before begin() has allocated the buffer.  Parsing is postponed
// while external scripts are outstanding, or while a script is executing and
// appendData is set; while onHold the data is only appended to src.  Otherwise
// the input is dispatched character by character to the parse* function
// matching the current state.  If no more data is expected and nothing is
// outstanding, end() is called on exit.
void HTMLTokenizer::write( const TokenizerString &str, bool appendData )
{
#ifdef TOKEN_DEBUG
    kdDebug( 6036 ) << this << " Tokenizer::write(\"" << str.toString() << "\"," << appendData << ")" << endl;
#endif

    if ( !buffer )
        return;

    if ( ( m_executingScript && appendData ) || cachedScript.count() ) {
        // don't parse; we will do this later
        if (pendingQueue.isEmpty())
            pendingQueue.push(str);
        else if (appendData)
            pendingQueue.bottom().append(str);
        else
            pendingQueue.top().append(str);
        return;
    }

    if ( onHold ) {
        src.append(str);
        return;
    }

    if (!src.isEmpty())
        src.append(str);
    else
        setSrc(str);
    m_abort = false;

//     if (Entity)
//         parseEntity(src, dest);

    while ( !src.isEmpty() )
    {
        if ( m_abort )
            return;
        // do we need to enlarge the buffer?
        checkBuffer();

        ushort cc = src->unicode();

        if (skipLF && (cc != '\n'))
            skipLF = false;

        if (skipLF) {
            skipLF = false;
            ++src;
        }
        // resume whatever construct was interrupted by the end of the last chunk
        else if ( Entity )
            parseEntity( src, dest );
        else if ( plaintext )
            parseText( src );
        else if (script)
            parseSpecial(src);
        else if (style)
            parseSpecial(src);
        else if (xmp)
            parseSpecial(src);
        else if (textarea)
            parseSpecial(src);
        else if (title)
            parseSpecial(src);
        else if (comment)
            parseComment(src);
        else if (server)
            parseServer(src);
        else if (processingInstruction)
            parseProcessingInstruction(src);
        else if (tag)
            parseTag(src);
        else if ( startTag )
        {
            // cc is the first character after '<'
            startTag = false;
            bool endTag = false;

            switch(cc) {
            case '/':
                endTag = true;
                break;
            case '!':
            {
                // <!-- comment -->
                searchCount = 1; // Look for '<!--' sequence to start comment

                break;
            }
            case '?':
            {
                // xml processing instruction
                processingInstruction = true;
                tquote = NoQuote;
                parseProcessingInstruction(src);
                continue;

                break;
            }
            case '%':
                if (!brokenServer) {
                    // <% server stuff, handle as comment %>
                    server = true;
                    tquote = NoQuote;
                    parseServer(src);
                    continue;
                }
                // else fall through
            default:
            {
                if( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z')))
                {
                    // Start of a Start-Tag
                }
                else
                {
                    // Invalid tag
                    // Add as is
                    if (pending)
                        addPending();
                    *dest = '<';
                    dest++;
                    continue;
                }
            }
            }; // end case

            // According to SGML any LF immediately after a starttag, or
            // immediately before an endtag should be ignored.
            // ### Gecko and MSIE though only ignores LF immediately after
            // starttags and only for PRE elements -- asj (28/06-2005)
            // (the else below belongs to the inner 'if (!select)')
            if ( pending )
                if (!select)
                    addPending();
                else
                    pending = NonePending;

            // Cancel unused discards
            discard = NoneDiscard;
            // if (!endTag) discard = LFDiscard;

            processToken();

            cBufferPos = 0;
            tag = TagName;
            parseTag(src);
        }
        else if ( cc == '&' && !src.escaped())
        {
            ++src;
            if ( pending )
                addPending();
            discard = NoneDiscard;
            parseEntity(src, dest, true);
        }
        else if ( cc == '<' && !src.escaped())
        {
            tagStartLineno = lineno+src.lineCount();
            ++src;
            discard = NoneDiscard;
            startTag = true;
        }
        else if (( cc == '\n' ) || ( cc == '\r' ))
        {
            if (discard == SpaceDiscard)
                discard = NoneDiscard;

            if (discard == LFDiscard) {
                // Ignore one LF
                discard = NoneDiscard;
            }
            else if (discard == AllDiscard)
            {
                // Ignore
            }
            else
            {
                if (select && !script) {
                    pending = LFPending;
                } else {
                    if (pending)
                        addPending();
                    pending = LFPending;
                }
            }

            /* Check for MS-DOS CRLF sequence */
            if (cc == '\r')
            {
                skipLF = true;
            }
            ++src;
        }
        else if (( cc == ' ' ) || ( cc == '\t' ))
        {
            if(discard == LFDiscard)
                discard = NoneDiscard;

            if(discard == SpaceDiscard) {
                // Ignore one space
                discard = NoneDiscard;
            }
            else if(discard == AllDiscard)
            {
                // Ignore
            }
            else {
                if (select && !script) {
                    if (!pending)
                        pending = SpacePending;
                } else {
                    if (pending)
                        addPending();
                    if (cc == ' ')
                        pending = SpacePending;
                    else
                        pending = TabPending;
                }
            }

            ++src;
        }
        else
        {
            // ordinary text character
            if (pending)
                addPending();

            discard = NoneDiscard;
            if ( pre )
            {
                prePos++;
            }
            *dest = *src;
            fixUpChar( *dest );
            ++dest;
            ++src;
        }
    }

    if (noMoreData && cachedScript.isEmpty() && !m_executingScript)
        end(); // this actually causes us to be deleted
}

// Timer callback: when the auto-close timer fires and no external script is
// outstanding, finish() is called.
void HTMLTokenizer::timerEvent( TQTimerEvent *e )
{
    if ( e->timerId() == m_autoCloseTimer && cachedScript.isEmpty() ) {
        finish();
    }
}

// Stops the auto-close timer and, if b is true, starts a new one with
// startTimer(100) (presumably milliseconds -- confirm against the timer API).
// Unlike reset() and finish(), killTimer() is called here even when
// m_autoCloseTimer is 0.
void HTMLTokenizer::setAutoClose( bool b ) {
    killTimer( m_autoCloseTimer );
    m_autoCloseTimer = 0;
    if ( b )
        m_autoCloseTimer = startTimer(100);
}

// Ends tokenizing: flushes the last token (unless we are in the middle of a
// tag), frees the token and script buffers and emits finishedParsing().
// If the buffer is already gone only the signal is emitted.
void HTMLTokenizer::end()
{
    if ( buffer == 0 ) {
        emit finishedParsing();
        return;
    }

    // parseTag is using the buffer for different matters
    if ( !tag )
        processToken();

    if(buffer)
        TDEHTML_DELETE_QCHAR_VEC(buffer);

    if(scriptCode)
        TDEHTML_DELETE_QCHAR_VEC(scriptCode);

    scriptCode = 0;
    scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
    buffer = 0;
    emit finishedParsing();
}

// Called when no more data will arrive.
// If the input ended inside an unterminated <title>, <script>, comment or
// server block, the construct is marked as broken, the text collected so far is
// taken out of scriptCode and fed through write() again so that it is parsed
// under the relaxed rules.  Afterwards noMoreData is set and, unless scripts
// are outstanding or the tokenizer is on hold, end() is called.
void HTMLTokenizer::finish()
{
    if ( m_autoCloseTimer ) {
        killTimer( m_autoCloseTimer );
        m_autoCloseTimer = 0;
    }
    // do this as long as we don't find matching comment ends
    while((title || script || comment || server) && scriptCode && scriptCodeSize)
    {
        // we've found an unmatched comment start
        if (comment)
            brokenComments = true;
        else if (server)
            brokenServer = true;
        else if (script)
            brokenScript = true;

        checkScriptBuffer();
        scriptCode[ scriptCodeSize ] = 0;
        scriptCode[ scriptCodeSize + 1 ] = 0;
        int pos;
        // the text that is going to be parsed again
        TQString food;
        if (title || style || script)
            food.setUnicode(scriptCode, scriptCodeSize);
        else if (server) {
            food = "<";
            food += TQString(scriptCode, scriptCodeSize);
        }
        else {
            // comment: re-parse everything after the first '>'
            pos = TQConstString(scriptCode, scriptCodeSize).string().find('>');
            food.setUnicode(scriptCode+pos+1, scriptCodeSize-pos-1); // deep copy
        }
        TDEHTML_DELETE_QCHAR_VEC(scriptCode);
        scriptCode = 0;
        scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
        if (script)
            scriptHandler();

        comment = title = server = script = false;
        if ( !food.isEmpty() )
            write(food, true);
    }
    // this indicates we will not receive any more data... but if we are waiting on
    // an external script to load, we can't finish parsing until that is done
    noMoreData = true;
    if (cachedScript.isEmpty() && !m_executingScript && !onHold)
        end(); // this actually causes us to be deleted
}

void HTMLTokenizer::processToken()
{
    KJSProxy *jsProxy = view ?
view->part()->jScript() : 0L; if (jsProxy) jsProxy->setEventHandlerLineno(tagStartLineno+1); if ( dest > buffer ) { #if 0 if(currToken.tid) { tqDebug( "unexpected token id: %d, str: *%s*", currToken.tid,TQConstString( buffer,dest-buffer ).string().latin1() ); assert(0); } #endif currToken.text = new DOMStringImpl( buffer, dest - buffer ); currToken.text->ref(); currToken.tid = ID_TEXT; } else if(!currToken.tid) { currToken.reset(); if (jsProxy) jsProxy->setEventHandlerLineno(lineno+src.lineCount()+1); return; } dest = buffer; #ifdef TOKEN_DEBUG TQString name = TQString( getTagName(currToken.tid) ); TQString text; if(currToken.text) text = TQConstString(currToken.text->s, currToken.text->l).string(); kdDebug( 6036 ) << "Token --> " << name << " id = " << currToken.tid << endl; if (currToken.flat) kdDebug( 6036 ) << "Token is FLAT!" << endl; if(!text.isNull()) kdDebug( 6036 ) << "text: \"" << text << "\"" << endl; unsigned long l = currToken.attrs ? currToken.attrs->length() : 0; if(l) { kdDebug( 6036 ) << "Attributes: " << l << endl; for (unsigned long i = 0; i < l; ++i) { NodeImpl::Id tid = currToken.attrs->idAt(i); DOMString value = currToken.attrs->valueAt(i); kdDebug( 6036 ) << " " << tid << " " << parser->doc()->getDocument()->getName(NodeImpl::AttributeId, tid).string() << "=\"" << value.string() << "\"" << endl; } } kdDebug( 6036 ) << endl; #endif // In some cases, parseToken() can cause javascript code to be executed // (for example, when setting an attribute that causes an event handler // to be created). 
So we need to protect against re-entrancy into the parser m_executingScript++; // pass the token over to the parser, the parser DOES NOT delete the token parser->parseToken(&currToken); m_executingScript--; if ( currToken.flat && currToken.tid != ID_TEXT && !parser->noSpaces() ) discard = NoneDiscard; currToken.reset(); if (jsProxy) jsProxy->setEventHandlerLineno(1); } HTMLTokenizer::~HTMLTokenizer() { reset(); delete parser; } void HTMLTokenizer::enlargeBuffer(int len) { int newsize = kMax(size*2, size+len); int oldoffs = (dest - buffer); buffer = TDEHTML_REALLOC_QCHAR_VEC(buffer, newsize); dest = buffer + oldoffs; size = newsize; } void HTMLTokenizer::enlargeScriptBuffer(int len) { int newsize = kMax(scriptCodeMaxSize*2, scriptCodeMaxSize+len); scriptCode = TDEHTML_REALLOC_QCHAR_VEC(scriptCode, newsize); scriptCodeMaxSize = newsize; } void HTMLTokenizer::notifyFinished(CachedObject* /*finishedObj*/) { assert(!cachedScript.isEmpty()); bool done = false; while (!done && cachedScript.head()->isLoaded()) { kdDebug( 6036 ) << "Finished loading an external script" << endl; CachedScript* cs = cachedScript.dequeue(); DOMString scriptSource = cs->script(); #ifdef TOKEN_DEBUG kdDebug( 6036 ) << "External script is:" << endl << scriptSource.string() << endl; #endif setSrc(TokenizerString()); // make sure we forget about the script before we execute the new one // infinite recursion might happen otherwise TQString cachedScriptUrl( cs->url().string() ); cs->deref(this); scriptExecution( scriptSource.string(), cachedScriptUrl ); done = cachedScript.isEmpty(); // 'script' is true when we are called synchronously from // scriptHandler(). In that case scriptHandler() will take care // of 'scriptOutput'. if ( !script ) { while (pendingQueue.count() > 1) { TokenizerString t = pendingQueue.pop(); pendingQueue.top().prepend( t ); } if (done) { write(pendingQueue.pop(), false); } // we might be deleted at this point, do not // access any members. 
} } } bool HTMLTokenizer::isWaitingForScripts() const { return cachedScript.count(); } bool HTMLTokenizer::isExecutingScript() const { return (m_executingScript > 0); } void HTMLTokenizer::setSrc(const TokenizerString& source) { lineno += src.lineCount(); src = source; src.resetLineCount(); } void HTMLTokenizer::setOnHold(bool _onHold) { if (onHold == _onHold) return; onHold = _onHold; }