diff options
Diffstat (limited to 'tdehtml/html/htmlparser.cpp')
-rw-r--r-- | tdehtml/html/htmlparser.cpp | 1731 |
1 files changed, 1731 insertions, 0 deletions
diff --git a/tdehtml/html/htmlparser.cpp b/tdehtml/html/htmlparser.cpp new file mode 100644 index 000000000..31d3d4c17 --- /dev/null +++ b/tdehtml/html/htmlparser.cpp @@ -0,0 +1,1731 @@ +/* + This file is part of the KDE libraries + + Copyright (C) 1997 Martin Jones ([email protected]) + (C) 1997 Torben Weis ([email protected]) + (C) 1999,2001 Lars Knoll ([email protected]) + (C) 2000,2001 Dirk Mueller ([email protected]) + (C) 2003 Apple Computer, Inc. + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public License + along with this library; see the file COPYING.LIB. If not, write to + the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + Boston, MA 02110-1301, USA. +*/ +//---------------------------------------------------------------------------- +// +// KDE HTML Widget -- HTML Parser +// #define PARSER_DEBUG + +#include "dom/dom_exception.h" + +#include "html/html_baseimpl.h" +#include "html/html_blockimpl.h" +#include "html/html_documentimpl.h" +#include "html/html_elementimpl.h" +#include "html/html_formimpl.h" +#include "html/html_headimpl.h" +#include "html/html_imageimpl.h" +#include "html/html_inlineimpl.h" +#include "html/html_listimpl.h" +#include "html/html_miscimpl.h" +#include "html/html_tableimpl.h" +#include "html/html_objectimpl.h" +#include "xml/dom_textimpl.h" +#include "xml/dom_nodeimpl.h" +#include "misc/htmlhashes.h" +#include "html/htmltokenizer.h" +#include "tdehtmlview.h" +#include "tdehtml_part.h" +#include "tdehtml_factory.h" +#include "css/cssproperties.h" +#include "css/cssvalues.h" +#include "css/csshelper.h" + +#include "rendering/render_object.h" + +#include "html/htmlparser.h" +#include <kdebug.h> +#include <tdelocale.h> + +using namespace DOM; +using namespace tdehtml; + +//---------------------------------------------------------------------------- + +/** + * @internal + */ +class HTMLStackElem +{ +public: + HTMLStackElem( int _id, + int _level, + DOM::NodeImpl *_node, + bool _inline, + HTMLStackElem * _next ) + : + id(_id), + level(_level), + strayTableContent(false), + m_inline(_inline), + node(_node), + next(_next) + { node->ref(); } + + ~HTMLStackElem() + { node->deref(); } + + void setNode(NodeImpl* newNode) + { + newNode->ref(); + node->deref(); + node = newNode; + } + + int id; + int level; + bool strayTableContent; + bool m_inline; + NodeImpl *node; + HTMLStackElem *next; +}; + +/** + * @internal + * + * The parser parses tokenized input into the document, building up the + * document tree. If the document is wellformed, parsing it is + * straightforward. + * Unfortunately, people can't write wellformed HTML documents, so the parser + * has to be tolerant about errors. + * + * We have to take care of the following error conditions: + * 1. The element being added is explicitly forbidden inside some outer tag. + * In this case we should close all tags up to the one, which forbids + * the element, and add it afterwards. + * 2. We are not allowed to add the element directly. It could be, that + * the person writing the document forgot some tag inbetween (or that the + * tag inbetween is optional...) This could be the case with the following + * tags: HTML HEAD BODY TBODY TR TD LI (did I forget any?) + * 3. We wan't to add a block element inside to an inline element. Close all + * inline elements up to the next higher block element. + * 4. If this doesn't help close elements, until we are allowed to add the + * element or ignore the tag. + * + */ + +TDEHTMLParser::TDEHTMLParser( TDEHTMLView *_parent, DocumentImpl *doc) +{ + //kdDebug( 6035 ) << "parser constructor" << endl; +#if SPEED_DEBUG > 0 + qt.start(); +#endif + + HTMLWidget = _parent; + document = doc; + + blockStack = 0; + current = 0; + + // ID_CLOSE_TAG == Num of tags + forbiddenTag = new ushort[ID_CLOSE_TAG+1]; + + reset(); +} + +TDEHTMLParser::TDEHTMLParser( DOM::DocumentFragmentImpl *i, DocumentImpl *doc ) +{ + HTMLWidget = 0; + document = doc; + + forbiddenTag = new ushort[ID_CLOSE_TAG+1]; + + blockStack = 0; + current = 0; + + reset(); + + setCurrent(i); + + inBody = true; +} + +TDEHTMLParser::~TDEHTMLParser() +{ +#if SPEED_DEBUG > 0 + kdDebug( ) << "TIME: parsing time was = " << qt.elapsed() << endl; +#endif + + freeBlock(); + + if (current) current->deref(); + + delete [] forbiddenTag; + delete isindex; +} + +void TDEHTMLParser::reset() +{ + setCurrent ( document ); + + freeBlock(); + + // before parsing no tags are forbidden... + memset(forbiddenTag, 0, (ID_CLOSE_TAG+1)*sizeof(ushort)); + + inBody = false; + haveFrameSet = false; + haveContent = false; + haveBody = false; + haveTitle = false; + inSelect = false; + inStrayTableContent = 0; + m_inline = false; + + form = 0; + map = 0; + end = false; + isindex = 0; + + discard_until = 0; +} + +void TDEHTMLParser::parseToken(Token *t) +{ + if (t->tid > 2*ID_CLOSE_TAG) + { + kdDebug( 6035 ) << "Unknown tag!! tagID = " << t->tid << endl; + return; + } + if(discard_until) { + if(t->tid == discard_until) + discard_until = 0; + + // do not skip </iframe> + if ( discard_until || current->id() + ID_CLOSE_TAG != t->tid ) + return; + } + +#ifdef PARSER_DEBUG + kdDebug( 6035 ) << "\n\n==> parser: processing token " << getTagName(t->tid) << "(" << t->tid << ")" + << " current = " << getTagName(current->id()) << "(" << current->id() << ")" << endl; + kdDebug(6035) << "inline=" << m_inline << " inBody=" << inBody << " haveFrameSet=" << haveFrameSet << " haveContent=" << haveContent << endl; +#endif + + // holy shit. apparently some sites use </br> instead of <br> + // be compatible with IE and NS + if(t->tid == ID_BR+ID_CLOSE_TAG && document->inCompatMode()) + t->tid -= ID_CLOSE_TAG; + + if(t->tid > ID_CLOSE_TAG) + { + processCloseTag(t); + return; + } + + // ignore spaces, if we're not inside a paragraph or other inline code + if( t->tid == ID_TEXT && t->text ) { + if(inBody && !skipMode() && + current->id() != ID_STYLE && current->id() != ID_TITLE && + current->id() != ID_SCRIPT && + !t->text->containsOnlyWhitespace()) haveContent = true; +#ifdef PARSER_DEBUG + kdDebug(6035) << "length="<< t->text->l << " text='" << TQConstString(t->text->s, t->text->l).string() << "'" << endl; +#endif + } + + NodeImpl *n = getElement(t); + // just to be sure, and to catch currently unimplemented stuff + if(!n) + return; + + // set attributes + if(n->isElementNode() && t->tid != ID_ISINDEX) + { + ElementImpl *e = static_cast<ElementImpl *>(n); + e->setAttributeMap(t->attrs); + + // take care of optional close tags + if(endTag[e->id()] == DOM::OPTIONAL) + popBlock(t->tid); + } + + // if this tag is forbidden inside the current context, pop + // blocks until we are allowed to add it... + while(blockStack && forbiddenTag[t->tid]) { +#ifdef PARSER_DEBUG + kdDebug( 6035 ) << "t->id: " << t->tid << " is forbidden :-( " << endl; +#endif + popOneBlock(); + } + + // sometimes flat doesn't make sense + switch(t->tid) { + case ID_SELECT: + case ID_OPTION: + t->flat = false; + } + + // the tokenizer needs the feedback for space discarding + if ( tagPriority[t->tid] == 0 ) + t->flat = true; + + if ( !insertNode(n, t->flat) ) { + // we couldn't insert the node... +#ifdef PARSER_DEBUG + kdDebug( 6035 ) << "insertNode failed current=" << current->id() << ", new=" << n->id() << "!" << endl; +#endif + if (map == n) + { +#ifdef PARSER_DEBUG + kdDebug( 6035 ) << " --> resetting map!" << endl; +#endif + map = 0; + } + if (form == n) + { +#ifdef PARSER_DEBUG + kdDebug( 6035 ) << " --> resetting form!" << endl; +#endif + form = 0; + } + delete n; + } +} + +static bool isTableRelatedTag(int id) +{ + return (id == ID_TR || id == ID_TD || id == ID_TABLE || id == ID_TBODY || id == ID_TFOOT || id == ID_THEAD || + id == ID_TH); +} + +bool TDEHTMLParser::insertNode(NodeImpl *n, bool flat) +{ + int id = n->id(); + + // let's be stupid and just try to insert it. + // this should work if the document is wellformed +#ifdef PARSER_DEBUG + NodeImpl *tmp = current; +#endif + NodeImpl *newNode = current->addChild(n); + if ( newNode ) { +#ifdef PARSER_DEBUG + kdDebug( 6035 ) << "added " << n->nodeName().string() << " to " << tmp->nodeName().string() << ", new current=" << newNode->nodeName().string() << endl; +#endif + // We allow TABLE > FORM in dtd.cpp, but do not allow the form have children in this case + if (current->id() == ID_TABLE && id == ID_FORM) { + flat = true; + static_cast<HTMLFormElementImpl*>(n)->setMalformed(true); + } + + // don't push elements without end tag on the stack + if(tagPriority[id] != 0 && !flat) { +#if SPEED_DEBUG < 2 + if(!n->attached() && HTMLWidget ) + n->attach(); +#endif + if(n->isInline()) m_inline = true; + pushBlock(id, tagPriority[id]); + setCurrent( newNode ); + } else { +#if SPEED_DEBUG < 2 + if(!n->attached() && HTMLWidget) + n->attach(); + if (n->maintainsState()) { + document->registerMaintainsState(n); + TQString state(document->nextState()); + if (!state.isNull()) n->restoreState(state); + } + n->close(); +#endif + if(n->isInline()) m_inline = true; + } + + +#if SPEED_DEBUG < 1 + if(tagPriority[id] == 0 && n->renderer()) + n->renderer()->calcMinMaxWidth(); +#endif + return true; + } else { +#ifdef PARSER_DEBUG + kdDebug( 6035 ) << "ADDING NODE FAILED!!!! current = " << current->nodeName().string() << ", new = " << n->nodeName().string() << endl; +#endif + // error handling... + HTMLElementImpl *e; + bool handled = false; + + // first switch on current element for elements with optional end-tag and inline-only content + switch(current->id()) + { + case ID_P: + case ID_DT: + if(!n->isInline()) + { + popBlock(current->id()); + return insertNode(n); + } + break; + default: + break; + } + + // switch according to the element to insert + switch(id) + { + case ID_TR: + case ID_TH: + case ID_TD: + if (inStrayTableContent && !isTableRelatedTag(current->id())) { + // pop out to the nearest enclosing table-related tag. + while (blockStack && !isTableRelatedTag(current->id())) + popOneBlock(); + return insertNode(n); + } + break; + case ID_COMMENT: + break; + case ID_HEAD: + // ### allow not having <HTML> in at all, as per HTML spec + if (!current->isDocumentNode() && current->id() != ID_HTML ) + return false; + break; + case ID_META: + case ID_LINK: + case ID_ISINDEX: + case ID_BASE: + if( !head ) + createHead(); + if( head ) { + if ( head->addChild(n) ) { +#if SPEED_DEBUG < 2 + if(!n->attached() && HTMLWidget) + n->attach(); +#endif + } + + return true; + } + + break; + case ID_HTML: + if (!current->isDocumentNode() ) { + if ( doc()->firstChild()->id() == ID_HTML) { + // we have another <HTML> element.... apply attributes to existing one + // make sure we don't overwrite already existing attributes + NamedAttrMapImpl *map = static_cast<ElementImpl*>(n)->attributes(true); + NamedAttrMapImpl *bmap = static_cast<ElementImpl*>(doc()->firstChild())->attributes(false); + bool changed = false; + for (unsigned long l = 0; map && l < map->length(); ++l) { + NodeImpl::Id attrId = map->idAt(l); + DOMStringImpl *attrValue = map->valueAt(l); + changed = !bmap->getValue(attrId); + bmap->setValue(attrId,attrValue); + } + if ( changed ) + doc()->recalcStyle( NodeImpl::Inherit ); + } + return false; + } + break; + case ID_TITLE: + case ID_STYLE: + if ( !head ) + createHead(); + if ( head ) { + DOM::NodeImpl *newNode = head->addChild(n); + if ( newNode ) { + pushBlock(id, tagPriority[id]); + setCurrent ( newNode ); +#if SPEED_DEBUG < 2 + if(!n->attached() && HTMLWidget) + n->attach(); +#endif + } else { +#ifdef PARSER_DEBUG + kdDebug( 6035 ) << "adding style before to body failed!!!!" << endl; +#endif + discard_until = ID_STYLE + ID_CLOSE_TAG; + return false; + } + return true; + } else if(inBody) { + discard_until = id + ID_CLOSE_TAG; + return false; + } + break; + case ID_SCRIPT: + // if we failed to insert it, go into skip mode + discard_until = id + ID_CLOSE_TAG; + break; + case ID_BODY: + if(inBody && doc()->body()) { + // we have another <BODY> element.... apply attributes to existing one + // make sure we don't overwrite already existing attributes + // some sites use <body bgcolor=rightcolor>...<body bgcolor=wrongcolor> + NamedAttrMapImpl *map = static_cast<ElementImpl*>(n)->attributes(true); + NamedAttrMapImpl *bmap = doc()->body()->attributes(false); + bool changed = false; + for (unsigned long l = 0; map && l < map->length(); ++l) { + NodeImpl::Id attrId = map->idAt(l); + DOMStringImpl *attrValue = map->valueAt(l); + if ( !bmap->getValue(attrId) ) { + bmap->setValue(attrId,attrValue); + changed = true; + } + } + if ( changed ) + doc()->recalcStyle( NodeImpl::Inherit ); + } else if ( current->isDocumentNode() ) + break; + return false; + break; + + // the following is a hack to move non rendered elements + // outside of tables. + // needed for broken constructs like <table><form ...><tr>.... + case ID_INPUT: + { + ElementImpl *e = static_cast<ElementImpl *>(n); + DOMString type = e->getAttribute(ATTR_TYPE); + + if ( strcasecmp( type, "hidden" ) != 0 ) + break; + // Fall through! + } + case ID_TEXT: + { + // Don't try to fit random white-space anywhere + TextImpl *t = static_cast<TextImpl *>(n); + if (t->containsOnlyWhitespace()) + return false; + // ignore text inside the following elements. + switch(current->id()) + { + case ID_SELECT: + return false; + default: + ; + // fall through!! + }; + break; + } + case ID_DL: + popBlock( ID_DT ); + if ( current->id() == ID_DL ) { + e = new HTMLGenericElementImpl( document, ID_DD ); + insertNode( e ); + handled = true; + } + break; + case ID_DT: + e = new HTMLDListElementImpl(document); + if ( insertNode(e) ) { + insertNode(n); + return true; + } + break; + case ID_AREA: + { + if(map) + { + map->addChild(n); +#if SPEED_DEBUG < 2 + if(!n->attached() && HTMLWidget) + n->attach(); +#endif + handled = true; + return true; + } + else + return false; + } + case ID_CAPTION: { + switch (current->id()) { + case ID_THEAD: + case ID_TBODY: + case ID_TFOOT: + case ID_TR: + case ID_TH: + case ID_TD: { + NodeImpl* tsection = current; + if (current->id() == ID_TR) + tsection = current->parent(); + else if (current->id() == ID_TD || current->id() == ID_TH) + tsection = current->parent()->parent(); + NodeImpl* table = tsection->parent(); + int exceptioncode = 0; + table->insertBefore(n, tsection, exceptioncode); + pushBlock(id, tagPriority[id]); + setCurrent(n); + inStrayTableContent++; + blockStack->strayTableContent = true; + return true; + } + default: + break; + } + break; + } + + case ID_THEAD: + case ID_TBODY: + case ID_TFOOT: + case ID_COLGROUP: { + if (isTableRelatedTag(current->id())) { + while (blockStack && current->id() != ID_TABLE && isTableRelatedTag(current->id())) + popOneBlock(); + return insertNode(n); + } + } + default: + break; + } + + // switch on the currently active element + switch(current->id()) + { + case ID_HTML: + switch(id) + { + case ID_SCRIPT: + case ID_STYLE: + case ID_META: + case ID_LINK: + case ID_OBJECT: + case ID_EMBED: + case ID_TITLE: + case ID_ISINDEX: + case ID_BASE: + if(!head) { + head = new HTMLHeadElementImpl(document); + insertNode(head.get()); + handled = true; + } + break; + case ID_TEXT: { + TextImpl *t = static_cast<TextImpl *>(n); + if (t->containsOnlyWhitespace()) + return false; + /* Fall through to default */ + } + default: + if ( haveFrameSet ) break; + e = new HTMLBodyElementImpl(document); + startBody(); + insertNode(e); + handled = true; + break; + } + break; + case ID_HEAD: + // we can get here only if the element is not allowed in head. + if (id == ID_HTML) + return false; + else { + // This means the body starts here... + if ( haveFrameSet ) break; + popBlock(ID_HEAD); + e = new HTMLBodyElementImpl(document); + startBody(); + insertNode(e); + handled = true; + } + break; + case ID_BODY: + break; + case ID_CAPTION: + // Illegal content in a caption. Close the caption and try again. + popBlock(ID_CAPTION); + switch( id ) { + case ID_THEAD: + case ID_TFOOT: + case ID_TBODY: + case ID_TR: + case ID_TD: + case ID_TH: + return insertNode(n, flat); + } + break; + case ID_TABLE: + case ID_THEAD: + case ID_TFOOT: + case ID_TBODY: + case ID_TR: + switch(id) + { + case ID_TABLE: + popBlock(ID_TABLE); // end the table + handled = checkChild( current->id(), id, !doc()->inCompatMode()); + break; + default: + { + NodeImpl *node = current; + NodeImpl *parent = node->parentNode(); + // A script may have removed the current node's parent from the DOM + // http://bugzilla.opendarwin.org/show_bug.cgi?id=7137 + // FIXME: we should do real recovery here and re-parent with the correct node. + if (!parent) + return false; + NodeImpl *parentparent = parent->parentNode(); + + if (n->isTextNode() || + ( node->id() == ID_TR && + ( parent->id() == ID_THEAD || + parent->id() == ID_TBODY || + parent->id() == ID_TFOOT ) && parentparent->id() == ID_TABLE ) || + ( !checkChild( ID_TR, id ) && ( node->id() == ID_THEAD || node->id() == ID_TBODY || node->id() == ID_TFOOT ) && + parent->id() == ID_TABLE ) ) + { + node = (node->id() == ID_TABLE) ? node : + ((node->id() == ID_TR ) ? parentparent : parent); + NodeImpl *parent = node->parentNode(); + if (!parent) + return false; + int exceptioncode = 0; +#ifdef PARSER_DEBUG + kdDebug( 6035 ) << "calling insertBefore(" << n->nodeName().string() << "," << node->nodeName().string() << ")" << endl; +#endif + parent->insertBefore(n, node, exceptioncode); + if (exceptioncode) { +#ifndef PARSER_DEBUG + if (!n->isTextNode()) +#endif + kdDebug(6035) << "adding content before table failed.." << endl; + break; + } + if ( n->isElementNode() && tagPriority[id] != 0 && + !flat && endTag[id] != DOM::FORBIDDEN ) { + + pushBlock(id, tagPriority[id]); + setCurrent ( n ); + inStrayTableContent++; + blockStack->strayTableContent = true; + } + return true; + } + + if ( current->id() == ID_TR ) + e = new HTMLTableCellElementImpl(document, ID_TD); + else if ( current->id() == ID_TABLE ) + e = new HTMLTableSectionElementImpl( document, ID_TBODY, true /* implicit */ ); + else + e = new HTMLTableRowElementImpl( document ); + + insertNode(e); + handled = true; + break; + } // end default + } // end switch + break; + case ID_OBJECT: + discard_until = id + ID_CLOSE_TAG; + return false; + case ID_UL: + case ID_OL: + case ID_DIR: + case ID_MENU: + e = new HTMLLIElementImpl(document); + e->addCSSProperty(CSS_PROP_LIST_STYLE_TYPE, CSS_VAL_NONE); + insertNode(e); + handled = true; + break; + case ID_DL: + popBlock(ID_DL); + handled = true; + break; + case ID_DT: + popBlock(ID_DT); + handled = true; + break; + case ID_FORM: + popBlock(ID_FORM); + handled = true; + break; + case ID_SELECT: + if( n->isInline() ) + return false; + break; + case ID_P: + case ID_H1: + case ID_H2: + case ID_H3: + case ID_H4: + case ID_H5: + case ID_H6: + if(!n->isInline()) + { + popBlock(current->id()); + handled = true; + } + break; + case ID_OPTION: + case ID_OPTGROUP: + if (id == ID_OPTGROUP) + { + popBlock(current->id()); + handled = true; + } + else if(id == ID_SELECT) + { + // IE treats a nested select as </select>. Let's do the same + popBlock( ID_SELECT ); + break; + } + break; + // head elements in the body should be ignored. + + case ID_ADDRESS: + case ID_COLGROUP: + case ID_FONT: + popBlock(current->id()); + handled = true; + break; + default: + if(current->isDocumentNode()) + { + if(current->firstChild() == 0) { + e = new HTMLHtmlElementImpl(document); + insertNode(e); + handled = true; + } + } + else if(current->isInline()) + { + popInlineBlocks(); + handled = true; + } + } + + // if we couldn't handle the error, just rethrow the exception... + if(!handled) + { + //kdDebug( 6035 ) << "Exception handler failed in HTMLPArser::insertNode()" << endl; + return false; + } + + return insertNode(n); + } +} + + +NodeImpl *TDEHTMLParser::getElement(Token* t) +{ + NodeImpl *n = 0; + + switch(t->tid) + { + case ID_HTML: + n = new HTMLHtmlElementImpl(document); + break; + case ID_HEAD: + if(!head && current->id() == ID_HTML) { + head = new HTMLHeadElementImpl(document); + n = head.get(); + } + break; + case ID_BODY: + // body no longer allowed if we have a frameset + if(haveFrameSet) break; + popBlock(ID_HEAD); + n = new HTMLBodyElementImpl(document); + haveBody = true; + startBody(); + break; + +// head elements + case ID_BASE: + n = new HTMLBaseElementImpl(document); + break; + case ID_LINK: + n = new HTMLLinkElementImpl(document); + break; + case ID_META: + n = new HTMLMetaElementImpl(document); + break; + case ID_STYLE: + n = new HTMLStyleElementImpl(document); + break; + case ID_TITLE: + // only one non-empty <title> allowed + if (haveTitle) { + discard_until = ID_TITLE+ID_CLOSE_TAG; + break; + } + n = new HTMLTitleElementImpl(document); + // we'll set haveTitle when closing the tag + break; + +// frames + case ID_FRAME: + n = new HTMLFrameElementImpl(document); + break; + case ID_FRAMESET: + popBlock(ID_HEAD); + if ( inBody && !haveFrameSet && !haveContent && !haveBody) { + popBlock( ID_BODY ); + // ### actually for IE document.body returns the now hidden "body" element + // we can't implement that behavior now because it could cause too many + // regressions and the headaches are not worth the work as long as there is + // no site actually relying on that detail (Dirk) + if (static_cast<HTMLDocumentImpl*>(document)->body()) + static_cast<HTMLDocumentImpl*>(document)->body() + ->addCSSProperty(CSS_PROP_DISPLAY, CSS_VAL_NONE); + inBody = false; + } + if ( (haveBody || haveContent || haveFrameSet) && current->id() == ID_HTML) + break; + n = new HTMLFrameSetElementImpl(document); + haveFrameSet = true; + startBody(); + break; + // a bit a special case, since the frame is inlined... + case ID_IFRAME: + n = new HTMLIFrameElementImpl(document); + if (!t->flat) discard_until = ID_IFRAME+ID_CLOSE_TAG; + break; + +// form elements + case ID_FORM: + // thou shall not nest <form> - NS/IE quirk + if (form) break; + n = form = new HTMLFormElementImpl(document, false); + break; + case ID_BUTTON: + n = new HTMLButtonElementImpl(document, form); + break; + case ID_FIELDSET: + n = new HTMLFieldSetElementImpl(document, form); + break; + case ID_INPUT: + if ( t->attrs && + TDEHTMLFactory::defaultHTMLSettings()->isAdFilterEnabled() && + TDEHTMLFactory::defaultHTMLSettings()->isHideAdsEnabled() && + !strcasecmp( t->attrs->getValue( ATTR_TYPE ), "image" ) ) + { + if (TDEHTMLFactory::defaultHTMLSettings()->isAdFiltered( doc()->completeURL( tdehtml::parseURL(t->attrs->getValue(ATTR_SRC)).string() ) )) + return 0; + } + n = new HTMLInputElementImpl(document, form); + break; + case ID_ISINDEX: + n = handleIsindex(t); + if( !inBody ) { + isindex = n; + n = 0; + } else + t->flat = true; + break; + case ID_KEYGEN: + n = new HTMLKeygenElementImpl(document, form); + break; + case ID_LABEL: + n = new HTMLLabelElementImpl(document); + break; + case ID_LEGEND: + n = new HTMLLegendElementImpl(document, form); + break; + case ID_OPTGROUP: + n = new HTMLOptGroupElementImpl(document, form); + break; + case ID_OPTION: + n = new HTMLOptionElementImpl(document, form); + break; + case ID_SELECT: + inSelect = true; + n = new HTMLSelectElementImpl(document, form); + break; + case ID_TEXTAREA: + n = new HTMLTextAreaElementImpl(document, form); + break; + +// lists + case ID_DL: + n = new HTMLDListElementImpl(document); + break; + case ID_DD: + n = new HTMLGenericElementImpl(document, t->tid); + popBlock(ID_DT); + popBlock(ID_DD); + break; + case ID_DT: + n = new HTMLGenericElementImpl(document, t->tid); + popBlock(ID_DD); + popBlock(ID_DT); + break; + case ID_UL: + { + n = new HTMLUListElementImpl(document); + break; + } + case ID_OL: + { + n = new HTMLOListElementImpl(document); + break; + } + case ID_DIR: + n = new HTMLDirectoryElementImpl(document); + break; + case ID_MENU: + n = new HTMLMenuElementImpl(document); + break; + case ID_LI: + popBlock(ID_LI); + n = new HTMLLIElementImpl(document); + break; +// formatting elements (block) + case ID_BLOCKQUOTE: + n = new HTMLGenericElementImpl(document, t->tid); + break; + case ID_LAYER: + case ID_ILAYER: + n = new HTMLLayerElementImpl(document, t->tid); + break; + case ID_P: + case ID_DIV: + n = new HTMLDivElementImpl(document, t->tid); + break; + case ID_H1: + case ID_H2: + case ID_H3: + case ID_H4: + case ID_H5: + case ID_H6: + n = new HTMLGenericElementImpl(document, t->tid); + break; + case ID_HR: + n = new HTMLHRElementImpl(document); + break; + case ID_PRE: + case ID_XMP: + case ID_PLAINTEXT: + n = new HTMLPreElementImpl(document, t->tid); + break; + +// font stuff + case ID_BASEFONT: + n = new HTMLBaseFontElementImpl(document); + break; + case ID_FONT: + n = new HTMLFontElementImpl(document); + break; + +// ins/del + case ID_DEL: + case ID_INS: + n = new HTMLGenericElementImpl(document, t->tid); + break; + +// anchor + case ID_A: + popBlock(ID_A); + + n = new HTMLAnchorElementImpl(document); + break; + +// images + case ID_IMG: + if (t->attrs&& + TDEHTMLFactory::defaultHTMLSettings()->isAdFilterEnabled()&& + TDEHTMLFactory::defaultHTMLSettings()->isHideAdsEnabled()) + { + TQString url = doc()->completeURL( tdehtml::parseURL(t->attrs->getValue(ATTR_SRC)).string() ); + if (TDEHTMLFactory::defaultHTMLSettings()->isAdFiltered(url)) + return 0; + } + n = new HTMLImageElementImpl(document, form); + break; + + case ID_MAP: + map = new HTMLMapElementImpl(document); + n = map; + break; + case ID_AREA: + n = new HTMLAreaElementImpl(document); + break; + +// objects, applets and scripts + case ID_APPLET: + n = new HTMLAppletElementImpl(document); + break; + case ID_EMBED: + n = new HTMLEmbedElementImpl(document); + break; + case ID_OBJECT: + n = new HTMLObjectElementImpl(document); + break; + case ID_PARAM: + n = new HTMLParamElementImpl(document); + break; + case ID_SCRIPT: + { + HTMLScriptElementImpl *scriptElement = new HTMLScriptElementImpl(document); + scriptElement->setCreatedByParser(true); + n = scriptElement; + break; + } + +// tables + case ID_TABLE: + n = new HTMLTableElementImpl(document); + break; + case ID_CAPTION: + n = new HTMLTableCaptionElementImpl(document); + break; + case ID_COLGROUP: + case ID_COL: + n = new HTMLTableColElementImpl(document, t->tid); + break; + case ID_TR: + popBlock(ID_TR); + n = new HTMLTableRowElementImpl(document); + break; + case ID_TD: + case ID_TH: + popBlock(ID_TH); + popBlock(ID_TD); + n = new HTMLTableCellElementImpl(document, t->tid); + break; + case ID_TBODY: + case ID_THEAD: + case ID_TFOOT: + popBlock( ID_THEAD ); + popBlock( ID_TBODY ); + popBlock( ID_TFOOT ); + n = new HTMLTableSectionElementImpl(document, t->tid, false); + break; + +// inline elements + case ID_BR: + n = new HTMLBRElementImpl(document); + break; + case ID_Q: + n = new HTMLGenericElementImpl(document, t->tid); + break; + +// elements with no special representation in the DOM + +// block: + case ID_ADDRESS: + case ID_CENTER: + n = new HTMLGenericElementImpl(document, t->tid); + break; +// inline + // %fontstyle + case ID_TT: + case ID_U: + case ID_B: + case ID_I: + case ID_S: + case ID_STRIKE: + case ID_BIG: + case ID_SMALL: + + // %phrase + case ID_EM: + case ID_STRONG: + case ID_DFN: + case ID_CODE: + case ID_SAMP: + case ID_KBD: + case ID_VAR: + case ID_CITE: + case ID_ABBR: + case ID_ACRONYM: + + // %special + case ID_SUB: + case ID_SUP: + case ID_SPAN: + case ID_WBR: + case ID_NOBR: + if ( t->tid == ID_NOBR || t->tid == ID_WBR ) + popBlock( t->tid ); + case ID_BDO: + n = new HTMLGenericElementImpl(document, t->tid); + break; + + // these are special, and normally not rendered + case ID_NOEMBED: + if (!t->flat) { + n = new HTMLGenericElementImpl(document, t->tid); + discard_until = ID_NOEMBED + ID_CLOSE_TAG; + } + return n; + case ID_NOFRAMES: + if (!t->flat) { + n = new HTMLGenericElementImpl(document, t->tid); + discard_until = ID_NOFRAMES + ID_CLOSE_TAG; + } + return n; + case ID_NOSCRIPT: + if (!t->flat) { + n = new HTMLGenericElementImpl(document, t->tid); + if(HTMLWidget && HTMLWidget->part()->jScriptEnabled()) + discard_until = ID_NOSCRIPT + ID_CLOSE_TAG; + } + return n; + case ID_NOLAYER: +// discard_until = ID_NOLAYER + ID_CLOSE_TAG; + return 0; + break; + case ID_MARQUEE: + n = new HTMLMarqueeElementImpl(document); + break; +// text + case ID_TEXT: +// kdDebug(6035) << "ID_TEXT: \"" << DOMString(t->text).string() << "\"" << endl; + n = new TextImpl(document, t->text); + break; + case ID_COMMENT: +#ifdef COMMENTS_IN_DOM + n = new CommentImpl(document, t->text); +#endif + break; + default: + kdDebug( 6035 ) << "Unknown tag " << t->tid << "!" << endl; + } + return n; +} + +void TDEHTMLParser::processCloseTag(Token *t) +{ + // support for really broken html. Can't believe I'm supporting such crap (lars) + switch(t->tid) + { + case ID_HTML+ID_CLOSE_TAG: + case ID_BODY+ID_CLOSE_TAG: + // we never trust those close tags, since stupid webpages close + // them prematurely + return; + case ID_FORM+ID_CLOSE_TAG: + form = 0; + // this one is to get the right style on the body element + break; + case ID_MAP+ID_CLOSE_TAG: + map = 0; + break; + case ID_SELECT+ID_CLOSE_TAG: + inSelect = false; + break; + case ID_TITLE+ID_CLOSE_TAG: + // Set haveTitle only if <title> isn't empty + if ( current->firstChild() ) + haveTitle = true; + break; + default: + break; + } + +#ifdef PARSER_DEBUG + kdDebug( 6035 ) << "added the following children to " << current->nodeName().string() << endl; + NodeImpl *child = current->firstChild(); + while(child != 0) + { + kdDebug( 6035 ) << " " << child->nodeName().string() << endl; + child = child->nextSibling(); + } +#endif + popBlock(t->tid-ID_CLOSE_TAG); +#ifdef PARSER_DEBUG + kdDebug( 6035 ) << "closeTag --> current = " << current->nodeName().string() << endl; +#endif +} + +bool TDEHTMLParser::isResidualStyleTag(int _id) +{ + switch (_id) { + case ID_A: + case ID_FONT: + case ID_TT: + case ID_U: + case ID_B: + case ID_I: + case ID_S: + case ID_STRIKE: + case ID_BIG: + case ID_SMALL: + case ID_EM: + case ID_STRONG: + case ID_DFN: + case ID_CODE: + case ID_SAMP: + case ID_KBD: + case ID_VAR: + case ID_DEL: + case ID_INS: + case ID_WBR: + case ID_NOBR: + return true; + default: + return false; + } +} + +bool TDEHTMLParser::isAffectedByResidualStyle(int _id) +{ + if (isResidualStyleTag(_id)) + return true; + + switch (_id) { + case ID_P: + case ID_DIV: + case ID_BLOCKQUOTE: + case ID_ADDRESS: + case ID_H1: + case ID_H2: + case ID_H3: + case ID_H4: + case ID_H5: + case ID_H6: + case ID_CENTER: + case ID_UL: + case ID_OL: + case ID_LI: + case ID_DL: + case ID_DT: + case ID_DD: + case ID_PRE: + return true; + default: + return false; + } +} + +void TDEHTMLParser::handleResidualStyleCloseTagAcrossBlocks(HTMLStackElem* elem) +{ + // Find the element that crosses over to a higher level. + // ### For now, if there is more than one, we will only make sure we close the residual style. + int exceptionCode = 0; + HTMLStackElem* curr = blockStack; + HTMLStackElem* maxElem = 0; + HTMLStackElem* endElem = 0; + HTMLStackElem* prev = 0; + HTMLStackElem* prevMaxElem = 0; + bool advancedResidual = false; // ### if set we only close the residual style + while (curr && curr != elem) { + if (curr->level > elem->level) { + if (!isAffectedByResidualStyle(curr->id)) return; + if (maxElem) advancedResidual = true; + else + endElem = curr; + maxElem = curr; + prevMaxElem = prev; + } + + prev = curr; + curr = curr->next; + } + + if (!curr || !maxElem ) return; + + NodeImpl* residualElem = prev->node; + NodeImpl* blockElem = prevMaxElem ? prevMaxElem->node : current; + NodeImpl* parentElem = elem->node; + + // Check to see if the reparenting that is going to occur is allowed according to the DOM. + // FIXME: We should either always allow it or perform an additional fixup instead of + // just bailing here. + // Example: <p><font><center>blah</font></center></p> isn't doing a fixup right now. + if (!parentElem->childAllowed(blockElem)) + return; + + if (maxElem->node->parentNode() != elem->node && !advancedResidual) { + // Walk the stack and remove any elements that aren't residual style tags. These + // are basically just being closed up. Example: + // <font><span>Moo<p>Goo</font></p>. + // In the above example, the <span> doesn't need to be reopened. It can just close. + HTMLStackElem* currElem = maxElem->next; + HTMLStackElem* prevElem = maxElem; + while (currElem != elem) { + HTMLStackElem* nextElem = currElem->next; + if (!isResidualStyleTag(currElem->id)) { + prevElem->next = nextElem; + prevElem->setNode(currElem->node); + delete currElem; + } + else + prevElem = currElem; + currElem = nextElem; + } + + // We have to reopen residual tags in between maxElem and elem. An example of this case s: + // <font><i>Moo<p>Foo</font>. + // In this case, we need to transform the part before the <p> into: + // <font><i>Moo</i></font><i> + // so that the <i> will remain open. This involves the modification of elements + // in the block stack. + // This will also affect how we ultimately reparent the block, since we want it to end up + // under the reopened residual tags (e.g., the <i> in the above example.) + NodeImpl* prevNode = 0; + NodeImpl* currNode = 0; + currElem = maxElem; + while (currElem->node != residualElem) { + if (isResidualStyleTag(currElem->node->id())) { + // Create a clone of this element. + currNode = currElem->node->cloneNode(false); + currElem->node->close(); + removeForbidden(currElem->id, forbiddenTag); + + // Change the stack element's node to point to the clone. + currElem->setNode(currNode); + + // Attach the previous node as a child of this new node. + if (prevNode) + currNode->appendChild(prevNode, exceptionCode); + else // The new parent for the block element is going to be the innermost clone. + parentElem = currNode; + + prevNode = currNode; + } + + currElem = currElem->next; + } + + // Now append the chain of new residual style elements if one exists. + if (prevNode) + elem->node->appendChild(prevNode, exceptionCode); + } + + // We need to make a clone of |residualElem| and place it just inside |blockElem|. + // All content of |blockElem| is reparented to be under this clone. We then + // reparent |blockElem| using real DOM calls so that attachment/detachment will + // be performed to fix up the rendering tree. + // So for this example: <b>...<p>Foo</b>Goo</p> + // The end result will be: <b>...</b><p><b>Foo</b>Goo</p> + // + // Step 1: Remove |blockElem| from its parent, doing a batch detach of all the kids. + SharedPtr<NodeImpl> guard(blockElem); + blockElem->parentNode()->removeChild(blockElem, exceptionCode); + + if (!advancedResidual) { + // Step 2: Clone |residualElem|. + NodeImpl* newNode = residualElem->cloneNode(false); // Shallow clone. We don't pick up the same kids. + + // Step 3: Place |blockElem|'s children under |newNode|. Remove all of the children of |blockElem| + // before we've put |newElem| into the document. That way we'll only do one attachment of all + // the new content (instead of a bunch of individual attachments). + NodeImpl* currNode = blockElem->firstChild(); + while (currNode) { + NodeImpl* nextNode = currNode->nextSibling(); + SharedPtr<NodeImpl> guard(currNode); //Protect from deletion while moving + blockElem->removeChild(currNode, exceptionCode); + newNode->appendChild(currNode, exceptionCode); + currNode = nextNode; + + // TODO - To be replaced. + // Re-register form elements with currently active form, step 1 will have removed them + if (form && currNode && currNode->isGenericFormElement()) + { + HTMLGenericFormElementImpl *e = static_cast<HTMLGenericFormElementImpl *>(currNode); + form->registerFormElement(e); + } + } + + // Step 4: Place |newNode| under |blockElem|. |blockElem| is still out of the document, so no + // attachment can occur yet. + blockElem->appendChild(newNode, exceptionCode); + } + + // Step 5: Reparent |blockElem|. Now the full attachment of the fixed up tree takes place. + parentElem->appendChild(blockElem, exceptionCode); + + // Step 6: Elide |elem|, since it is effectively no longer open. Also update + // the node associated with the previous stack element so that when it gets popped, + // it doesn't make the residual element the next current node. + HTMLStackElem* currElem = maxElem; + HTMLStackElem* prevElem = 0; + while (currElem != elem) { + prevElem = currElem; + currElem = currElem->next; + } + prevElem->next = elem->next; + prevElem->setNode(elem->node); + delete elem; + + // Step 7: Reopen intermediate inlines, e.g., <b><p><i>Foo</b>Goo</p>. + // In the above example, Goo should stay italic. + curr = blockStack; + HTMLStackElem* residualStyleStack = 0; + while (curr && curr != endElem) { + // We will actually schedule this tag for reopening + // after we complete the close of this entire block. + NodeImpl* currNode = current; + if (isResidualStyleTag(curr->id)) { + // We've overloaded the use of stack elements and are just reusing the + // struct with a slightly different meaning to the variables. Instead of chaining + // from innermost to outermost, we build up a list of all the tags we need to reopen + // from the outermost to the innermost, i.e., residualStyleStack will end up pointing + // to the outermost tag we need to reopen. + // We also set curr->node to be the actual element that corresponds to the ID stored in + // curr->id rather than the node that you should pop to when the element gets pulled off + // the stack. + popOneBlock(false); + curr->setNode(currNode); + curr->next = residualStyleStack; + residualStyleStack = curr; + } + else + popOneBlock(); + + curr = blockStack; + } + + reopenResidualStyleTags(residualStyleStack, 0); // FIXME: Deal with stray table content some day + // if it becomes necessary to do so. +} + +void TDEHTMLParser::reopenResidualStyleTags(HTMLStackElem* elem, DOM::NodeImpl* malformedTableParent) +{ + // Loop for each tag that needs to be reopened. + while (elem) { + // Create a shallow clone of the DOM node for this element. + NodeImpl* newNode = elem->node->cloneNode(false); + + // Append the new node. In the malformed table case, we need to insert before the table, + // which will be the last child. + int exceptionCode = 0; + if (malformedTableParent) + malformedTableParent->insertBefore(newNode, malformedTableParent->lastChild(), exceptionCode); + else + current->appendChild(newNode, exceptionCode); + // FIXME: Is it really OK to ignore the exceptions here? + + // Now push a new stack element for this node we just created. + pushBlock(elem->id, elem->level); + + // Set our strayTableContent boolean if needed, so that the reopened tag also knows + // that it is inside a malformed table. + blockStack->strayTableContent = malformedTableParent != 0; + if (blockStack->strayTableContent) + inStrayTableContent++; + + // Clear our malformed table parent variable. + malformedTableParent = 0; + + // Update |current| manually to point to the new node. + setCurrent(newNode); + + // Advance to the next tag that needs to be reopened. + HTMLStackElem* next = elem->next; + delete elem; + elem = next; + } +} + +void TDEHTMLParser::pushBlock(int _id, int _level) +{ + HTMLStackElem *Elem = new HTMLStackElem(_id, _level, current, m_inline, blockStack); + + blockStack = Elem; + addForbidden(_id, forbiddenTag); +} + +void TDEHTMLParser::popBlock( int _id ) +{ + HTMLStackElem *Elem = blockStack; + int maxLevel = 0; + +#ifdef PARSER_DEBUG + kdDebug( 6035 ) << "popBlock(" << getTagName(_id) << ")" << endl; + while(Elem) { + kdDebug( 6035) << " > " << getTagName(Elem->id) << endl; + Elem = Elem->next; + } + Elem = blockStack; +#endif + + while( Elem && (Elem->id != _id)) + { + if (maxLevel < Elem->level) + { + maxLevel = Elem->level; + } + Elem = Elem->next; + } + if (!Elem) + return; + + if (maxLevel > Elem->level) { + // We didn't match because the tag is in a different scope, e.g., + // <b><p>Foo</b>. Try to correct the problem. + if (!isResidualStyleTag(_id)) + return; + return handleResidualStyleCloseTagAcrossBlocks(Elem); + } + + bool isAffectedByStyle = isAffectedByResidualStyle(Elem->id); + HTMLStackElem* residualStyleStack = 0; + NodeImpl* malformedTableParent = 0; + + Elem = blockStack; + + while (Elem) + { + if (Elem->id == _id) + { + int strayTable = inStrayTableContent; + popOneBlock(); + Elem = 0; + + // This element was the root of some malformed content just inside an implicit or + // explicit <tbody> or <tr>. + // If we end up needing to reopen residual style tags, the root of the reopened chain + // must also know that it is the root of malformed content inside a <tbody>/<tr>. + if (strayTable && (inStrayTableContent < strayTable) && residualStyleStack) { + NodeImpl* curr = current; + while (curr && curr->id() != ID_TABLE) + curr = curr->parentNode(); + malformedTableParent = curr ? curr->parentNode() : 0; + } + } + else + { + // Schedule this tag for reopening + // after we complete the close of this entire block. + NodeImpl* currNode = current; + if (isAffectedByStyle && isResidualStyleTag(Elem->id)) { + // We've overloaded the use of stack elements and are just reusing the + // struct with a slightly different meaning to the variables. Instead of chaining + // from innermost to outermost, we build up a list of all the tags we need to reopen + // from the outermost to the innermost, i.e., residualStyleStack will end up pointing + // to the outermost tag we need to reopen. + // We also set Elem->node to be the actual element that corresponds to the ID stored in + // Elem->id rather than the node that you should pop to when the element gets pulled off + // the stack. + popOneBlock(false); + Elem->next = residualStyleStack; + Elem->setNode(currNode); + residualStyleStack = Elem; + } + else + popOneBlock(); + Elem = blockStack; + } + } + + reopenResidualStyleTags(residualStyleStack, malformedTableParent); +} + +void TDEHTMLParser::popOneBlock(bool delBlock) +{ + HTMLStackElem *Elem = blockStack; + + // we should never get here, but some bad html might cause it. +#ifndef PARSER_DEBUG + if(!Elem) return; +#else + kdDebug( 6035 ) << "popping block: " << getTagName(Elem->id) << "(" << Elem->id << ")" << endl; +#endif + +#if SPEED_DEBUG < 1 + if((Elem->node != current)) { + if (current->maintainsState() && document){ + document->registerMaintainsState(current); + TQString state(document->nextState()); + if (!state.isNull()) current->restoreState(state); + } + current->close(); + } +#endif + + removeForbidden(Elem->id, forbiddenTag); + + blockStack = Elem->next; + // we only set inline to false, if the element we close is a block level element. + // This helps getting cases as <p><b>bla</b> <b>bla</b> right. + + m_inline = Elem->m_inline; + + if (current->id() == ID_FORM && form && inStrayTableContent) + form->setMalformed(true); + + setCurrent( Elem->node ); + + if (Elem->strayTableContent) + inStrayTableContent--; + + if (delBlock) + delete Elem; +} + +void TDEHTMLParser::popInlineBlocks() +{ + while(blockStack && current->isInline() && current->id() != ID_FONT) + popOneBlock(); +} + +void TDEHTMLParser::freeBlock() +{ + while (blockStack) + popOneBlock(); + blockStack = 0; +} + +void TDEHTMLParser::createHead() +{ + if(head || !doc()->firstChild()) + return; + + head = new HTMLHeadElementImpl(document); + HTMLElementImpl *body = doc()->body(); + int exceptioncode = 0; + doc()->firstChild()->insertBefore(head.get(), body, exceptioncode); + if ( exceptioncode ) { +#ifdef PARSER_DEBUG + kdDebug( 6035 ) << "creation of head failed!!!!" << endl; +#endif + delete head.get(); + head = 0; + } +} + +NodeImpl *TDEHTMLParser::handleIsindex( Token *t ) +{ + NodeImpl *n; + HTMLFormElementImpl *myform = form; + if ( !myform ) { + myform = new HTMLFormElementImpl(document, true); + n = myform; + } else + n = new HTMLDivElementImpl( document, ID_DIV ); + NodeImpl *child = new HTMLHRElementImpl( document ); + n->addChild( child ); + DOMStringImpl* a = t->attrs ? t->attrs->getValue(ATTR_PROMPT) : 0; + DOMString text = i18n("This is a searchable index. Enter search keywords: "); + if (a) + text = a; + child = new TextImpl(document, text.implementation()); + n->addChild( child ); + child = new HTMLIsIndexElementImpl(document, myform); + static_cast<ElementImpl *>(child)->setAttribute(ATTR_TYPE, "tdehtml_isindex"); + n->addChild( child ); + child = new HTMLHRElementImpl( document ); + n->addChild( child ); + + return n; +} + +void TDEHTMLParser::startBody() +{ + if(inBody) return; + + inBody = true; + + if( isindex ) { + insertNode( isindex, true /* don't decend into this node */ ); + isindex = 0; + } +} |