summary | refs | log | tree | commit | diff | stats
path: root/klinkstatus/src/parser/htmlparser.h
diff options
context:
space:
mode:
Diffstat (limited to 'klinkstatus/src/parser/htmlparser.h')
-rw-r--r-- klinkstatus/src/parser/htmlparser.h 124
1 files changed, 124 insertions, 0 deletions
diff --git a/klinkstatus/src/parser/htmlparser.h b/klinkstatus/src/parser/htmlparser.h
new file mode 100644
index 00000000..cf487ebf
--- /dev/null
+++ b/klinkstatus/src/parser/htmlparser.h
@@ -0,0 +1,124 @@
+ /***************************************************************************
+ * Copyright (C) 2004 by Paulo Moura Guedes *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ ***************************************************************************/
+
+#ifndef HTML_PARSER_H
+#define HTML_PARSER_H
+
+#include <qstring.h>
+
+#include <vector>
+
+
+#include "mstring.h"
+#include "node.h"
+
+#include <iostream>
+
+using namespace std; // NOTE(review): header-scope using-directive, visible to every includer; the unqualified vector uses below depend on it
+
+typedef unsigned int uint;
+
+// HtmlParser is only declared here; apart from the three inline members at the end of this header, its definitions
+// are not visible in this file. Judging by the member names it scans an HTML document for link-bearing elements - TODO confirm.
+class HtmlParser
+{
+public:
+
+ HtmlParser();
+ HtmlParser(QString const& documento); // "documento" is Portuguese for "document"; presumably the HTML text to parse - TODO confirm
+ ~HtmlParser();
+ // Read-only accessors; the has*() members presumably report whether the matching node was found - verify in the implementation.
+ vector<Node*> const& nodes() const;
+ bool hasBaseUrl() const;
+ bool hasTitle() const;
+ bool hasContentType() const;
+ NodeBASE const& baseUrl() const;
+ NodeTITLE const& title() const;
+ NodeMETA const& contentTypeMetaNode() const;
+ // Link-count estimate ("estimativa" is Portuguese for "estimate"); the inline definition below returns doc_size / 100.
+ static uint estimativaLinks(uint doc_size);
+ /**
+ * Convenience function for performance, as it only parses in order
+ * to get the charset.
+ */
+ static QString findCharsetInMetaElement(QString const& html);
+
+ // Test helper ("mostra" is Portuguese for "show"); what it outputs is not visible in this header.
+ void mostra() const;
+
+private:
+
+ vector<QString> const& parseNodesOfType(QString const& element); // returns a reference, so the vector must outlive the call (possibly aux_ - TODO confirm)
+ /**
+ * The nodes vector is passed in by reference for performance.
+ */
+ static void parseNodesOfType(QString const& element, QString const& doc, vector<QString>& nodes);
+ // One parse member per element type; their definitions are not part of this header.
+ void parseNodesOfTypeA();
+ void parseNodesOfTypeAREA();
+ void parseNodesOfTypeLINK();
+ void parseNodesOfTypeMETA();
+ void parseNodesOfTypeIMG();
+ void parseNodesOfTypeFRAME();
+ void parseNodesOfTypeIFRAME();
+ void parseNodesOfTypeBASE();
+ void parseNodesOfTypeTITLE();
+ // Presumably remove comments / script content from document_ (cf. comments_ and script_ below) - TODO confirm.
+ void stripComments();
+ void stripScriptContent();
+
+ /**
+ Return the index just past the character that ends the tag (one past the closing end_of_tag).
+ e.g.
+ endOfTag("<img src=\"bad > luck\">") => 22 (not 15): the '>' inside the quoted value does not count.
+ */
+ static int endOfTag(QString const& s, int index = 0, QChar end_of_tag = '>');
+
+private:
+ // State; except for is_content_type_set_ (read by hasContentType()), how these are filled is not visible in this header.
+ vector<QString> aux_; // TODO(review): purpose unclear (the original author's note said so too); possibly the storage behind parseNodesOfType(element)'s returned reference
+ vector<Node*> nodes_; // presumably what nodes() returns; whether this class owns the Node objects is not visible here (the destructor is empty)
+ NodeBASE node_BASE_;
+ NodeTITLE node_TITLE_;
+ NodeMETA node_META_content_type_;
+ bool is_content_type_set_; // returned by hasContentType()
+
+ QString document_;
+ QString script_; // kept stored here (JavaScript, etc.)
+ QString comments_;
+};
+
+
+inline HtmlParser::~HtmlParser()
+{
+ // Empty body: nothing is released here; the only content is a disabled debug trace: kdDebug(23100) << "*";
+}
+
+inline uint HtmlParser::estimativaLinks(uint doc_size)
+{
+ return doc_size / 100u; // estimated value: fixed divisor, unsigned integer division (sizes below 100 give 0)
+}
+
+inline bool HtmlParser::hasContentType() const
+{
+ return this->is_content_type_set_; // plain flag read; where the flag gets set is not visible in this header
+}
+
+#endif