diff options
Diffstat (limited to 'klinkstatus/src/parser')
-rw-r--r-- | klinkstatus/src/parser/Makefile.am | 6 | ||||
-rw-r--r-- | klinkstatus/src/parser/htmlparser.cpp | 455 | ||||
-rw-r--r-- | klinkstatus/src/parser/htmlparser.h | 124 | ||||
-rw-r--r-- | klinkstatus/src/parser/http.cpp | 87 | ||||
-rw-r--r-- | klinkstatus/src/parser/http.h | 79 | ||||
-rw-r--r-- | klinkstatus/src/parser/mstring.cpp | 278 | ||||
-rw-r--r-- | klinkstatus/src/parser/mstring.h | 174 | ||||
-rw-r--r-- | klinkstatus/src/parser/node.cpp | 255 | ||||
-rw-r--r-- | klinkstatus/src/parser/node.h | 279 | ||||
-rw-r--r-- | klinkstatus/src/parser/node_impl.h | 412 | ||||
-rw-r--r-- | klinkstatus/src/parser/url.cpp | 350 | ||||
-rw-r--r-- | klinkstatus/src/parser/url.h | 57 |
12 files changed, 2556 insertions, 0 deletions
diff --git a/klinkstatus/src/parser/Makefile.am b/klinkstatus/src/parser/Makefile.am new file mode 100644 index 00000000..b99146c1 --- /dev/null +++ b/klinkstatus/src/parser/Makefile.am @@ -0,0 +1,6 @@ +INCLUDES = $(all_includes) +METASOURCES = AUTO +noinst_HEADERS = htmlparser.h http.h mstring.h node.h node_impl.h url.h +libparser_la_LDFLAGS = $(all_libraries) +noinst_LTLIBRARIES = libparser.la +libparser_la_SOURCES = htmlparser.cpp http.cpp mstring.cpp node.cpp url.cpp diff --git a/klinkstatus/src/parser/htmlparser.cpp b/klinkstatus/src/parser/htmlparser.cpp new file mode 100644 index 00000000..6bc93761 --- /dev/null +++ b/klinkstatus/src/parser/htmlparser.cpp @@ -0,0 +1,455 @@ +/*************************************************************************** + * Copyright (C) 2004 by Paulo Moura Guedes * + * [email protected] * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* + ***************************************************************************/ + +#include "htmlparser.h" + +#include <kapplication.h> +#include <kdebug.h> + + +HtmlParser::HtmlParser(QString const& documento) + : is_content_type_set_(false), document_(documento) +{ + Q_ASSERT(!documento.isEmpty()); + + stripScriptContent(); + stripComments(); // after removing the script because comments in scripts have diferent sintaxe + + nodes_.reserve(estimativaLinks(documento.length() * 2)); // à confiança ;) + + parseNodesOfTypeA(); + parseNodesOfTypeAREA(); + parseNodesOfTypeLINK(); + parseNodesOfTypeMETA(); + parseNodesOfTypeIMG(); + parseNodesOfTypeFRAME(); + parseNodesOfTypeIFRAME(); + parseNodesOfTypeBASE(); + parseNodesOfTypeTITLE(); +} + +bool HtmlParser::hasBaseUrl() const +{ + return (node_BASE_.element() == Node::BASE && + !node_BASE_.url().isEmpty()); +} + +NodeBASE const& HtmlParser::baseUrl() const +{ + Q_ASSERT(hasBaseUrl()); + return node_BASE_; +} + +NodeMETA const& HtmlParser::contentTypeMetaNode() const +{ + Q_ASSERT(hasContentType()); + return node_META_content_type_; +} + +bool HtmlParser::hasTitle() const +{ + return (node_TITLE_.element() == Node::TITLE && + !node_TITLE_.attributeTITLE().isEmpty()); +} + +NodeTITLE const& HtmlParser::title() const +{ + Q_ASSERT(hasTitle()); + return node_TITLE_; +} + +vector<QString> const& HtmlParser::parseNodesOfType(QString const& element) +{ + HtmlParser::parseNodesOfType(element, document_, aux_); + return aux_; +} + +void HtmlParser::parseNodesOfType(QString const& tipo, QString const& document, vector<QString>& nodes) +{ + QString node; + QString doc(document); + int inicio = 0, fim = 0; + + nodes.clear(); + if(upperCase(tipo) == "A") + nodes.reserve(estimativaLinks(doc.length() * 2)); + + while(true) + { + inicio = findSeparableWord(doc, "<" + tipo); + if(inicio == -1) + return; + + //if( (doc[inicio] != ' ' && doc[inicio] != '\n' && doc[inicio] != '\r') ) + if(!::isSpace(doc[inicio])) + { + doc.remove(0, 
QString("<" + tipo).length()); + continue; + } + + if(upperCase(tipo) == "A") + fim = findWord(doc, "</A>", inicio); + else + { + //fim = findChar(doc, '>', inicio + 1); + fim = endOfTag(doc, inicio, '>'); + } + + if(fim == -1) + { + doc.remove(0, 1); + continue; + } + + int tag_begining_go_back = (tipo.length() + QString("<").length()); + node = doc.mid(inicio - tag_begining_go_back, + fim - inicio + tag_begining_go_back); + nodes.push_back(node); + doc.remove(0, fim); + } +} + +int HtmlParser::endOfTag(QString const& s, int index, QChar end_of_tag) +{ + if( (uint)index >= s.length() ) + return -1; + + int _end_of_tag = s.find(end_of_tag, index); + if(_end_of_tag == -1) + return _end_of_tag; + + int open_aspas = s.find('"', index); + if(open_aspas == -1) + return _end_of_tag + 1; + + else if(_end_of_tag < open_aspas) + return _end_of_tag + 1; + + else if( ((uint)open_aspas + 1) >= s.length() - 1 ) + return -1; + + else + { + int close_aspas = s.find('"', open_aspas + 1); + if(close_aspas != -1) + return endOfTag(s, close_aspas + 1, end_of_tag); + else + { + kdDebug(23100) << "Mismatched quotes (\"): " << s.mid(index, _end_of_tag - index) << endl; + //return -1; + return _end_of_tag + 1; + } + } +} + +vector<Node*> const& HtmlParser::nodes() const +{ + return nodes_; +} + + +void HtmlParser::parseNodesOfTypeA() +{ + vector<QString> const& aux = parseNodesOfType("A"); + + for(vector<QString>::size_type i = 0; i != aux.size(); ++i) + { + nodes_.push_back( new NodeA(aux[i]) ); + } +} + +void HtmlParser::parseNodesOfTypeAREA() +{ + vector<QString> const& aux = parseNodesOfType("AREA"); + + for(vector<QString>::size_type i = 0; i != aux.size(); ++i) + { + nodes_.push_back( new NodeAREA(aux[i]) ); + } +} + +void HtmlParser::parseNodesOfTypeLINK() +{ + vector<QString> const& aux = parseNodesOfType("LINK"); + + for(vector<QString>::size_type i = 0; i != aux.size(); ++i) + nodes_.push_back( new NodeLINK(aux[i]) ); +} + +void HtmlParser::parseNodesOfTypeMETA() +{ + 
vector<QString> const& aux = parseNodesOfType("META"); + + for(vector<QString>::size_type i = 0; i != aux.size(); ++i) + { + NodeMETA* node = new NodeMETA(aux[i]); + nodes_.push_back(node); + + if(!is_content_type_set_ && node->atributoHTTP_EQUIV().lower() == QString("Content-Type").lower()) { + is_content_type_set_ = true; + node_META_content_type_.setNode(aux[i]); + } + } +} + +QString HtmlParser::findCharsetInMetaElement(QString const& html) +{ + vector<QString> metaTags; + parseNodesOfType("META", html, metaTags); + + for(vector<QString>::size_type i = 0; i != metaTags.size(); ++i) + { + NodeMETA node(metaTags[i]); + + if(node.atributoHTTP_EQUIV().lower() == QString("Content-Type").lower()) { + return node.charset(); + } + } + return QString(); +} + +void HtmlParser::parseNodesOfTypeIMG() +{ + vector<QString> const& aux = parseNodesOfType("IMG"); + + for(vector<QString>::size_type i = 0; i != aux.size(); ++i) + nodes_.push_back( new NodeIMG(aux[i]) ); +} + +void HtmlParser::parseNodesOfTypeFRAME() +{ + vector<QString> const& aux = parseNodesOfType("FRAME"); + + for(vector<QString>::size_type i = 0; i != aux.size(); ++i) + nodes_.push_back( new NodeFRAME(aux[i]) ); +} + +void HtmlParser::parseNodesOfTypeIFRAME() +{ + vector<QString> const& aux = parseNodesOfType("IFRAME"); + + for(vector<QString>::size_type i = 0; i != aux.size(); ++i) + nodes_.push_back( new NodeFRAME(aux[i]) ); +} + +void HtmlParser::parseNodesOfTypeBASE() +{ + QString node; + QString doc = document_; + int inicio = 0, fim = 0; + + inicio = findSeparableWord(doc, "<BASE"); + if(inicio == -1 || !doc[inicio].isSpace()) + return; + + fim = doc.find(">", inicio); + if(fim == -1) + return; + + node = doc.mid(inicio, fim-inicio); + node_BASE_.setNode(node); +} + +void HtmlParser::parseNodesOfTypeTITLE() +{ + QString node; + QString doc = document_; + int inicio = 0, fim = 0; + + inicio = findSeparableWord(doc, "<TITLE>"); + if(inicio == -1) + return; + + fim = findSeparableWord(doc, "</TITLE>", 
inicio); + if(fim == -1) + return; + + node = doc.mid(inicio, fim-inicio); + + node_TITLE_.setNode(node); +} + + +void HtmlParser::stripComments() +{ + QString begin_comment = "<!--"; + QString end_comment = "-->"; + uint const begin_comment_length = begin_comment.length(); + + int inicio = -1; + do + { + inicio = findWord(document_, begin_comment); + if(inicio != -1) + { + int fim = findWord(document_, end_comment, inicio); + if(fim == -1) + { + kdDebug(23100) << "End of comment is missing!" << endl; + document_.remove(inicio - begin_comment_length, begin_comment_length); + } + else + { + comments_ += "\n" + document_.mid(inicio - begin_comment_length, + fim - inicio + begin_comment_length); + document_.remove(inicio - begin_comment_length, fim - inicio + begin_comment_length); + } + } + } + while(inicio != -1); +} + +void HtmlParser::stripScriptContent() +{ + int inicio = -1; + QString const begin_script = "<script"; + QString const end_script = "</script>"; + uint const begin_script_length = begin_script.length(); + + do + { + inicio = findWord(document_, begin_script); + if(inicio != -1) + { + int fim = findWord(document_, end_script, inicio); + + if(fim == -1) + { + kdDebug(23100) << "Malformed script tag!" 
<< endl; + document_.remove(inicio - begin_script_length, begin_script_length); + } + else + { + script_ += "\n" + document_.mid(inicio - begin_script_length, + fim - inicio + begin_script_length); + + document_.remove(inicio - begin_script_length, + fim - inicio + begin_script_length); + } + } + } + while(inicio != -1); +} + + + + +#include <iostream> +void HtmlParser::mostra() const +{ + kdDebug(23100) << "\nA:\n\n"; + for(unsigned int i = 0; i != nodes_.size(); ++i) + { + if(nodes_[i]->element() == Node::A) + kdDebug(23100) << nodes_[i]->url() << "\t" << nodes_[i]->linkLabel() << endl; + } + kdDebug(23100) << "____________________________________________________________________" << endl; + + kdDebug(23100) << "\nLINK:\n\n"; + for(unsigned int i = 0; i != nodes_.size(); ++i) + { + if(nodes_[i]->element() == Node::LINK) + kdDebug(23100) << nodes_[i]->url() << "\t" << nodes_[i]->linkLabel() << endl; + } + kdDebug(23100) << "____________________________________________________________________" << endl; + + kdDebug(23100) << "\nMETA:\n"; + for(unsigned int i = 0; i != nodes_.size(); ++i) + { + if(nodes_[i]->element() == Node::META) + { +#if defined Q_WS_WIN + NodeMETA* nm = (NodeMETA*)nodes_[i]; +#else + + NodeMETA* nm = dynamic_cast<NodeMETA*>(nodes_[i]); +#endif + + kdDebug(23100) << nm->url() << endl + << nm->atributoHTTP_EQUIV() << endl + << nm->atributoNAME() << endl + << nm->atributoCONTENT() << endl; + } + } + kdDebug(23100) << "____________________________________________________________________" << endl; + + kdDebug(23100) << "\nIMG:\n\n"; + for(unsigned int i = 0; i != nodes_.size(); ++i) + { + if(nodes_[i]->element() == Node::IMG) + kdDebug(23100) << nodes_[i]->url() << "\t" + << nodes_[i]->linkLabel() << endl; + } + kdDebug(23100) << "____________________________________________________________________" << endl; + + kdDebug(23100) << "\nFRAME:\n\n"; + for(unsigned int i = 0; i != nodes_.size(); ++i) + { + if(nodes_[i]->element() == Node::FRAME) + 
kdDebug(23100) << nodes_[i]->url() << endl; + } + kdDebug(23100) << "____________________________________________________________________" << endl; + + kdDebug(23100) << "\nBASE:\n\n"; + kdDebug(23100) << node_BASE_.url() << endl; + + kdDebug(23100) << "____________________________________________________________________" << endl; + +} + +#ifdef HTMLPARSER + +#include <fstream> + +int main() +{ + //ifstream stream("aterraprometida.html"); + //ifstream stream("/var/www/html/STL/standard_library.html"); + //ifstream stream("/var/www/html/qt-doc/functions.html"); + ifstream stream("/var/www/html/index.html"); + + QString content; + while(stream) + { + char c; + stream.get(c); + content += c; + } + // kdDebug(23100) << content << endl; + kdDebug(23100) << "__________________________________________________________" << endl; + HtmlParser parser(content); + parser.mostra(); + kdDebug(23100) << "__________________________________________________________\n\n\n" << endl; + vector<Node*> nods = parser.nodes(); + for(int i = 0; i != nods.size(); ++i) + { + if(nods[i]->element() == Node::META) + { + NodeMETA* nod_meta = (NodeMETA*)(nods[i]); + //Node* nod_meta = nods[i]; + + kdDebug(23100) << nod_meta->atributoCONTENT() << endl; + } + + } +} + + +#endif diff --git a/klinkstatus/src/parser/htmlparser.h b/klinkstatus/src/parser/htmlparser.h new file mode 100644 index 00000000..cf487ebf --- /dev/null +++ b/klinkstatus/src/parser/htmlparser.h @@ -0,0 +1,124 @@ + /*************************************************************************** + * Copyright (C) 2004 by Paulo Moura Guedes * + * [email protected] * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. 
* + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * + ***************************************************************************/ + +#ifndef HTML_PARSER_H +#define HTML_PARSER_H + +#include <qstring.h> + +#include <vector> + + +#include "mstring.h" +#include "node.h" + +#include <iostream> + +using namespace std; + +typedef unsigned int uint; + + + +class HtmlParser +{ +public: + + HtmlParser(); + HtmlParser(QString const& documento); + ~HtmlParser(); + + vector<Node*> const& nodes() const; + bool hasBaseUrl() const; + bool hasTitle() const; + bool hasContentType() const; + NodeBASE const& baseUrl() const; + NodeTITLE const& title() const; + NodeMETA const& contentTypeMetaNode() const; + + static uint estimativaLinks(uint doc_size); + /** + * Convenience function for performance as it only parse in order + * to get the charset. + */ + static QString findCharsetInMetaElement(QString const& html); + + // test: + void mostra() const; + +private: + + vector<QString> const& parseNodesOfType(QString const& element); + /** + * Vector nodes passed for performance. 
+ */ + static void parseNodesOfType(QString const& element, QString const& doc, vector<QString>& nodes); + + void parseNodesOfTypeA(); + void parseNodesOfTypeAREA(); + void parseNodesOfTypeLINK(); + void parseNodesOfTypeMETA(); + void parseNodesOfTypeIMG(); + void parseNodesOfTypeFRAME(); + void parseNodesOfTypeIFRAME(); + void parseNodesOfTypeBASE(); + void parseNodesOfTypeTITLE(); + + void stripComments(); + void stripScriptContent(); + + /** + Return the index of the next character of the end of tag. + e.g. + endOfTag("<img src=\"bad > luck\">") => 22 (not 15) + */ + static int endOfTag(QString const& s, int index = 0, QChar end_of_tag = '>'); + +private: + + vector<QString> aux_; // for what the hell is this? looks ugly... maybe I was drunk, can't remember + vector<Node*> nodes_; + NodeBASE node_BASE_; + NodeTITLE node_TITLE_; + NodeMETA node_META_content_type_; + bool is_content_type_set_; + + QString document_; + QString script_; // Fica aqui guardado (JavaScript, etc) + QString comments_; +}; + + +inline HtmlParser::~HtmlParser() +{ + //kdDebug(23100) << "*"; +} + +inline uint HtmlParser::estimativaLinks(uint doc_size) +{ + return doc_size / 100; // valor estimado... +} + +inline bool HtmlParser::hasContentType() const +{ + return is_content_type_set_; +} + +#endif diff --git a/klinkstatus/src/parser/http.cpp b/klinkstatus/src/parser/http.cpp new file mode 100644 index 00000000..1133c937 --- /dev/null +++ b/klinkstatus/src/parser/http.cpp @@ -0,0 +1,87 @@ +/*************************************************************************** + * Copyright (C) 2004 by Paulo Moura Guedes * + * [email protected] * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. 
* + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * + ***************************************************************************/ + +#include "http.h" +#include "mstring.h" + +#include <kdebug.h> + +#include <iostream> + + + +void HttpResponseHeader::parseLocation() +{ + QString cabecalho(toString()); + + int location = findWord(cabecalho, "Location: "); + Q_ASSERT(location != -1); + + int fim_de_linha_1 = cabecalho.find('\n', location); + int fim_de_linha_2 = cabecalho.find('\r', location); + + Q_ASSERT(fim_de_linha_1 != -1 || fim_de_linha_2 != -1); + + int fim_de_linha; + + if(fim_de_linha_1 == -1 && fim_de_linha_2 != -1) + fim_de_linha = fim_de_linha_2; + + else if(fim_de_linha_2 == -1 && fim_de_linha_1 != -1) + fim_de_linha = fim_de_linha_1; + + else if(fim_de_linha_1 < fim_de_linha_2) + fim_de_linha = fim_de_linha_1; + + else fim_de_linha = fim_de_linha_2; + + location_ = cabecalho.mid(location, fim_de_linha - location); +} + +QString HttpResponseHeader::charset() const +{ + return HttpResponseHeader::charset(value("content-type")); +} + +QString HttpResponseHeader::charset(QString const& contentTypeHttpHeaderLine) +{ + QString _charset; + + if(contentTypeHttpHeaderLine.isEmpty()) + return _charset; + + int index = contentTypeHttpHeaderLine.find("charset="); + if(index != -1) + index += QString("charset=").length(); + else { + index = contentTypeHttpHeaderLine.find("charset:"); + if(index != -1) + index += QString("charset:").length(); + } + + if(index != -1) { + _charset = contentTypeHttpHeaderLine.mid(index); + 
_charset = _charset.stripWhiteSpace(); + } + +// kdDebug(23100) << "Charset: |" << _charset << "|" << endl; + return _charset; + +} diff --git a/klinkstatus/src/parser/http.h b/klinkstatus/src/parser/http.h new file mode 100644 index 00000000..5878cfd1 --- /dev/null +++ b/klinkstatus/src/parser/http.h @@ -0,0 +1,79 @@ +/*************************************************************************** + * Copyright (C) 2004 by Paulo Moura Guedes * + * [email protected] * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * + ***************************************************************************/ + +#ifndef HTTP_H +#define HTTP_H + +#include <qhttp.h> +#include <qstring.h> + + +class HttpResponseHeader: public QHttpResponseHeader +{ +public: + + HttpResponseHeader(); + HttpResponseHeader(const QHttpResponseHeader & header); + HttpResponseHeader(QString const& str); + virtual ~HttpResponseHeader(); + + void parseLocation(); + QString const& location() const; + QString charset() const; + + /** + * Parses the charset from this kind of server response: + * Content-Type: text/html; charset=EUC-JP + * Return an empty string in case it doesn't find nothing. 
+ */ + static QString charset(QString const& contentTypeHttpHeaderLine); + +private: + + QString location_; +}; + + +inline HttpResponseHeader::HttpResponseHeader() + : QHttpResponseHeader() +{ +} + +inline HttpResponseHeader::HttpResponseHeader(const QHttpResponseHeader & /*header*/) + : QHttpResponseHeader() +{ +} + +inline HttpResponseHeader::HttpResponseHeader(QString const& str) + : QHttpResponseHeader() +{ + parse(str); +} + +inline HttpResponseHeader::~HttpResponseHeader() +{ +} + +inline QString const& HttpResponseHeader::location() const +{ + return location_; +} + +#endif diff --git a/klinkstatus/src/parser/mstring.cpp b/klinkstatus/src/parser/mstring.cpp new file mode 100644 index 00000000..114d6dc6 --- /dev/null +++ b/klinkstatus/src/parser/mstring.cpp @@ -0,0 +1,278 @@ +/*************************************************************************** + * Copyright (C) 2004 by Paulo Moura Guedes * + * [email protected] * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* + ***************************************************************************/ + +#include "mstring.h" + +#include <iostream> + + +using namespace std; + + +int findWord(QString const& s, QString const& palavra, uint a_partir_do_indice) +{ + int indice = s.find(palavra, a_partir_do_indice, false); + + if(indice == -1) + return indice; + else + return (indice + palavra.length()); +} + +int findChar(QString const& s, QChar letra, uint a_partir_do_indice) +{ + int index = s.find(letra, a_partir_do_indice, false); + if(index == -1) + return index; + else + return index + 1; +} + +/** + The string palavra, must not have any spaces. +*/ +int findSeparableWord(QString const& s_, QString const& palavra, uint a_partir_do_indice) +{ + bool encontrou = true; + QString s(s_); + uint indice_palavra = 0; + int indice = a_partir_do_indice; + + do + { + encontrou = true; + indice_palavra = 0; + + indice = findChar(s, (palavra[indice_palavra++]), indice ); + + if(indice == -1) + { + return indice; + } + --indice; + + while(encontrou && indice_palavra != palavra.length() && indice < (int)s.length()) + { + indice = nextNonSpaceChar(s, indice); + + if(indice == -1) + return indice; + + // Nao se incrementa o indice porque isso j��feito com a fun�o nextNonSpaceChar + encontrou = encontrou && !(notEqual(s[indice], palavra[indice_palavra++]) ); + + } + } + while(!encontrou && indice < (int)s.length()); + + if(encontrou && indice < (int)s.length()) + return ++indice; + else + return -1; +} + +int nextNonSpaceChar(QString const& s, uint i) +{ + ++i; + // while( (s[i] == ' ' || s[i] == '\t' || s[i] == '\r' || s[i] == '\n') + while(isSpace(s[i]) + && i < s.length() ) + ++i; + + if(i < s.length()) + return i; + else + return -1; +} + + +/** + e.g. 
+ nextSpaceChar("o biltre") => 1 +*/ +int nextSpaceChar(QString const& s, uint i) +{ + //while( (s[i] != ' ' && s[i] != '\r' && s[i] != '\n' && s[i] != '\t') && + //i < s.size() ) + while(!isSpace(s[i]) && + i < s.length() ) + ++i; + + if(i < s.length()) + return i; + else + return -1; +} + +int nextCharDifferentThan(QChar c, QString const& s, uint i) +{ + while(i < s.length() && s[i] == c) + ++i; + + if(i != s.length()) + return i; + else + return -1; +} + +vector<QString> tokenize(QString s) +{ + Q_ASSERT(!s.isEmpty()); + vector<QString> v; + + while(true) + { + int inicio = 0; + //if(s[0] == ' ' || s[0] == '\t' || s[0] == '\r' || s[0] == '\n') + if(isSpace(s[0])) + inicio = nextNonSpaceChar(s, 0); + if(inicio == -1) + return v; + + int fim = nextSpaceChar(s, inicio); + if(fim == -1) + { + v.push_back(s.mid(inicio)); + return v; + } + else + { + QString palavra = s.mid(inicio, fim - inicio); + v.push_back(palavra); + s.remove(0, fim); + } + } +} + +vector<QString> tokenizeWordsSeparatedByDots(QString s) +{ + vector<QString> v; + + while(true) + { + int inicio = 0; + if(s[0] == '.') + inicio = nextCharDifferentThan(QChar('.'), s, 0); + if(inicio == -1) + return v; + + int fim = s.find('.', inicio); + if(fim == -1) + { + v.push_back(s.mid(inicio)); + return v; + } + else + { + QString palavra = s.mid(inicio, fim - inicio); + v.push_back(palavra); + s.remove(0, fim); + } + } +} + +vector<QString> tokenizeWordsSeparatedBy(QString s, QChar criteria) +{ + vector<QString> v; + + while(true) + { + int inicio = 0; + if(s[0] == criteria) + inicio = nextCharDifferentThan(criteria, s, 0); + if(inicio == -1) + return v; + + int fim = s.find(criteria, inicio); + if(fim == -1) + { + v.push_back(s.mid(inicio)); + return v; + } + else + { + QString palavra = s.mid(inicio, fim - inicio); + v.push_back(palavra); + s.remove(0, fim); + } + } +} + + + +#ifdef STRING +//c++ -g -o teste_string mstring.cpp -DSTRING +#include <fstream> + +int main(int argc, char* argv[]) +{ + string s; + 
s = "S"; + s = "Afazer"; + s = "O MeU S sdadsadd "; + s = "www.trolltech.com/search/qt-interest/bla bla%20Bla"; + s = "...http://w.ww..go.o.gle.p.t......."; + + /* + ifstream stream("testeparser.html"); + string content; + while(stream) { + char c; + stream.get(c); + content += c; + } + */ + // kdDebug(23100) << simplifyWhiteSpace(content) << endl; + kdDebug(23100) << simplifyWhiteSpace(s) << endl; + + /* + vector<string> v(tokenize(s)); + for(int i = 0; i != v.size(); ++i) + kdDebug(23100) << v[i] << endl; + */ + + /* + int i = nextSpaceChar(s, 0); + i = nextNonSpaceChar(s, i); + kdDebug(23100) << s.substr(i) << endl; + */ + + + vector<string> v(tokenizeWordsSeparatedByDots(s)); + for(int i = 0; i != v.size(); ++i) + kdDebug(23100) << v[i] << endl; + + removeLastCharIfExists(s, '/'); + kdDebug(23100) << s << endl; + + /* + kdDebug(23100) << findChar(s, 'T') << endl; + kdDebug(23100) << findWord(s, "trolltech") << endl; + kdDebug(23100) << findWord(s, "TROLLTECH") << endl; + kdDebug(23100) << findWord(s, "TROLLTECH", 2) << endl; + */ + /* + stripWhiteSpace(s); + kdDebug(23100) << s << endl; + */ +} + + +#endif diff --git a/klinkstatus/src/parser/mstring.h b/klinkstatus/src/parser/mstring.h new file mode 100644 index 00000000..cd359c7d --- /dev/null +++ b/klinkstatus/src/parser/mstring.h @@ -0,0 +1,174 @@ +/*************************************************************************** + * Copyright (C) 2004 by Paulo Moura Guedes * + * [email protected] * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * + ***************************************************************************/ + +#ifndef STRING_H +#define STRING_H + +#include <qstring.h> + +#include <vector> +#include <cctype> + +class QString; + +typedef unsigned int uint; + + +/* Similar to std::string::find but return the next index of the last char + of the first word it finds. + Case insensitive. + e.g. + findWord("Biltre larvado", "biltre") => 6 +*/ +int findWord(QString const& s, QString const& palavra, uint a_partir_do_indice = 0); + +/** + Similar to std::string::find but return the next index of the first char + it finds. + Case insensitive. +*/ +int findChar(QString const& s, QChar letra, uint a_partir_do_indice = 0); + +/** + Same as findWord but non space chars are eliminated. + e.g. + findWord("<a href=""></a>", "<a") => 2 + findSeparableWord("<a href=""></a>", "<a") => 2 + + findWord("<\na href=""></a>", "<a") => -1 + findSeparableWord("<\na href=""></a>", "<a") => 3 +*/ +int findSeparableWord(QString const& s, QString const& palavra, uint a_partir_do_indice = 0); + +/** + Space means Unicode characters with decimal values + 9 (TAB), 10 (LF), 11 (VT), 12 (FF), 13 (CR), and 32 (Space). +*/ +bool isSpace(QChar c); + +/** + Return -1 if unsuccessful. 
+*/ +int nextNonSpaceChar(QString const& s, uint i); +int nextNonSpaceCharReverse(QString const& s, uint i); +int nextSpaceChar(QString const& s, uint i); + +int nextCharDifferentThan(QChar c, QString const& s, uint i); + +/** Return a vector with the words */ +std::vector<QString> tokenize(QString s); +std::vector<QString> tokenizeWordsSeparatedByDots(QString s); +std::vector<QString> tokenizeWordsSeparatedBy(QString s, QChar criteria); + +/** + Returns a string that has whitespace removed from the start and the end, + and which has each sequence of internal whitespace replaced with a single space. +*/ +QString simplifyWhiteSpace(QString const& s); + +/** + If char 'caractere' is the last in the string 's' it is removed +*/ +void removeLastCharIfExists(QString& s, QChar caractere); + +QString upperCase(QString const& s); +QString lowerCase(QString const& s); + +/** + Remove whitespaces from the end of the string +*/ +void stripWhiteSpaceFromTheEnd(QString& s); + +/** + Returns a string that has whitespace removed from the start and the end. 
+*/ +void stripWhiteSpace(QString& s); + +/** + Case insensitive comparisons +*/ +bool equal(QString const& s1, QString const& s2); +bool notEqual(QString const& s1, QString const& s2); + +bool equal(QChar c1, QChar c2); +bool notEqual(QChar c1, QChar c2); + + +//_________________________________________________________________________ + +inline bool isSpace(QChar c) +{ + return c.isSpace(); +} + +inline bool equal(QString const& s1, QString const& s2) +{ + if(s1 == s2) + return true; + else + return s1.lower() == s2.lower(); +} + +inline bool notEqual(QString const& s1, QString const& s2) +{ + return !(equal(s1, s2)); +} + +inline bool equal(QChar c1, QChar c2) +{ + return c1.lower() == c2.lower(); +} + +inline bool notEqual(QChar c1, QChar c2) +{ + return !(equal(c1, c2)); +} + +inline QString upperCase(QString const& s) +{ + return s.upper(); +} + +inline QString lowerCase(QString const& s) +{ + return s.lower(); +} + +inline QString simplifyWhiteSpace(QString const& s) +{ + return s.simplifyWhiteSpace(); +} + +inline void removeLastCharIfExists(QString& s, QChar caractere) +{ + int index = s.length() - 1; + if(s[index] == caractere) + s.remove(index); +} + +inline void stripWhiteSpace(QString& s) +{ + s = s.stripWhiteSpace(); +} + + + + +#endif diff --git a/klinkstatus/src/parser/node.cpp b/klinkstatus/src/parser/node.cpp new file mode 100644 index 00000000..068184ae --- /dev/null +++ b/klinkstatus/src/parser/node.cpp @@ -0,0 +1,255 @@ +/*************************************************************************** + * Copyright (C) 2004 by Paulo Moura Guedes * + * [email protected] * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. 
* + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * + ***************************************************************************/ + +#include "node.h" +#include "mstring.h" +#include "url.h" +#include "../utils/utils.h" + + +/* + Node________________________________________________________________________ +*/ + +QString Node::getAttribute(QString const& atributo) +{ + QString attribute_; + int fim = - 1; + bool tem_aspas_ou_plicas = false; + + int inicio = findWord(content_, atributo); + if(inicio != -1) + { + if(content_[inicio] == '"') + { + fim = content_.find("\"", inicio + 1); + tem_aspas_ou_plicas = true; + } + else if(content_[inicio] == '\'') + { + fim = content_.find("'", inicio + 1); + tem_aspas_ou_plicas = true; + } + else + { + int fim_bloco = nextSpaceChar(content_, inicio + 1); + int fim_tag = content_.find(">", inicio + 1); + int fim_aspas = content_.find("\"", inicio + 1); + + if(fim_bloco == -1 && fim_tag == -1 && fim_aspas == -1) + { + attribute_ = content_; + malformed_ = true; + return attribute_; + } + + if(smallerUnsigned(fim_bloco, fim_tag) == -1 && + smallerUnsigned(fim_bloco, fim_aspas) == -1) + fim = fim_bloco; + + else if(smallerUnsigned(fim_tag, fim_aspas) == -1) + fim = fim_tag; + + else + fim = fim_aspas; + } + + if(fim == -1) + { + attribute_ = content_; + malformed_ = true; + return attribute_; + } + + attribute_ = content_.mid(inicio, fim-inicio); + + if(tem_aspas_ou_plicas) + { + attribute_ = attribute_.mid(1, attribute_.length() - 1); + } + else + { + ::stripWhiteSpace(attribute_); + } + } 
+ + else + { + attribute_ = ""; + } + ::decode(attribute_); + + return attribute_; +} + + +/* + NodeLink________________________________________________________________________ +*/ + +void NodeLink::parseAttributeHREF() +{ + if(findWord(content(), "HREF") == -1 && + findWord(content(), "NAME") == -1 && + findWord(content(), "TARGET") == -1) + { + kdDebug(23100) << "MALFORMED: " << endl + << "NodeLink::parseAttributeHREF: " << content() << endl; + setMalformed(true); + return; + } + + else if(findWord(content(), "HREF") != -1) + { + attribute_href_ = getAttribute("HREF="); + + if( !(malformed() || attribute_href_.isEmpty()) ) + { + // Definnishr o tipo de link + linktype_ = Url::resolveLinkType(attribute_href_); + + parseLinkLabel(); + } + } +} + +void NodeLink::parseLinkLabel() +{ + int fim_tag = 0; + char proximo_caractere = ' '; + + do + { + fim_tag = content_.find(">", fim_tag); + + if(fim_tag != -1) + proximo_caractere = QChar(content_[++fim_tag]); + + } + while(fim_tag != -1 && proximo_caractere == '<'/*If the label starts by <*/); + + if(fim_tag != -1) + { + int fim_label = content_.find("<", fim_tag); + + if(fim_label != -1) + { + link_label_ = + ::simplifyWhiteSpace(content_.mid(fim_tag, + fim_label - fim_tag)); + } + } +} + + +/* + NodeMETA________________________________________________________________________ +*/ + +void NodeMETA::parseAttributeURL() +{ + if(attribute_http_equiv_.isEmpty()) + parseAttributeHTTP_EQUIV(); + + if(upperCase(attribute_http_equiv_) == "REFRESH") + { + is_redirection_ = true; + + if(findWord(content(), "URL") == -1) + { + //setMalformed(true); + return; + } + + attribute_url_ = getAttribute("URL="); + + int aspas = -1; + do + { + aspas = attribute_url_.find("\""); + if(aspas != -1) + attribute_url_.remove(aspas, 1); + } + while(aspas != -1); + + if(attribute_url_.isEmpty()) + kdDebug(23100) << "void NodeMeta::parseAttributeURL(): Assertion `!attribute_url_.isEmpty()' failed.\n" + << content_ << endl << attribute_http_equiv_ << 
endl << attribute_url_ << endl; + Q_ASSERT(!attribute_url_.isEmpty()); + + linktype_ = Url::resolveLinkType(attribute_url_); + } +} + +QString NodeMETA::charset() const +{ + QString charset; + QString content(atributoCONTENT()); + + if(content.isEmpty()) + return charset; + + int index = content.find("charset="); + if(index != -1) + { + index += QString("charset=").length(); + charset = content.mid(index, content.length() - index); + charset = charset.stripWhiteSpace(); + } + +// kdDebug(23100) << "Charset: |" << charset << "|" << endl; + return charset; +} + +/* + NodeIMG________________________________________________________________________ +*/ + +void NodeIMG::parseAttributeSRC() +{ + if(findWord(content(), "SRC") == -1) + { + kdDebug(23100) << "MALFORMED_____________________________________________________________" << endl; + kdDebug(23100) << "Conteudo: " << content() << endl; + setMalformed(true); + return; + } + + attribute_src_ = getAttribute("SRC="); + linktype_ = Url::resolveLinkType(attribute_src_); +} + + +/* + NodeFRAME________________________________________________________________________ +*/ + +void NodeFRAME::parseAttributeSRC() +{ + if(findWord(content(), "SRC") == -1) + { + //setMalformed(true); + return; + } + + attribute_src_ = getAttribute("SRC="); + linktype_ = Url::resolveLinkType(attribute_src_); +} + diff --git a/klinkstatus/src/parser/node.h b/klinkstatus/src/parser/node.h new file mode 100644 index 00000000..1d0b1fc3 --- /dev/null +++ b/klinkstatus/src/parser/node.h @@ -0,0 +1,279 @@ +/*************************************************************************** + * Copyright (C) 2004 by Paulo Moura Guedes * + * [email protected] * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. 
/**
  Abstract base class for one HTML element extracted by the parser.

  A node keeps the raw text of the element (content_) and exposes the link
  it carries, if any, through url(), linkLabel() and isLink(), which the
  concrete subclasses implement.
*/
class Node
{
public:

    /** Kind of HTML element; stored in element_ by the subclass constructors. */
    enum Element {
        A,
        AREA,
        LINK,
        META,
        IMG,
        FRAME,
        BASE,
        TITLE
    };
    /**
      Classification of a URL as computed by Url::resolveLinkType():
      file_href when it contains "FILE:", mailto when it contains "MAILTO:",
      href when it contains ":/", relative otherwise.
    */
    enum LinkType {
        href,
        file_href,
        mailto,
        relative
    };

    Node();
    Node(QString const& content);
    virtual ~Node();

    /** Returns the value of attribute 'atributo' (e.g. "HREF=") found in content_. */
    QString getAttribute(QString const& atributo);
    virtual QString const& url() const = 0;
    virtual QString const& linkLabel() const = 0; // URL label
    /** Replaces the raw content and parses it again. */
    virtual void setNode(QString const& content);
    virtual void parse() = 0;
    void setMalformed(bool flag = true);
    virtual void setLinkType(LinkType const& lt);

    QString const& content() const;
    bool malformed() const;
    LinkType linkType() const;
    Element element() const;
    virtual bool isLink() const = 0;

    // NOTE(review): not virtual, yet NodeMETA declares its own isRedirection();
    // a call through a Node pointer or reference uses this version.
    bool isRedirection() const;

protected:

    Element element_;
    LinkType linktype_;
    QString link_label_;
    QString content_;       // raw text of the element
    bool is_redirection_;
    bool malformed_;        // set when an attribute could not be parsed
};
parseAttributeHREF(); + void parseLinkLabel(); + +private: + QString attribute_href_; +}; + +class NodeA: public NodeLink +{ +public: + NodeA(QString const& content); + ~NodeA() + {} + ; + QString const& attributeNAME() const; + + virtual void parse(); + +private: + void parseAttributeNAME(); + +private: + QString attribute_name_; +}; + +class NodeAREA: public NodeLink +{ +public: + NodeAREA(QString const& content); + ~NodeAREA() {}; + + QString const& attributeTITLE() const; + + virtual void parse(); + +private: + void parseAttributeTITLE(); + +private: + QString attribute_title_; +}; + + +class NodeLINK: public NodeLink +{ +public: + NodeLINK(QString const& content); + ~NodeLINK() + {} + ; +}; + +class NodeMETA: public Node +{ +public: + NodeMETA(); + NodeMETA(QString const& content); + ~NodeMETA() + {} + ; + + virtual QString const& url() const; + virtual const QString& linkLabel() const; + virtual bool isLink() const; + QString const& atributoHTTP_EQUIV() const; + QString const& atributoNAME() const; + QString const& atributoCONTENT() const; + QString charset() const; + bool isRedirection() const; + + virtual void parse(); + +private: + /** + Procura se existem os atributos HTTP-EQUIV=Refresh e URL=... + Se existir considera o content do atributo URL como um link. 
+ ex: <META HTTP-EQUIV=Refresh CONTENT="10; URL=http://www.htmlhelp.com/"> + */ + void parseAttributeURL(); + + void parseAttributeHTTP_EQUIV(); + void parseAttributeNAME(); + void parseAttributeCONTENT(); + +private: + QString attribute_http_equiv_; + QString attribute_url_; + QString attribute_name_; + QString attribute_content_; +}; + +class NodeIMG: public Node +{ +public: + NodeIMG(QString const& content); + ~NodeIMG() + {} + ; + + virtual void parse(); + + virtual QString const& url() const; + virtual QString const& linkLabel() const; // Image label + virtual bool isLink() const; + +private: + void parseAttributeSRC(); + void parseAttributeTITLE(); + void parseAttributeALT(); + +private: + QString attribute_src_; + QString attribute_title_; + QString attribute_alt_; +}; + +class NodeFRAME: public Node +{ +public: + NodeFRAME(QString const& content); + ~NodeFRAME() + {} + ; + + virtual void parse(); + virtual QString const& url() const; + virtual QString const& linkLabel() const; + virtual bool isLink() const; + +private: + void parseAttributeSRC(); + +private: + QString attribute_src_; +}; + +class NodeBASE: public NodeLink +{ +public: + NodeBASE(); + NodeBASE(QString const& content); + ~NodeBASE() + {} + ; + + virtual bool isLink() const; +}; + +class NodeTITLE: public Node +{ +public: + NodeTITLE(); + NodeTITLE(QString const& content); + ~NodeTITLE() + {} + ; + + virtual QString const& url() const; + virtual QString const& linkLabel() const; + virtual void parse(); + virtual bool isLink() const; + + QString const& attributeTITLE() const; + +private: + void parseAttributeTITLE(); + +private: + QString attribute_title_; +}; + + +#include "node_impl.h" + +#endif diff --git a/klinkstatus/src/parser/node_impl.h b/klinkstatus/src/parser/node_impl.h new file mode 100644 index 00000000..51249075 --- /dev/null +++ b/klinkstatus/src/parser/node_impl.h @@ -0,0 +1,412 @@ +/*************************************************************************** + * Copyright (C) 
2004 by Paulo Moura Guedes * + * [email protected] * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * + ***************************************************************************/ + +inline Node::Node() + : is_redirection_(false), malformed_(false) +{} + +inline Node::~Node() +{ + //kdDebug(23100) << "/"; +} + +inline Node::Node(QString const& content) + : content_(content), is_redirection_(false), malformed_(false) +{} + +inline void Node::setNode(QString const& content) +{ + content_ = content; + parse(); +} + +inline QString const& Node::content() const +{ + return content_; +} + +inline bool Node::malformed() const +{ + return malformed_; +} + +inline void Node::setMalformed(bool flag) +{ + malformed_ = flag; +} + +inline Node::LinkType Node::linkType() const +{ + return linktype_; +} + +inline Node::Element Node::element() const +{ + return element_; +} + +inline void Node::setLinkType(Node::LinkType const& lt) +{ + linktype_ = lt; +} + +inline bool Node::isRedirection() const +{ + return is_redirection_; +} + +// class NodeLink_______________________________________________________ + +inline NodeLink::NodeLink() + : Node() +{} + +inline NodeLink::NodeLink(QString const& content) + : Node(content) +{ + parse(); +} + +inline void NodeLink::parse() +{ + 
parseAttributeHREF(); +} + +inline QString const& NodeLink::url() const +{ + return attribute_href_; +} + +inline QString const& NodeLink::linkLabel() const +{ + return link_label_; +} + +inline QString NodeLink::mailto() const +{ + Q_ASSERT(linktype_ == Node::mailto); + + QString href = KCharsets::resolveEntities(attribute_href_); + + int inicio = findWord(href, "MAILTO:"); + Q_ASSERT(inicio != -1); + + return href.mid(inicio); +} + +inline bool NodeLink::isLink() const +{ + if(Node::linkType() != Node::mailto && !url().isEmpty()) + return true; + else + return false; +} + +// class NodeA_______________________________________________________ + +inline NodeA::NodeA(QString const& content) + : NodeLink(content) +{ + element_ = A; + parse(); +} + +inline QString const& NodeA::attributeNAME() const +{ + return attribute_name_; +} + +inline void NodeA::parse() +{ + parseAttributeNAME(); +} + +inline void NodeA::parseAttributeNAME() +{ + attribute_name_ = getAttribute("NAME="); + //kdDebug(23100) << "NodeA::parseAttributeNAME: " << attribute_name_ << endl; +} + +// class NodeAREA_______________________________________________________ + +inline NodeAREA::NodeAREA(QString const& content) + : NodeLink(content) +{ + element_ = AREA; + parse(); +} + +inline QString const& NodeAREA::attributeTITLE() const +{ + return attribute_title_; +} + +inline void NodeAREA::parse() +{ + parseAttributeTITLE(); +} + +inline void NodeAREA::parseAttributeTITLE() +{ + attribute_title_ = getAttribute("TITLE="); +//kdDebug(23100) << "NodeAREA::parseAttributeTITLE: " << attribute_title_ << endl; +} + +// class NodeLINK________________________________________ + +inline NodeLINK::NodeLINK(QString const& content) + : NodeLink(content) +{ + element_ = LINK; +} + +// class NodeMeta________________________________________ + +inline NodeMETA::NodeMETA() + : Node() +{ + element_ = META; +} + +inline NodeMETA::NodeMETA(QString const& content) + : Node(content) +{ + element_ = META; + parse(); +} + 
+inline QString const& NodeMETA::url() const +{ + return attribute_url_; +} + +inline const QString& NodeMETA::linkLabel() const +{ + return link_label_; +} + +inline bool NodeMETA::isLink() const +{ + if(upperCase(attribute_http_equiv_) == "REFRESH" && + findWord(content(), "URL") != -1) + { + // Q_ASSERT(findWord(content(), "URL") != -1); // not necessarily + return true; + } + else + return false; +} + +inline QString const& NodeMETA::atributoHTTP_EQUIV() const +{ + return attribute_http_equiv_; +} + +inline QString const& NodeMETA::atributoNAME() const +{ + return attribute_name_; +} + +inline QString const& NodeMETA::atributoCONTENT() const +{ + return attribute_content_; +} + +inline bool NodeMETA::isRedirection() const +{ + return + upperCase(attribute_http_equiv_) == "REFRESH"; +} + +inline void NodeMETA::parse() +{ + parseAttributeHTTP_EQUIV(); + parseAttributeNAME(); + parseAttributeCONTENT(); + + parseAttributeURL(); +} + +inline void NodeMETA::parseAttributeHTTP_EQUIV() +{ + attribute_http_equiv_ = getAttribute("HTTP-EQUIV="); +} + +inline void NodeMETA::parseAttributeNAME() +{ + attribute_name_ = getAttribute("NAME="); +} + +inline void NodeMETA::parseAttributeCONTENT() +{ + attribute_content_ = getAttribute("CONTENT="); +// kdDebug(23100) << "CONTENT: " << attribute_content_ << endl; +} + + +// class NodeIMG________________________________________ + +inline NodeIMG::NodeIMG(QString const& content) + : Node(content) +{ + element_ = IMG; + parse(); +} + +inline void NodeIMG::parse() +{ + parseAttributeSRC(); + parseAttributeTITLE(); + parseAttributeALT(); +} + +inline QString const& NodeIMG::url() const +{ + return attribute_src_; +} + +inline QString const& NodeIMG::linkLabel() const +{ + if(!attribute_title_.isEmpty()) + return attribute_title_; + else + return attribute_alt_; +} + +inline bool NodeIMG::isLink() const +{ + if(!url().isEmpty()) + return true; + else + return false; +} + +inline void NodeIMG::parseAttributeTITLE() +{ + attribute_title_ 
= getAttribute("TITLE="); +} + +inline void NodeIMG::parseAttributeALT() +{ + attribute_alt_ = getAttribute("ALT="); +} + + +// class NodeFRAME________________________________________ + +inline NodeFRAME::NodeFRAME(QString const& content) + : Node(content) +{ + element_ = FRAME; + parse(); +} + +inline void NodeFRAME::parse() +{ + parseAttributeSRC(); +} + +inline QString const& NodeFRAME::url() const +{ + return attribute_src_; +} + +inline QString const& NodeFRAME::linkLabel() const +{ + return link_label_; +} + +inline bool NodeFRAME::isLink() const +{ + if(!url().isEmpty()) + return true; + else + return false; +} + +// class NodeBASE________________________________________ + +inline NodeBASE::NodeBASE() + : NodeLink() +{ + element_ = BASE; +} + +inline NodeBASE::NodeBASE(QString const& content) + : NodeLink(content) +{ + element_ = BASE; +} + +inline bool NodeBASE::isLink() const +{ + return false; +} + +// class NodeTITLE________________________________________ + +inline NodeTITLE::NodeTITLE() + : Node() +{ + element_ = TITLE; + parse(); +} + +inline NodeTITLE::NodeTITLE(QString const& content) + : Node(content) +{ + element_ = TITLE; + parse(); +} + +inline QString const& NodeTITLE::url() const +{ + return QString::null; +} + +inline QString const& NodeTITLE::linkLabel() const +{ + return QString::null; +} + +inline void NodeTITLE::parse() +{ + parseAttributeTITLE(); +} + +inline bool NodeTITLE::isLink() const +{ + return false; +} + +inline QString const& NodeTITLE::attributeTITLE() const +{ + return attribute_title_; +} + +inline void NodeTITLE::parseAttributeTITLE() +{ + attribute_title_ = content_; + attribute_title_.replace("<TITLE>", "", false); + attribute_title_.replace("</TITLE>", "", false); + attribute_title_.stripWhiteSpace(); + + //kdDebug(23100) << "TITLE: " << attribute_title_ << endl; +} diff --git a/klinkstatus/src/parser/url.cpp b/klinkstatus/src/parser/url.cpp new file mode 100644 index 00000000..f7f1f6f8 --- /dev/null +++ 
b/klinkstatus/src/parser/url.cpp @@ -0,0 +1,350 @@ +/*************************************************************************** + * Copyright (C) 2004 by Paulo Moura Guedes * + * [email protected] * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * + ***************************************************************************/ + +#include <kresolver.h> + +#include "url.h" +#include "mstring.h" +#include "../utils/utils.h" + +#include <kcharsets.h> + + +Node::LinkType Url::resolveLinkType(QString const& url) +{ + QString aux(url); + aux = KURL::decode_string(aux); + + if(aux.isNull()) + return Node::relative; + + if(findWord(url, "FILE:") != -1) + return Node::file_href; + else if(findWord(KCharsets::resolveEntities(url), "MAILTO:") != -1) + return Node::mailto; + else if( (int)url.find(":/") != -1) + return Node::href; + else + return Node::relative; +} + +KURL Url::normalizeUrl(QString const& string_url, LinkStatus const& link_parent, QString const& document_root) +{ + QString _string_url = string_url.stripWhiteSpace(); + + QString s_url; + KURL base_url; + + // resolve base url + if(link_parent.hasBaseURI()) + base_url = link_parent.baseURI(); + else + base_url = link_parent.absoluteUrl(); + + // resolve relative url + if(_string_url.isEmpty()) + return 
base_url; + else if(Url::hasProtocol(_string_url)) + return KURL(_string_url); + else + { + s_url.prepend(base_url.protocol() + "://" + base_url.host()); + + if(_string_url[0] == '/') { + if(!base_url.protocol().startsWith("http")) { + s_url.append(document_root); + } + } + else { + s_url.append(base_url.directory(true, false) + "/"); + } + + if( (_string_url[0] == ';' || // parameters + _string_url[0] == '?' || // query + _string_url[0] == '#') ) // fragment or reference + { + s_url.append(base_url.fileName(false)); + } + + s_url.append(_string_url); + KURL url(s_url); + if(base_url.hasUser()) + url.setUser(base_url.user()); + if(base_url.hasPass()) + url.setPass(base_url.pass()); + + url.setPort(base_url.port()); + + url.cleanPath(); + +// kdDebug(23100) << "Normalized URL: " +// << KCharsets::resolveEntities(KURL::decode_string(url.url())) << endl; + + return KURL(KCharsets::resolveEntities(KURL::decode_string(url.url()))); + } +} + +KURL Url::normalizeUrl(QString const& string_url) +{ + QString qs_url(KCharsets::resolveEntities(string_url.stripWhiteSpace())); + + if(qs_url[0] == '/') + { + KURL url; + url.setPath(qs_url); + url.cleanPath(); + return url; + } + + else + { + if(!Url::hasProtocol(qs_url)) + qs_url.prepend("http://"); + + KURL url(qs_url); + url.cleanPath(); + return url; + } +} + +bool Url::existUrl(KURL const& url, vector<LinkStatus*> const& v) +{ + if(url.prettyURL().isEmpty()) + return true; + + for(uint i = 0; i != v.size(); ++i) + if(v[i]->absoluteUrl() == url) + return true; + + return false; +} + +/** + www.iscte.pt, iscte.pt => true; + iscte.pt, www.iscte.pt => true; + www.iscte.pt, alunos.iscte.pt => true; (if restrict = false) + www.iscte.pt, alunos.iscte.pt => false; (if restrict = true) + alunos.iscte.pt, www.iscte.pt => false; + alunos.iscte.pt, iscte.pt => false. 
+*/ +// FIXME - Rename this function to sameDomain +bool Url::equalHost(QString const& host1, QString const& host2, bool restrict) +{ + //Q_ASSERT(!host1.isEmpty()); + //Q_ASSERT(!host2.isEmpty()); // this fails if href="javascript:......." + //if(host2.isEmpty()) + //return false; + + if(host1 == host2) + return true; + + QString host1_(KNetwork::KResolver::normalizeDomain(host1)); + QString host2_(KNetwork::KResolver::normalizeDomain(host2)); + removeLastCharIfExists(host1_, '/'); + removeLastCharIfExists(host2_, '/'); + + vector<QString> v1 = tokenizeWordsSeparatedByDots(host1_); + vector<QString> v2 = tokenizeWordsSeparatedByDots(host2_); + uint const size1 = v1.size(); + uint const size2 = v2.size(); + + if( !(size1 >= 1 && size2 >= 1) && // localhost would have size = 1 + !(host1_[0].isNumber() || host2_[0].isNumber()) ) // not (host == IP) + { + kdDebug(23100) << "Invalid host: " << host2 << endl; + return false; + } + + vector<QString>::size_type aux = 0; + vector<QString>::size_type aux2 = 0; + if(v1[0] == "www") + aux = 1; + if(v2[0] == "www") + aux2 = 1; + + if((size2 - aux2 < size1 - aux) && restrict) // e.g. paradigma.co.pt < linkstatus.paradigma.co.pt + return false; + + if(restrict && (size2 - aux2 > size1 - aux)) // e.g. 
linkstatus.paradigma.co.pt > paradigma.co.pt + return false; + + int i = 1; + while( ((int)(size1 - i) >= (int)aux) && ((int)(size2 - i) >= (int)aux) ) + { + if( !(v1[size1 - i] == v2[size2 - i]) ) + return false; + + ++i; + } + + return true; +} + +/* This should be done by parsing but I wan't to know when some new scheme comes along :) */ +bool Url::hasProtocol(QString const& url) +{ + QString s_url(url); + s_url.stripWhiteSpace(); + + if(s_url[0] == '/') + return false; + + else + { + KURL url = KURL::fromPathOrURL(s_url); + if(!url.protocol().isEmpty()) + return true; + /* + if(s_url.startsWith("http:") || + s_url.startsWith("https:") || + s_url.startsWith("ftp:") || + s_url.startsWith("sftp:") || + s_url.startsWith("webdav:") || + s_url.startsWith("webdavs:") || + s_url.startsWith("finger:") || + s_url.startsWith("fish:") || + s_url.startsWith("imap:") || + s_url.startsWith("imaps:") || + s_url.startsWith("lan:") || + s_url.startsWith("ldap:") || + s_url.startsWith("pop3:") || + s_url.startsWith("pop3s:") || + s_url.startsWith("smtp:") || + s_url.startsWith("smtps:") || + s_url.startsWith("file:") || + s_url.startsWith("news:") || + s_url.startsWith("gopher:") || + s_url.startsWith("mailto:") || + s_url.startsWith("telnet:") || + s_url.startsWith("prospero:") || + s_url.startsWith("wais:") || + s_url.startsWith("nntp:") ) + { + return true; + } + */ + else + return false; + } +} + +/** + http://linkstatus.paradigma.co.pt/en/index.html&bix=bix -> /en/index.html&bix=bix +*/ +QString Url::convertToLocal(LinkStatus const* ls) +{ + KURL url = ls->absoluteUrl(); + KURL base_url = ls->rootUrl(); + + if(base_url == url) + return "./" + url.fileName(); + else + return KURL::relativeURL(base_url, url); +} + +/** + If url2 has the same domain has url1 returns true. + If restrict, sourceforge.net != quanta.sourceforge.net. + Else is equal. 
+*/ +bool Url::localDomain(KURL const& url1, KURL const& url2, bool restrict) +{ + if(url1.protocol() != url2.protocol()) + { + //kdDebug(23100) << "NOT localDomain" << endl; + return false; + } + else if(!url1.hasHost()) + { + //kdDebug(23100) << "localDomain" << endl; + return true; + } + else + { + //return ::equalHost(url1.host(), url2.host(), restrict); + if(Url::equalHost(url1.host(), url2.host(), restrict)) + { + //kdDebug(23100) << "localDomain" << endl; + return true; + } + else + { + //kdDebug(23100) << "NOT localDomain" << endl; + return false; + } + + } +} + +/** + Returns true if url2 is a parent of url1. +*/ +bool Url::parentDir(KURL const& url1, KURL const& url2) +{ + if(url1.protocol() != url2.protocol()) + return false; + + else if(!url1.hasHost()) + return url2.isParentOf(url1); + + else + { + if(!equalHost(url1.host(), url2.host())) + return false; + + vector<QString> tokens_1 = tokenizeWordsSeparatedBy(url1.directory(true, false), QChar('/')); + vector<QString> tokens_2 = tokenizeWordsSeparatedBy(url2.directory(true, false), QChar('/')); + + if(tokens_1.size() == 0) + return false; + + //if(tokens_2.size() > tokens_1.size() or tokens_2.size() == 0) + //return true; + vector<QString>::size_type size = 0; + if(tokens_1.size() < tokens_2.size()) + size = tokens_1.size(); + else + size = tokens_2.size(); + + for(vector<QString>::size_type i = 0; i != size; ++i) + { + if(tokens_2[i] != tokens_1[i]) + return true; + } + } + + return false; +} + +bool Url::externalLink(KURL const& url1, KURL const& url2, bool restrict) +{ + if(url1.protocol() != url2.protocol()) + { + kdDebug(23100) << "externalLink" << endl; + return true; + } + else if(!url1.hasHost() && !url2.hasHost()) + { + kdDebug(23100) << "NOT externalLink" << endl; + return false; + } + else + return !Url::equalHost(url1.host(), url2.host(), restrict); +} diff --git a/klinkstatus/src/parser/url.h b/klinkstatus/src/parser/url.h new file mode 100644 index 00000000..6f22743d --- /dev/null +++ 
b/klinkstatus/src/parser/url.h @@ -0,0 +1,57 @@ +/*************************************************************************** + * Copyright (C) 2004 by Paulo Moura Guedes * + * [email protected] * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * + ***************************************************************************/ + +#ifndef URL_H +#define URL_H + +#include "../engine/linkstatus.h" +#include "node.h" + +#include <kurl.h> +#include <qstring.h> + +#include <vector> + +using namespace std; + + +class LinkStatus; + +namespace Url +{ +Node::LinkType resolveLinkType(QString const& url); +KURL normalizeUrl(QString const& string_url, LinkStatus const& link_parent, QString const& document_root); +KURL normalizeUrl(QString const& string_url); +bool validUrl(KURL const& url); +bool existUrl(KURL const& url, vector<LinkStatus*> const& v); +bool equalHost(QString const& host1, QString const& host2, bool restrict = false); +bool hasProtocol(QString const& url); +QString convertToLocal(LinkStatus const* ls); +bool localDomain(KURL const& url1, KURL const& url2, bool restrict = true); +bool parentDir(KURL const& url1, KURL const& url2); +bool externalLink(KURL const& url1, KURL const& url2, bool restrict = true); +} + +inline bool validUrl(KURL const& url) +{ + 
return (url.isValid() /*&& url.hasHost()*/); +} + +#endif |