1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
|
/***************************************************************************
* Copyright (C) 2004 by Paulo Moura Guedes *
* [email protected] *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the *
* Free Software Foundation, Inc., *
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
***************************************************************************/
#ifndef HTML_PARSER_H
#define HTML_PARSER_H
#include <tqstring.h>
#include <vector>
#include "mstring.h"
#include "node.h"
#include <iostream>
// NOTE(review): a using-directive at header scope makes every name in std
// visible without qualification in each file that includes this header.
// The declarations below use unqualified `vector`, so it cannot simply be
// removed without qualifying those uses (and checking other includers).
using namespace std;
// Shorthand for unsigned int; used by HtmlParser::estimativaLinks() below.
typedef unsigned int uint;
/**
 * Parser over an HTML document held as a TQString.
 *
 * The declarations show one private parse step per element type
 * (A, AREA, LINK, META, IMG, FRAME, IFRAME, BASE, TITLE), a list of
 * Node pointers exposed through nodes(), and dedicated members for the
 * BASE, TITLE and content-type META nodes.
 *
 * NOTE(review): only hasContentType(), estimativaLinks() and the destructor
 * are defined in this header; everything said about the other members is
 * inferred from their declarations and should be confirmed against the
 * implementation file.
 */
class HtmlParser
{
public:
/** Default constructor (defined outside this header). */
HtmlParser();
/**
 * Constructs a parser from a document ("documento" is Portuguese for
 * "document").
 * NOTE(review): not declared explicit, so a TQString converts implicitly
 * to an HtmlParser.
 */
HtmlParser(TQString const& documento);
/** Destructor; its inline definition below has an empty body. */
~HtmlParser();
/**
 * Read-only access to a vector of Node pointers.
 * NOTE(review): presumably this returns nodes_ - confirm in the implementation.
 */
vector<Node*> const& nodes() const;
// Presence queries for the BASE, TITLE and content-type META information.
// Only hasContentType() is defined in this header.
bool hasBaseUrl() const;
bool hasTitle() const;
bool hasContentType() const;
// Read-only accessors returning the BASE, TITLE and content-type META nodes.
// NOTE(review): presumably only meaningful when the matching has*() query
// returns true - confirm in the implementation.
NodeBASE const& baseUrl() const;
NodeTITLE const& title() const;
NodeMETA const& contentTypeMetaNode() const;
/**
 * Estimated value derived from a document size ("estimativa" is Portuguese
 * for "estimate"); defined inline below as doc_size / 100.
 */
static uint estimativaLinks(uint doc_size);
/**
 * Convenience function for performance, as it only parses in order
 * to get the charset.
 */
static TQString findCharsetInMetaElement(TQString const& html);
// Test helper ("mostra" is Portuguese for "shows").
// NOTE(review): presumably dumps the parsed data for debugging - confirm.
void mostra() const;
private:
/**
 * Member overload taking only the element name; returns a reference to a
 * vector of strings.
 * NOTE(review): presumably operates on document_ and returns aux_ - confirm.
 */
vector<TQString> const& parseNodesOfType(TQString const& element);
/**
 * Static overload: the output vector `nodes` is passed in by reference
 * for performance.
 */
static void parseNodesOfType(TQString const& element, TQString const& doc, vector<TQString>& nodes);
// One parse step per supported element type.
void parseNodesOfTypeA();
void parseNodesOfTypeAREA();
void parseNodesOfTypeLINK();
void parseNodesOfTypeMETA();
void parseNodesOfTypeIMG();
void parseNodesOfTypeFRAME();
void parseNodesOfTypeIFRAME();
void parseNodesOfTypeBASE();
void parseNodesOfTypeTITLE();
// NOTE(review): presumably these remove comments / script content from
// document_ and keep them in comments_ / script_ - confirm.
void stripComments();
void stripScriptContent();
/**
Return the index of the character that follows the end of the tag.
As the example shows, an end-of-tag character inside a quoted attribute
value does not terminate the tag:
endOfTag("<img src=\"bad > luck\">") => 22 (not 15)
The search starts at `index`; `end_of_tag` is the terminating character.
*/
static int endOfTag(TQString const& s, int index = 0, TQChar end_of_tag = '>');
private:
vector<TQString> aux_; // Purpose not documented by the original author. NOTE(review): presumably backs the reference returned by parseNodesOfType(element) - confirm.
// Raw Node pointers. The destructor defined in this header does not delete
// them; who owns them is not visible from here.
vector<Node*> nodes_;
NodeBASE node_BASE_;
NodeTITLE node_TITLE_;
NodeMETA node_META_content_type_;
// Value returned by hasContentType().
bool is_content_type_set_;
TQString document_;
TQString script_; // Stored here (JavaScript, etc.)
TQString comments_;
};
/**
 * Destructor. The body is intentionally empty: no explicit cleanup is done
 * here, and in particular the Node pointers held in nodes_ are not deleted
 * by this function. A kdDebug(23100) trace used to live here and remains
 * disabled.
 */
inline HtmlParser::~HtmlParser()
{
    // kdDebug(23100) << "*";   (debug trace, disabled)
}
/**
 * Returns an estimated value for a document of the given size
 * ("estimativa" is Portuguese for "estimate"): the size divided by 100,
 * using unsigned integer division (so sizes below 100 yield 0).
 *
 * @param doc_size size of the document; the unit is not specified here.
 * @return doc_size / 100.
 */
inline uint HtmlParser::estimativaLinks(uint doc_size)
{
    // Estimated ratio: one unit of result per 100 units of document size.
    uint const size_per_link = 100;
    return doc_size / size_per_link;
}
inline bool HtmlParser::hasContentType() const
{
return is_content_type_set_;
}
#endif
|