summaryrefslogtreecommitdiffstats
path: root/klinkstatus/src/engine/searchmanager.h
blob: d2414cfe22eef7c1db9136d2d283312cce463270 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
/***************************************************************************
 *   Copyright (C) 2004 by Paulo Moura Guedes                              *
 *   [email protected]                                                        *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the                         *
 *   Free Software Foundation, Inc.,                                       *
 *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.             *
 ***************************************************************************/

#ifndef GESTOR_PESTQUISA_H
#define GESTOR_PESTQUISA_H

#include <kurl.h>

#include <tqobject.h>
#include <tqstring.h>
#include <tqdatetime.h>
#include <tqregexp.h>
#include <tqmap.h>
class TQDomElement;

#include <vector>

#include "linkstatus.h"
#include "linkchecker.h"
#include "../parser/node.h"
#include "../parser/url.h"

using namespace std;

// Map from a TQString key to a KHTMLPart pointer. SearchManager stores one of
// these (html_parts_) and accesses it through htmlPart()/addHtmlPart(), whose
// key parameter is named key_url.
// NOTE(review): KHTMLPart is not declared in this header; it is presumably
// provided by one of the project headers included above -- confirm.
typedef TQMap<TQString, KHTMLPart*> KHTMLPartMap;

// TQObject-derived class that declares the operations of a link search:
// starting/resuming/cancelling, configuration setters, state queries, and
// progress signals. Only declarations (plus a few trivial inline getters) are
// visible here; the file includes "searchmanager_impl.h" after the class,
// which presumably holds further inline definitions.
// NOTE(review): remarks below about what a member does are inferred from its
// name unless the definition is inline in this class -- confirm against the
// implementation files.
class SearchManager: public TQObject
{
    Q_OBJECT
  TQ_OBJECT

public:

    // Selects how a search is bounded.
    // NOTE(review): presumably by link depth, by domain, or by both -- confirm.
    enum SearchMode {
        depth,
        domain,
        depth_and_domain
    };

    // Defaults: 3 simultaneous connections, time_out of 50.
    // NOTE(review): the unit of time_out is not visible from this header.
    SearchManager(int max_simultaneous_connections = 3, int time_out = 50,
                  TQObject *parent = 0, const char *name = 0);
    ~SearchManager();
    
    // Serialisation entry points; the produced format is defined in the
    // implementation, not here.
    TQString toXML() const;
    void save(TQDomElement& element) const;

    // Read-only access to the whole key -> KHTMLPart* map.
    KHTMLPartMap const& htmlParts() const { return html_parts_; }

    // Per-key access to the KHTMLPart map.
    // NOTE(review): ownership of the KHTMLPart pointers (who deletes them,
    // and what removeHtmlParts() does with them) is not visible here.
    KHTMLPart* htmlPart(TQString const& key_url) const;
    void addHtmlPart(TQString const& key_url, KHTMLPart* html_part);
    void removeHtmlParts();

    // Search life-cycle control. The two-argument startSearch additionally
    // takes the SearchMode to use.
    void startSearch(KURL const& root);
    void startSearch(KURL const& root, SearchMode const& modo);
    void resume();
    void cancelSearch();

    // Document root (see the note on document_root_url_ below: needed for
    // non-http protocols).
    bool hasDocumentRoot() const;
    KURL const& documentRoot() const;
    void setDocumentRoot(KURL const& url);

    // Configuration setters.
    void setSearchMode(SearchMode modo);
    void setDepth(int depth);
    void setExternalDomainDepth(int depth);
    void setDomain(TQString const& domain);
    void setCheckParentDirs(bool flag);
    void setCheckExternalLinks(bool flag);
    void setCheckRegularExpressions(bool flag);
    void setRegularExpression(TQString const& reg_exp, bool case_sensitive);
    void setTimeOut(int time_out);

    // NOTE(review): exact scope of what these two clear is defined in the
    // implementation -- confirm before relying on either.
    void cleanItems();
    void reset();

    // State and configuration queries.
    bool searching() const;
    bool localDomain(KURL const& url, bool restrict = true) const;
    //bool isLocalRestrict(KURL const& url) const;
    SearchMode const& searchMode() const;
    // Returns the check_regular_expressions_ flag.
    bool checkRegularExpressions() const { return check_regular_expressions_; }
    bool existUrl(KURL const& url, KURL const& url_parent) const;
    LinktqStatus const* linktqStatus(TQString const& s_url) const;
    int checkedLinks() const;
    TQTime timeElapsed() const;
    bool checkParentDirs() const;
    bool checkExternalLinks() const;
    LinktqStatus const* linkStatusRoot() const;
    int maxSimultaneousConnections() const;
    int timeOut() const;
    
    // Return the send_identification_ flag and the user_agent_ string.
    bool sendIdentification() const { return send_identification_; }
    TQString const& userAgent() const { return user_agent_; }

private:

    // Internal search machinery.
    void checkRoot();
    void checkVectorLinks(vector<LinktqStatus*> const& links); // corresponds to one node of one depth level
    vector<LinktqStatus*> tqchildren(LinktqStatus* link);
    void startSearch();
    void continueSearch();
    void finnish();
    void pause();
    vector<LinktqStatus*> const& nodeToAnalize() const;
    vector<LinktqStatus*> chooseLinks(vector<LinktqStatus*> const& links);
    void checkLinksSimultaneously(vector<LinktqStatus*> const& links);
    void addLevel();
    bool checkableByDomain(KURL const& url, LinktqStatus const& link_parent) const;
    bool checkable(KURL const& url, LinktqStatus const& link_parent) const;
    int maximumCurrentConnections() const;
    bool onlyCheckHeader(LinktqStatus* ls) const;

    /*
      A "general" (vague) domain is understood to be a domain of the form
      www.google.pt or google.pt, so that, for example, imagens.google.pt is
      considered to be in the same domain.
      pwp.netcabo.pt or www.google.pt/imagens are not considered general domains.
    */
    bool generalDomain() const;
    bool generalDomainChecked() const; // To ensure that the generalDomain() procedure is only called once

private slots:

    // NOTE(review): which signals these slots are connected to is set up in
    // the implementation, not in this header.
    void slotRootChecked(const LinktqStatus * link, LinkChecker * checker);
    void slotLinkChecked(const LinktqStatus * link, LinkChecker * checker);
    void slotSearchFinished();
    void slotLinkCheckerFinnished(LinkChecker * checker);

signals:

    // Notifications for observers of the search.
    // NOTE(review): emission points are in the implementation -- the names
    // suggest per-link results, finish/pause events and progress totals.
    void signalRootChecked(const LinktqStatus * link, LinkChecker * checker);
    void signalLinkChecked(const LinktqStatus * link, LinkChecker * checker);
    void signalSearchFinished();
    void signalSearchPaused();
    void signalAddingLevelTotalSteps(uint number_of_links);
    void signalAddingLevelProgress();
    void signalLinksToCheckTotalSteps(uint links_to_check);
    //void signalLinksToCheckProgress();

private:

    // Configuration and bookkeeping state. Data members are initialised in
    // declaration order, so the order below should not be changed casually.
    int max_simultaneous_connections_;
    SearchMode search_mode_;
    LinktqStatus root_;
    bool has_document_root_;
    KURL document_root_url_; // in case of non http protocols the document root must be explicitly given
    int depth_;
    int current_depth_;
    int external_domain_depth_;
    int current_node_;
    int current_index_;
    int links_being_checked_;
    int finished_connections_;
    int maximum_current_connections_;
    TQRegExp reg_exp_;
    TQString domain_;
    bool general_domain_;
    bool checked_general_domain_;
    int time_out_;
    int current_connections_;
    bool send_identification_; // user-agent
    TQString user_agent_;
    
    bool canceled_;
    bool searching_;
    int checked_links_;
    TQTime time_;
    int ignored_links_;
    bool check_parent_dirs_;
    bool check_external_links_;
    bool check_regular_expressions_;
    uint number_of_level_links_;
    uint number_of_links_to_check_;
    // Three-level nesting of link pointers.
    // NOTE(review): presumably indexed [depth level][node][link], matching
    // the comment on checkVectorLinks() -- TODO confirm.
    vector< vector< vector <LinktqStatus*> > > search_results_;
    KHTMLPartMap html_parts_; // backing store for htmlParts()
};

#include "searchmanager_impl.h"

#endif