diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.cc')
-rw-r--r-- | debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.cc | 517 |
1 files changed, 517 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.cc b/debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.cc new file mode 100644 index 00000000..3f6d5e5f --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.cc @@ -0,0 +1,517 @@ +//-------------------------------------------------------------------- +// +// TextCollector.cc +// +// 2/6/2002 created for libhtdig +// +// Neal Richter [email protected] +// +// TextCollector: +// General Purpose Text Document Indexer. +// Calls appropriate parsers. +// The parser notifies the TextCollector object that it got something +// (got_* functions) and the TextCollector object feed the databases +// and statistics accordingly. +// +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: TextCollector.cc,v 1.4 2004/05/28 13:15:29 lha Exp $ +// +//-------------------------------------------------------------------- + + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "TextCollector.h" +#include "htdig.h" +#include "HtWordList.h" +#include "WordRecord.h" +#include "URLRef.h" +#include "Server.h" +#include "Parsable.h" +#include "BasicDocument.h" +#include "StringList.h" +#include "WordType.h" +#include "md5.h" +#include "defaults.h" + +#include <signal.h> +#include <stdio.h> + +#include <sys/timeb.h> + + +//***************************************************************************** +// TextCollector::TextCollector() +// +TextCollector::TextCollector(TextCollectorLog flags): +words(*(HtConfiguration::config())) +{ + HtConfiguration *config = HtConfiguration::config(); + //FILE *urls_parsed; + + currenthopcount = 0; + + //turn on word tracking! + trackWords = 1; + + // + // Initialize the flags for the various HTML factors + // + + // text_factor + factor[0] = FLAG_TEXT; + // title_factor + factor[1] = FLAG_TITLE; + // heading factor (now generic) + factor[2] = FLAG_HEADING; + factor[3] = FLAG_HEADING; + factor[4] = FLAG_HEADING; + factor[5] = FLAG_HEADING; + factor[6] = FLAG_HEADING; + factor[7] = FLAG_HEADING; + // img alt text + //factor[8] = FLAG_KEYWORDS; + factor[8] = FLAG_TEXT; // treat alt text as plain text, until it has + // its own FLAG and factor. + // keywords factor + factor[9] = FLAG_KEYWORDS; + // META description factor + factor[10] = FLAG_DESCRIPTION; + + doc = NULL; + minimumWordLength = config->Value("minimum_word_length", 3); + + + //TODO put document-index log file stuff here via logs like Retriever + + check_unique_md5 = config->Boolean("check_unique_md5", 0); + check_unique_date = config->Boolean("check_unique_date", 0); + + d_md5 = 0; + if (check_unique_md5) + { + d_md5 = Database::getDatabaseInstance(DB_HASH); + + if (d_md5->OpenReadWrite(config->Find("md5_db"), 0666) != OK) + { + cerr << "DocumentDB::Open: " << config->Find("md5_db") << " " << strerror(errno) << "\n"; + } + } + + temp_doc_count = 0; + +} + + +//***************************************************************************** +// TextCollector::~TextCollector() +// +TextCollector::~TextCollector() +{ + if (d_md5) + d_md5->Close(); + //delete doc; + + if(temp_doc_count != 0) + { + words.Flush(); + temp_doc_count = 0; + } + + words.Flush(); + words.Close(); + +} + + +//***************************************************************************** +// void TextCollector::IndexDoc() +// +// + +int +TextCollector::IndexDoc(BasicDocument & a_basicdoc) +{ + DocumentRef *ref; + time_t date; + int old_document = 0; + static int index = 0; + + //struct timeb tb; + + //HtConfiguration *config = HtConfiguration::config(); + + doc = &a_basicdoc; + + ref = docs[doc->Location()]; // It might be nice to have just an Exists() here + if (ref) + { + // + // We already have an entry for this document in our database. + // This means we can get the document ID and last modification + // time from there. + // + current_id = ref->DocID(); + date = ref->DocTime(); + if (ref->DocAccessed()) + old_document = 1; + else // we haven't retrieved it yet, so we only have the first link + old_document = 0; + ref->DocBackLinks(ref->DocBackLinks() + 1); // we had a new link + ref->DocAccessed(time(0)); + ref->DocState(Reference_normal); + currenthopcount = ref->DocHopCount(); + } + else + { + // + // Never seen this document before. We need to create an + // entry for it. This implies that it gets a new document ID. + // + + date = 0; + + current_id = docs.NextDocID(); + ref = new DocumentRef; + ref->DocID(current_id); + ref->DocURL(doc->Location()); + ref->DocState(Reference_normal); + ref->DocAccessed(time(0)); + ref->DocHopCount(0); + ref->DocBackLinks(1); // We had to have a link to get here! + old_document = 0; + } + + word_context.DocID(ref->DocID()); + + if (debug > 0) + { + // + // Display progress + // + cout << index++ << ':' << current_id << ':' << currenthopcount << ':' << doc->Location() << + ": "; + cout.flush(); + } + + //printf("New Doc\n"); + //ftime(&tb); + //fprintf(stderr, "[1] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm); + + RetrievedDocument(ref); + + //ftime(&tb); + //fprintf(stderr, "[2] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm); + + if(temp_doc_count > 250) + { + //words.Flush(); + temp_doc_count = 0; + } + else + { + temp_doc_count++; + } + + //ftime(&tb); + //fprintf(stderr, "[3] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm); + + docs.Add(*ref); + + //ftime(&tb); + //fprintf(stderr, "[4] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm); + + delete ref; + + words.Flush(); + //words.Close(); + + if (urls_seen) + { + fprintf(urls_seen, "%s|%d|%s|%d|0|1\n", + (const char *) doc->Location(), doc->Length(), doc->ContentType(), + (int) doc->ModTime()); + } + + + return(1); +} + +int TextCollector::FlushWordDB() +{ + if(temp_doc_count != 0) + { + words.Flush(); + temp_doc_count = 0; + } + + words.Flush(); + words.Close(); + return(1); +} + +//***************************************************************************** +// void TextCollector::RetrievedDocument(Document &doc, const String &url, DocumentRef *ref) +// We found a document that needs to be parsed. Since we don't know the +// document type, we'll let the Document itself return an appropriate +// Parsable object which we can call upon to parse the document contents. +// +void +TextCollector::RetrievedDocument(DocumentRef * ref) +{ + n_links = 0; + current_ref = ref; + current_title = 0; + word_context.Anchor(0); + current_time = 0; + current_head = 0; + current_meta_dsc = 0; + time_t doc_time; + + //Check if the Document is self-parseable + //We will pass ourselves as a callback object for all the got_*() routines + if (doc->SelfParseable() == TRUE) + { + doc->internalParser(*this); + } + else + { + // Create a parser object and let it have a go at the document. + // We will pass ourselves as a callback object for all the got_*() + // routines. + // This will generate the Parsable object as a specific parser + /* + Parsable *parsable = doc->getParsable(); + if (parsable) + parsable->parse(*this, *base); + else + { // If we didn't get a parser, then we should get rid of this! + ref->DocState(Reference_noindex); + return; + } + */ + } + + // We don't need to dispose of the parsable object since it will + // automatically be reused. + + + // + // Update the document reference + // + ref->DocTitle((char *) current_title); + ref->DocHead((char *) current_head); + ref->DocMetaDsc((char *) current_meta_dsc); + +/* if (current_time == 0) + ref->DocTime(doc->ModTime()); + else + ref->DocTime(current_time); */ + + doc_time = doc->ModTime(); + if(doc_time != 0) + ref->DocTime(doc_time); + else + ref->DocTime(time(NULL)); + + ref->DocSize(doc->Length()); + ref->DocAccessed(time(0)); + ref->DocLinks(n_links); +} + + +//***************************************************************************** +// void TextCollector::got_word(char *word, int location, int heading) +// The location is normalized to be in the range 0 - 1000. +// +void +TextCollector::got_word(const char *word, int location, int heading) +{ + if (debug > 3) + cout << "word: " << word << '@' << location << endl; + if (heading >= 11 || heading < 0) // Current limits for headings + heading = 0; // Assume it's just normal text + + if ((trackWords) && (strlen(word) >= minimumWordLength)) + { + String w = word; + HtWordReference wordRef; + + wordRef.Location(location); + wordRef.Flags(factor[heading]); + + wordRef.Word(w); + words.Replace(WordReference::Merge(wordRef, word_context)); + +#ifdef DEBUG + cout << "Adding: [" << w << "]"<< endl; //NEALR +#endif + + // Check for compound words... + String parts = word; + int added; + int nparts = 1; + do + { + added = 0; + char *start = parts.get(); + char *punctp = 0, *nextp = 0, *p; + char punct; + int n; + while (*start) + { + p = start; + for (n = 0; n < nparts; n++) + { + while (HtIsStrictWordChar((unsigned char) *p)) + p++; + punctp = p; + if (!*punctp && n + 1 < nparts) + break; + while (*p && !HtIsStrictWordChar((unsigned char) *p)) + p++; + if (n == 0) + nextp = p; + } + if (n < nparts) + break; + punct = *punctp; + *punctp = '\0'; + if (*start && (*p || start > parts.get())) + { + w = start; + HtStripPunctuation(w); + if (w.length() >= minimumWordLength) + { + wordRef.Word(w); + words.Replace(WordReference::Merge(wordRef, word_context)); + if (debug > 3) + cout << "word part: " << start << '@' << location << endl; + +#ifdef DEBUG + cout << "Adding: [" << w << "]"<< endl; //NEALR +#endif + } + added++; + } + start = nextp; + *punctp = punct; + } + nparts++; + } + while (added > 2); + } +} + + +//***************************************************************************** +// void TextCollector::got_title(const char *title) +// +void +TextCollector::got_title(const char *title) +{ + if (debug > 1) + cout << "\ntitle: " << title << endl; + current_title = title; +} + +//***************************************************************************** +// void TextCollector::got_time(const char *time) +// +void +TextCollector::got_time(const char *time) +{ + HtDateTime new_time(current_time); + + if (debug > 1) + cout << "\ntime: " << time << endl; + + // + // As defined by the Dublin Core, this should be YYYY-MM-DD + // In the future, we'll need to deal with the scheme portion + // in case someone picks a different format. + // + new_time.SetFTime(time, "%Y-%m-%d"); + current_time = new_time.GetTime_t(); + + // If we can't convert it, current_time stays the same and we get + // the default--the date returned by the server... +} + +//***************************************************************************** +// void TextCollector::got_head(const char *head) +// +void +TextCollector::got_head(const char *head) +{ + if (debug > 4) + cout << "head: " << head << endl; + current_head = head; +} + +//***************************************************************************** +// void TextCollector::got_meta_dsc(const char *md) +// +void +TextCollector::got_meta_dsc(const char *md) +{ + if (debug > 4) + cout << "meta description: " << md << endl; + current_meta_dsc = md; +} + + +//***************************************************************************** +// void TextCollector::got_meta_email(const char *e) +// +void +TextCollector::got_meta_email(const char *e) +{ + if (debug > 1) + cout << "\nmeta email: " << e << endl; + current_ref->DocEmail(e); +} + + +//***************************************************************************** +// void TextCollector::got_meta_notification(const char *e) +// +void +TextCollector::got_meta_notification(const char *e) +{ + if (debug > 1) + cout << "\nmeta notification date: " << e << endl; + current_ref->DocNotification(e); +} + + +//***************************************************************************** +// void TextCollector::got_meta_subject(const char *e) +// +void +TextCollector::got_meta_subject(const char *e) +{ + if (debug > 1) + cout << "\nmeta subect: " << e << endl; + current_ref->DocSubject(e); +} + + +//***************************************************************************** +// void TextCollector::got_noindex() +// +void +TextCollector::got_noindex() +{ + if (debug > 1) + cout << "\nMETA ROBOT: Noindex " << current_ref->DocURL() << endl; + current_ref->DocState(Reference_noindex); +} |