1 files changed, 517 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.cc b/debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.cc
new file mode 100644
index 00000000..3f6d5e5f
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.cc
@@ -0,0 +1,517 @@
+//--------------------------------------------------------------------
+//
+// TextCollector.cc
+//
+// 2/6/2002 created for libhtdig
+//
+// Neal Richter [email protected]
+//
+// TextCollector:
+//            General Purpose Text Document Indexer.
+//            Calls appropriate parsers. 
+//            The parser notifies the TextCollector object that it got something
+//            (got_* functions) and the TextCollector object feeds the databases
+//            and statistics accordingly.
+//
+//
+// Part of the ht://Dig package   <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: TextCollector.cc,v 1.4 2004/05/28 13:15:29 lha Exp $
+//
+//--------------------------------------------------------------------
+
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "TextCollector.h"
+#include "htdig.h"
+#include "HtWordList.h"
+#include "WordRecord.h"
+#include "URLRef.h"
+#include "Server.h"
+#include "Parsable.h"
+#include "BasicDocument.h"
+#include "StringList.h"
+#include "WordType.h"
+#include "md5.h"
+#include "defaults.h"
+
+#include <signal.h>
+#include <stdio.h>
+
+#include <sys/timeb.h>
+
+
+//*****************************************************************************
+// TextCollector::TextCollector(TextCollectorLog flags)
+//   Sets up the word list from the global configuration, the table that
+//   maps got_word() heading indices (0-10) to word flags, and, when
+//   check_unique_md5 is enabled, the MD5 database handle.
+//   'flags' is accepted for interface compatibility; it is not used here.
+//
+TextCollector::TextCollector(TextCollectorLog flags):
+words(*(HtConfiguration::config()))
+{
+	HtConfiguration *config = HtConfiguration::config();
+
+	currenthopcount = 0;
+
+	// Turn on word tracking: got_word() ignores every word otherwise.
+	trackWords = 1;
+
+	//
+	// Initialize the flags for the various HTML factors
+	//
+	factor[0] = FLAG_TEXT;			// text_factor
+	factor[1] = FLAG_TITLE;			// title_factor
+	for (int h = 2; h <= 7; h++)		// heading factor (now generic)
+		factor[h] = FLAG_HEADING;
+	// img alt text: treat it as plain text (not FLAG_KEYWORDS) until it has
+	// its own FLAG and factor.
+	factor[8] = FLAG_TEXT;
+	factor[9] = FLAG_KEYWORDS;		// keywords factor
+	factor[10] = FLAG_DESCRIPTION;		// META description factor
+
+	// No document is attached until IndexDoc() is called.
+	doc = NULL;
+	minimumWordLength = config->Value("minimum_word_length", 3);
+
+	//TODO put document-index log file stuff here via logs like Retriever
+
+	check_unique_md5 = config->Boolean("check_unique_md5", 0);
+	check_unique_date = config->Boolean("check_unique_date", 0);
+
+	d_md5 = 0;
+	if (check_unique_md5)
+	{
+		d_md5 = Database::getDatabaseInstance(DB_HASH);
+
+		//
+		// If the MD5 database cannot be created or opened, report it and
+		// drop the handle instead of keeping an unopened database around:
+		// the destructor would otherwise Close() a database that was never
+		// opened. MD5 checking is switched off for this collector.
+		//
+		if (!d_md5 || d_md5->OpenReadWrite(config->Find("md5_db"), 0666) != OK)
+		{
+			cerr << "TextCollector::TextCollector: " << config->Find("md5_db") << " " << strerror(errno) << "\n";
+			delete d_md5;
+			d_md5 = 0;
+			check_unique_md5 = 0;
+		}
+	}
+
+	temp_doc_count = 0;
+}
+
+
+//*****************************************************************************
+// TextCollector::~TextCollector()
+//   Closes and releases the MD5 database (if one was created), then
+//   flushes and closes the word database.
+//
+TextCollector::~TextCollector()
+{
+	if (d_md5)
+	{
+		d_md5->Close();
+		// The handle was obtained from Database::getDatabaseInstance() in
+		// the constructor and is not released anywhere else in this file.
+		delete d_md5;
+		d_md5 = 0;
+	}
+	// 'doc' is not deleted: it points at the caller's BasicDocument.
+
+	FlushWordDB();
+}
+
+
+//*****************************************************************************
+// int TextCollector::IndexDoc(BasicDocument & a_basicdoc)
+//   Index one document: find or create its DocumentRef, parse it through
+//   RetrievedDocument(), store the reference in 'docs' and flush the word
+//   list.  Always returns 1.
+int
+TextCollector::IndexDoc(BasicDocument & a_basicdoc)
+{
+	DocumentRef *ref;
+    time_t		date;
+    int			old_document = 0;
+    static int		index = 0;
+    // NOTE(review): 'date' and 'old_document' are only assigned, never read.
+    //struct timeb tb;
+
+	//HtConfiguration *config = HtConfiguration::config();
+    // RetrievedDocument() reads the document through the 'doc' member.
+    doc = &a_basicdoc;
+
+	ref = docs[doc->Location()];	// It might be nice to have just an Exists() here
+	if (ref)
+	{
+		//
+		// We already have an entry for this document in our database.
+		// This means we can get the document ID and last modification
+		// time from there.
+		//
+		current_id = ref->DocID();
+		date = ref->DocTime();
+		if (ref->DocAccessed())
+			old_document = 1;
+		else  // we haven't retrieved it yet, so we only have the first link
+			old_document = 0;
+		ref->DocBackLinks(ref->DocBackLinks() + 1);	// we had a new link
+		ref->DocAccessed(time(0));
+		ref->DocState(Reference_normal);
+		currenthopcount = ref->DocHopCount();
+	}
+	else
+	{
+		//
+		// Never seen this document before.  We need to create an
+		// entry for it.  This implies that it gets a new document ID.
+		//
+
+        date = 0;
+
+        current_id = docs.NextDocID();
+		ref = new DocumentRef;
+		ref->DocID(current_id);
+		ref->DocURL(doc->Location());
+		ref->DocState(Reference_normal);
+		ref->DocAccessed(time(0));
+		ref->DocHopCount(0);
+		ref->DocBackLinks(1); // We had to have a link to get here!
+		old_document = 0;
+	}
+
+	word_context.DocID(ref->DocID());
+
+	if (debug > 0)
+	{
+		//
+		// Display progress
+		//
+		cout << index++ << ':' << current_id << ':' << currenthopcount << ':' << doc->Location() <<
+			": ";
+		cout.flush();
+	}
+
+    //printf("New Doc\n");
+    //ftime(&tb);
+    //fprintf(stderr, "[1] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm);
+    // Parse the document; the got_*() callbacks fill in 'ref' and the word list.
+	RetrievedDocument(ref);
+
+    //ftime(&tb);
+    //fprintf(stderr, "[2] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm);
+    // The counter is reset once it exceeds 250; its periodic Flush() is disabled.
+    if(temp_doc_count > 250)
+    {
+        //words.Flush();
+        temp_doc_count = 0;
+    }
+    else
+    {
+        temp_doc_count++;
+    }
+
+    //ftime(&tb);
+    //fprintf(stderr, "[3] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm);
+
+	docs.Add(*ref);
+
+    //ftime(&tb);
+    //fprintf(stderr, "[4] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm);
+    // NOTE(review): current_ref (set in RetrievedDocument()) still points at 'ref' after this delete.
+    delete ref;
+
+    words.Flush();
+    //words.Close();
+    // Optional log line: location|length|content type|modification time|0|1.
+    if (urls_seen)
+    {
+        fprintf(urls_seen, "%s|%d|%s|%d|0|1\n",
+                (const char *) doc->Location(), doc->Length(), doc->ContentType(),
+                (int) doc->ModTime());
+    }
+
+
+    return(1);
+}
+
+int TextCollector::FlushWordDB()
+{
+    // Flush the word list, then close the word database.  Always returns 1.
+    if (temp_doc_count)
+    {
+        words.Flush();
+        temp_doc_count = 0;
+    }
+    words.Flush();
+    words.Close();
+    return 1;
+}
+        
+//*****************************************************************************
+// void TextCollector::RetrievedDocument(DocumentRef * ref)
+//   Parse the current document ('doc', set by IndexDoc()) and copy the
+//   collected results into 'ref'.  Only self-parseable documents are parsed
+//   here: the branch that would obtain an external Parsable object is
+//   commented out.
+void
+TextCollector::RetrievedDocument(DocumentRef * ref)
+{
+	n_links = 0;
+	current_ref = ref;
+	current_title = 0;
+	word_context.Anchor(0);
+	current_time = 0;
+	current_head = 0;
+	current_meta_dsc = 0;
+    time_t doc_time;
+
+    //Check if the Document is self-parseable
+    //We will pass ourselves as a callback object for all the got_*() routines
+	if (doc->SelfParseable() == TRUE)
+	{
+		doc->internalParser(*this);
+	}
+	else
+    {
+    	// Disabled: create a parser object and let it have a go at the document.
+	    // We will pass ourselves as a callback object for all the got_*()
+    	// routines.
+	    // This will generate the Parsable object as a specific parser
+	    /*
+		Parsable *parsable = doc->getParsable();
+		if (parsable)
+			parsable->parse(*this, *base);
+		else
+		{				  // If we didn't get a parser, then we should get rid of this!
+			ref->DocState(Reference_noindex);
+			return;
+		}
+        */
+	}
+
+	// No Parsable object is created above, so there is nothing to dispose
+	// of here.
+
+
+	//
+	// Update the document reference
+	//
+	ref->DocTitle((char *) current_title);
+	ref->DocHead((char *) current_head);
+	ref->DocMetaDsc((char *) current_meta_dsc);
+	// Older logic, kept for reference: prefer current_time from got_time().
+/*    if (current_time == 0)
+		ref->DocTime(doc->ModTime());
+	else
+		ref->DocTime(current_time); */
+    // Current logic: the document's ModTime(), or the time now when it is 0.
+    doc_time = doc->ModTime();
+    if(doc_time != 0)
+        ref->DocTime(doc_time);
+    else
+        ref->DocTime(time(NULL));
+
+	ref->DocSize(doc->Length());
+	ref->DocAccessed(time(0));
+	ref->DocLinks(n_links);
+}
+
+
+//*****************************************************************************
+// void TextCollector::got_word(const char *word, int location, int heading)
+//   Stores 'word', flagged by factor[heading], in the word list, then the
+//   parts of a compound word.  The location is normalized to be in the range 0 - 1000.
+void
+TextCollector::got_word(const char *word, int location, int heading)
+{
+	if (debug > 3)
+		cout << "word: " << word << '@' << location << endl;
+	if (heading >= 11 || heading < 0)	// Current limits for headings
+		heading = 0;		  // Assume it's just normal text
+	// Nothing is stored unless word tracking is on and the word is long enough.
+	if ((trackWords) && (strlen(word) >= minimumWordLength))
+	{
+		String w = word;
+		HtWordReference wordRef;
+
+		wordRef.Location(location);
+		wordRef.Flags(factor[heading]);
+
+		wordRef.Word(w);
+		words.Replace(WordReference::Merge(wordRef, word_context));
+
+#ifdef DEBUG
+        cout << "Adding: [" << w <<  "]"<< endl;  //NEALR
+#endif
+		// Check for compound words: for nparts = 1, 2, ... take each group of
+		// nparts consecutive word-character runs; repeat while a pass counted > 2.
+		String parts = word;
+		int added;
+		int nparts = 1;
+		do
+		{
+			added = 0;
+			char *start = parts.get();
+			char *punctp = 0, *nextp = 0, *p;
+			char punct;
+			int n;
+			while (*start)
+			{
+				p = start;
+				for (n = 0; n < nparts; n++)
+				{
+					while (HtIsStrictWordChar((unsigned char) *p))
+						p++;
+					punctp = p;	// first character after the n-th run
+					if (!*punctp && n + 1 < nparts)
+						break;	// string ended before nparts runs were found
+					while (*p && !HtIsStrictWordChar((unsigned char) *p))
+						p++;
+					if (n == 0)
+						nextp = p;	// where the next group will start
+				}
+				if (n < nparts)
+					break;
+				punct = *punctp;
+				*punctp = '\0';	// temporarily end the string after the group
+				if (*start && (*p || start > parts.get()))	// skip a group spanning the whole word
+				{
+					w = start;
+					HtStripPunctuation(w);
+					if (w.length() >= minimumWordLength)
+					{
+						wordRef.Word(w);
+						words.Replace(WordReference::Merge(wordRef, word_context));
+						if (debug > 3)
+							cout << "word part: " << start << '@' << location << endl;
+
+#ifdef DEBUG
+                        cout << "Adding: [" << w <<  "]"<< endl;  //NEALR
+#endif
+					}
+					added++;	// counted even when too short to be stored
+				}
+				start = nextp;
+				*punctp = punct;	// restore the character overwritten above
+			}
+			nparts++;
+		}
+		while (added > 2);
+	}
+}
+
+
+//*****************************************************************************
+// got_title(): callback -- keep the title for RetrievedDocument() to store.
+void TextCollector::got_title(const char *title)
+{
+	if (debug > 1)
+	{
+		cout << "\ntitle: " << title << endl;
+	}
+	current_title = title;
+}
+
+//*****************************************************************************
+// got_time(): callback for a document date string.
+//
+void TextCollector::got_time(const char *time)
+{
+	// Start from the current value so that (per the original author's note)
+	// a string that cannot be converted leaves current_time as it was.
+	HtDateTime parsed(current_time);
+
+	if (debug > 1)
+	{
+		cout << "\ntime: " << time << endl;
+	}
+
+	//
+	// As defined by the Dublin Core, this should be YYYY-MM-DD
+	// In the future, we'll need to deal with the scheme portion
+	//  in case someone picks a different format.
+	//
+	parsed.SetFTime(time, "%Y-%m-%d");
+	current_time = parsed.GetTime_t();
+}
+
+//*****************************************************************************
+// got_head(): callback -- keep 'head' for RetrievedDocument() to pass to DocHead().
+void TextCollector::got_head(const char *head)
+{
+	if (debug > 4)
+	{
+		cout << "head: " << head << endl;
+	}
+	current_head = head;
+}
+
+//*****************************************************************************
+// got_meta_dsc(): callback -- keep 'md' for RetrievedDocument() to pass to DocMetaDsc().
+void TextCollector::got_meta_dsc(const char *md)
+{
+	if (debug > 4)
+	{
+		cout << "meta description: " << md << endl;
+	}
+	current_meta_dsc = md;
+}
+
+
+//*****************************************************************************
+// got_meta_email(): callback -- store the address on the current reference.
+void TextCollector::got_meta_email(const char *e)
+{
+	if (debug > 1)
+	{
+		cout << "\nmeta email: " << e << endl;
+	}
+	current_ref->DocEmail(e);
+}
+
+
+//*****************************************************************************
+// got_meta_notification(): callback -- store the date on the current reference.
+void TextCollector::got_meta_notification(const char *e)
+{
+	if (debug > 1)
+	{
+		cout << "\nmeta notification date: " << e << endl;
+	}
+	current_ref->DocNotification(e);
+}
+
+
+//*****************************************************************************
+// got_meta_subject(): callback -- store the subject on the current reference.
+void TextCollector::got_meta_subject(const char *e)
+{
+	if (debug > 1)
+	{
+		cout << "\nmeta subject: " << e << endl;	// was misspelled "subect"
+	}
+	current_ref->DocSubject(e);
+}
+
+
+//*****************************************************************************
+// got_noindex(): callback -- mark the current reference as Reference_noindex.
+void TextCollector::got_noindex()
+{
+	if (debug > 1)
+	{
+		cout << "\nMETA ROBOT: Noindex " << current_ref->DocURL() << endl;
+	}
+	current_ref->DocState(Reference_noindex);
+}