diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htcommon/DocumentDB.cc')
-rw-r--r-- | debian/htdig/htdig-3.2.0b6/htcommon/DocumentDB.cc | 655 |
1 files changed, 655 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/DocumentDB.cc b/debian/htdig/htdig-3.2.0b6/htcommon/DocumentDB.cc new file mode 100644 index 00000000..0ccbf3cb --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/DocumentDB.cc @@ -0,0 +1,655 @@ +// +// DocumentDB.cc +// +// DocumentDB: This class is the interface to the database of document +// references. This database is only used while digging. +// An extract of this database is used for searching. +// This is because digging requires a different index +// than searching. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: DocumentDB.cc,v 1.34 2004/05/28 13:15:12 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "DocumentDB.h" +#include "Database.h" +#include "HtURLCodec.h" +#include "IntObject.h" +#include "HtZlibCodec.h" + +#include <stdio.h> +#include <stdlib.h> +#include <ctype.h> + +#ifndef _MSC_VER /* _WIN32 */ +#include <unistd.h> +#endif + +#ifdef HAVE_STD +#include <iostream> +#include <fstream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <iostream.h> +#include <fstream.h> +#endif /* HAVE_STD */ + +#include <errno.h> + +//***************************************************************************** +// DocumentDB::DocumentDB() +// +DocumentDB::DocumentDB() +{ + isopen = 0; + isread = 0; + + // The first document number (NEXT_DOC_ID_RECORD) is used to + // store the nextDocID number itself into. We avoid using + // an all-0 key for this, mostly for being superstitious + // about letting in bugs. + nextDocID = NEXT_DOC_ID_RECORD + 1; +} + + +//***************************************************************************** +// DocumentDB::~DocumentDB() +// +DocumentDB::~DocumentDB() +{ + Close(); +} + + +//***************************************************************************** +// int DocumentDB::Open(char *filename, char *indexname, char *headname) +// We will attempt to open up an existing document database. If it +// doesn't exist, we'll create a new one. If we are succesful in +// opening the database, we need to look for our special record +// which contains the next document ID to use. +// There may also be an URL -> DocID index database to take +// care of, as well as a DocID -> DocHead excerpt database. +// +int DocumentDB::Open(const String& filename, const String& indexfilename, const String& headname) +{ + // If the database is already open, we'll close it + // We might be opening this object with a new filename, so we'll be safe + Close(); + + dbf = 0; + i_dbf = 0; + h_dbf = 0; + + i_dbf = Database::getDatabaseInstance(DB_HASH); + + if (i_dbf->OpenReadWrite(indexfilename, 0666) != OK) { + cerr << "DocumentDB::Open: " << indexfilename << " " << strerror(errno) << "\n"; + return NOTOK; + } + + h_dbf = Database::getDatabaseInstance(DB_HASH); + + if (h_dbf->OpenReadWrite(headname, 0666) != OK) { + cerr << "DocumentDB::Open: " << headname << " " << strerror(errno) << "\n"; + return NOTOK; + } + + dbf = Database::getDatabaseInstance(DB_HASH); + + if (dbf->OpenReadWrite(filename, 0666) == OK) + { + String data; + int specialRecordNumber = NEXT_DOC_ID_RECORD; + String key((char *) &specialRecordNumber, + sizeof specialRecordNumber); + if (dbf->Get(key, data) == OK) + { + memcpy(&nextDocID, data.get(), sizeof nextDocID); + } + + isopen = 1; + return OK; + } + else { + cerr << "DocumentDB::Open: " << filename << " " << strerror(errno) << "\n"; + return NOTOK; + } +} + + +//***************************************************************************** +// int DocumentDB::Read(char *filename, char *indexname, char *headname) +// We will attempt to open up an existing document database, +// and accompanying index database and excerpt database +// +int DocumentDB::Read(const String& filename, const String& indexfilename , const String& headfilename ) +{ + // If the database is already open, we'll close it + // We might be opening this object with a new filename, so we'll be safe + Close(); + + dbf = 0; + i_dbf = 0; + h_dbf = 0; + + if (!indexfilename.empty()) + { + i_dbf = Database::getDatabaseInstance(DB_HASH); + + if (i_dbf->OpenRead(indexfilename) != OK) + return NOTOK; + } + + if (!headfilename.empty()) + { + h_dbf = Database::getDatabaseInstance(DB_HASH); + + if (h_dbf->OpenRead(headfilename) != OK) + return NOTOK; + } + + dbf = Database::getDatabaseInstance(DB_HASH); + + if (dbf->OpenRead(filename) == OK) + { + isopen = 1; + isread = 1; + return OK; + } + else + return NOTOK; +} + + +//***************************************************************************** +// int DocumentDB::Close() +// Close the database. Before we close it, we first need to update +// the special record which keeps track our nextDocID variable. +// +int DocumentDB::Close() +{ + if (!isopen) return OK; + + if (!isread) + { + int specialRecordNumber = NEXT_DOC_ID_RECORD; + String key((char *) &specialRecordNumber, + sizeof specialRecordNumber); + String data((char *) &nextDocID, sizeof nextDocID); + + dbf->Put(key, data); + } + + if (i_dbf) + { + i_dbf->Close(); + delete i_dbf; + i_dbf = 0; + } + if (h_dbf) + { + h_dbf->Close(); + delete h_dbf; + h_dbf = 0; + } + + dbf->Close(); + delete dbf; + dbf = 0; + isopen = 0; + isread = 0; + return OK; +} + + +//***************************************************************************** +// int DocumentDB::Add(DocumentRef &doc) +// +int DocumentDB::Add(DocumentRef &doc) +{ + int docID = doc.DocID(); + + String temp = 0; + + doc.Serialize(temp); + + String key((char *) &docID, sizeof docID); + dbf->Put(key, temp); + + if (h_dbf) + { + if (doc.DocHeadIsSet()) + { + temp = HtZlibCodec::instance()->encode(doc.DocHead()); + h_dbf->Put(key, temp); + } + } + else + // If there was no excerpt index when we write, something is wrong. + return NOTOK; + + if (i_dbf) + { + temp = doc.DocURL(); + i_dbf->Put(HtURLCodec::instance()->encode(temp), key); + return OK; + } + else + // If there was no index when we write, something is wrong. + return NOTOK; +} + + +//***************************************************************************** +// int DocumentDB::ReadExcerpt(DocumentRef &ref) +// We will attempt to access the excerpt for this ref +// +int DocumentDB::ReadExcerpt(DocumentRef &ref) +{ + String data; + int docID = ref.DocID(); + String key((char *) &docID, sizeof docID); + + if (!h_dbf) + return NOTOK; + if (h_dbf->Get(key, data) == NOTOK) + return NOTOK; + + ref.DocHead((char*)HtZlibCodec::instance()->decode(data)); + + return OK; +} + +//***************************************************************************** +// DocumentRef *DocumentDB::operator [] (int docID) +// +DocumentRef *DocumentDB::operator [] (int docID) +{ + String data; + String key((char *) &docID, sizeof docID); + + if (dbf->Get(key, data) == NOTOK) + return 0; + + DocumentRef *ref = new DocumentRef; + ref->Deserialize(data); + return ref; +} + + +//***************************************************************************** +// DocumentRef *DocumentDB::operator [] (const String& u) +// +DocumentRef *DocumentDB::operator [] (const String& u) +{ + String data; + String docIDstr; + + // If there is no index db, then just give up + // (do *not* construct a list and traverse it). + if (i_dbf == 0) + return 0; + else + { + String url(u); + + if (i_dbf->Get(HtURLCodec::instance()->encode(url), docIDstr) == NOTOK) + return 0; + } + + if (dbf->Get(docIDstr, data) == NOTOK) + return 0; + + DocumentRef *ref = new DocumentRef; + ref->Deserialize(data); + return ref; +} + +//***************************************************************************** +// int DocumentDB::Exists(int docID) +// +int DocumentDB::Exists(int docID) +{ + String key((char *) &docID, sizeof docID); + return dbf->Exists(key); +} + +//***************************************************************************** +// int DocumentDB::Delete(int docID) +// +int DocumentDB::Delete(int docID) +{ + String key((char*) &docID, sizeof docID); + String data; + + if (i_dbf == 0 || dbf->Get(key, data) == NOTOK) + return NOTOK; + + DocumentRef *ref = new DocumentRef; + ref->Deserialize(data); + String url = ref->DocURL(); + delete ref; + + // We have to be really careful about deleting by URL, we might + // have a newer "edition" with the same URL and different DocID + String docIDstr; + String encodedURL = HtURLCodec::instance()->encode(url); + if (i_dbf->Get(encodedURL, docIDstr) == NOTOK) + return NOTOK; + + // Only delete if we have a match between what we want to delete + // and what's in the database + if (key == docIDstr && i_dbf->Delete(encodedURL) == NOTOK) + return NOTOK; + + if (h_dbf == 0 || h_dbf->Delete(key) == NOTOK) + return NOTOK; + + return dbf->Delete(key); +} + +//***************************************************************************** +// int DocumentDB::DumpDB(char *filename, int verbose) +// Create an extract from our database which can be used by an +// external application. The extract will consist of lines with fields +// separated by tabs. +// +// The extract will likely not be sorted by anything in particular +// +int DocumentDB::DumpDB(const String& filename, int verbose) +{ + DocumentRef *ref; + List *descriptions, *anchors; + char *strkey; + String data; + FILE *fl; + String docKey(sizeof(int)); + + if((fl = fopen(filename, "w")) == 0) { + perror(form("DocumentDB::DumpDB: opening %s for writing", + (const char*)filename)); + return NOTOK; + } + + dbf->Start_Get(); + while ((strkey = dbf->Get_Next())) + { + int docID; + memcpy(&docID, strkey, sizeof docID); + + docKey = 0; + docKey.append((char *) &docID, sizeof docID); + + dbf->Get(docKey, data); + + if (docID != NEXT_DOC_ID_RECORD) + { + ref = new DocumentRef; + ref->Deserialize(data); + if (h_dbf) + { + h_dbf->Get(docKey,data); + ref->DocHead((char*)HtZlibCodec::instance()->decode(data)); + } + fprintf(fl, "%d", ref->DocID()); + fprintf(fl, "\tu:%s", ref->DocURL()); + fprintf(fl, "\tt:%s", ref->DocTitle()); + fprintf(fl, "\ta:%d", ref->DocState()); + fprintf(fl, "\tm:%d", (int) ref->DocTime()); + fprintf(fl, "\ts:%d", ref->DocSize()); + fprintf(fl, "\tH:%s", ref->DocHead()); + fprintf(fl, "\th:%s", ref->DocMetaDsc()); + fprintf(fl, "\tl:%d", (int) ref->DocAccessed()); + fprintf(fl, "\tL:%d", ref->DocLinks()); + fprintf(fl, "\tb:%d", ref->DocBackLinks()); + fprintf(fl, "\tc:%d", ref->DocHopCount()); + fprintf(fl, "\tg:%d", ref->DocSig()); + fprintf(fl, "\te:%s", ref->DocEmail()); + fprintf(fl, "\tn:%s", ref->DocNotification()); + fprintf(fl, "\tS:%s", ref->DocSubject()); + fprintf(fl, "\td:"); + descriptions = ref->Descriptions(); + String *description; + descriptions->Start_Get(); + int first = 1; + while ((description = (String *) descriptions->Get_Next())) + { + if (!first) + fprintf(fl, "\001"); + first = 0; + fprintf(fl, "%s", description->get()); + } + fprintf(fl, "\tA:"); + anchors = ref->DocAnchors(); + String *anchor; + anchors->Start_Get(); + first = 1; + while ((anchor = (String *) anchors->Get_Next())) + { + if (!first) + fprintf(fl, "\001"); + first = 0; + fprintf(fl, "%s", anchor->get()); + } + fprintf(fl, "\n"); + delete ref; + } + } + + fclose(fl); + + return OK; +} + +//***************************************************************************** +// int DocumentDB::LoadDB(const String &filename, int verbose) +// Load an extract to our database from an ASCII file +// The extract will consist of lines with fields separated by tabs. +// The lines need not be sorted in any fashion. +// +int DocumentDB::LoadDB(const String& filename, int verbose) +{ + FILE *input; + String docKey(sizeof(int)); + DocumentRef ref; + StringList descriptions, anchors; + char *token, field; + String data; + + if((input = fopen(filename, "r")) == 0) { + perror(form("DocumentDB::LoadDB: opening %s for reading", + (const char*)filename)); + return NOTOK; + } + + while (data.readLine(input)) + { + token = strtok(data, "\t"); + if (token == NULL) + continue; + + ref.DocID(atoi(token)); + + if (verbose) + cout << "\t loading document ID: " << ref.DocID() << endl; + + while ( (token = strtok(0, "\t")) ) + { + field = *token; + token += 2; + + if (verbose > 2) + cout << "\t field: " << field; + + switch(field) + { + case 'u': // URL + ref.DocURL(token); + break; + case 't': // Title + ref.DocTitle(token); + break; + case 'a': // State + ref.DocState(atoi(token)); + break; + case 'm': // Modified + ref.DocTime(atoi(token)); + break; + case 's': // Size + ref.DocSize(atoi(token)); + break; + case 'H': // Head + ref.DocHead(token); + break; + case 'h': // Meta Description + ref.DocMetaDsc(token); + break; + case 'l': // Accessed + ref.DocAccessed(atoi(token)); + break; + case 'L': // Links + ref.DocLinks(atoi(token)); + break; + case 'b': // BackLinks + ref.DocBackLinks(atoi(token)); + break; + case 'c': // HopCount + ref.DocHopCount(atoi(token)); + break; + case 'g': // Signature + ref.DocSig(atoi(token)); + break; + case 'e': // E-mail + ref.DocEmail(token); + break; + case 'n': // Notification + ref.DocNotification(token); + break; + case 'S': // Subject + ref.DocSubject(token); + break; + case 'd': // Descriptions + descriptions.Create(token, '\001'); + ref.Descriptions(descriptions); + break; + case 'A': // Anchors + anchors.Create(token, '\001'); + ref.DocAnchors(anchors); + break; + default: + break; + } + + } + + + // We must be careful if the document already exists + // So we'll delete the old document and add the new one + if (Exists(ref.DocID())) + { + Delete(ref.DocID()); + } + Add(ref); + + // If we add a record with an ID past nextDocID, update it + if (ref.DocID() > nextDocID) + nextDocID = ref.DocID() + 1; + + descriptions.Destroy(); + anchors.Destroy(); + } + + fclose(input); + return OK; +} + +//***************************************************************************** +// List *DocumentDB::URLs() +// Return a list of all the URLs in the database +// Only available when there's an URL -> DocID index db handy. +// +List *DocumentDB::URLs() +{ + List *list = new List; + char *coded_key; + + if (i_dbf == 0) + return 0; + + i_dbf->Start_Get(); + while ((coded_key = i_dbf->Get_Next())) + { + String *key = new String(HtURLCodec::instance()->decode(coded_key)); + list->Add(key); + } + return list; +} + + +//***************************************************************************** +// List *DocumentDB::DocIDs() +// Return a list of all the DocIDs in the database +// +List *DocumentDB::DocIDs() +{ + List *list = new List; + char *key; + + dbf->Start_Get(); + while ((key = dbf->Get_Next())) + { + int docID; + memcpy (&docID, key, sizeof docID); + + if (docID != NEXT_DOC_ID_RECORD) + list->Add(new IntObject(docID)); + } + return list; +} + +//***************************************************************************** +// private +// int readLine(FILE *in, String &line) +// +int readLine(FILE *in, String &line) +{ + char buffer[2048]; + int length; + + line = 0; + while (fgets(buffer, sizeof(buffer), in)) + { + length = strlen(buffer); + if (buffer[length - 1] == '\n') + { + // + // A full line has been read. Return it. + // + line << buffer; + line.chop('\n'); + return 1; + } + else + { + // + // Only a partial line was read. Append it to the line + // and read some more. + // + line << buffer; + } + } + return line.length() > 0; +} + +// End of DocumentDB.cc |