diff options
author | Slávek Banko <[email protected]> | 2021-11-05 13:28:23 +0100 |
---|---|---|
committer | Slávek Banko <[email protected]> | 2021-11-05 13:28:23 +0100 |
commit | 8c787c3591c1c885b91a54128835b400858c5cca (patch) | |
tree | eca1b776912a305c4d45b3964038278a2fae1ead /debian/htdig/htdig-3.2.0b6/htcommon | |
parent | fe188b907cdf30dfdfe0eba9412e7f8749fec158 (diff) | |
download | extra-dependencies-8c787c3591c1c885b91a54128835b400858c5cca.tar.gz extra-dependencies-8c787c3591c1c885b91a54128835b400858c5cca.zip |
DEB htdig: Added to repository.
Signed-off-by: Slávek Banko <[email protected]>
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htcommon')
35 files changed, 16288 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/.cvsignore b/debian/htdig/htdig-3.2.0b6/htcommon/.cvsignore new file mode 100644 index 00000000..09dc8ef2 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/.cvsignore @@ -0,0 +1,7 @@ +Makefile +*.lo +*.la +.purify +.pure +.deps +.libs diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/DocumentDB.cc b/debian/htdig/htdig-3.2.0b6/htcommon/DocumentDB.cc new file mode 100644 index 00000000..0ccbf3cb --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/DocumentDB.cc @@ -0,0 +1,655 @@ +// +// DocumentDB.cc +// +// DocumentDB: This class is the interface to the database of document +// references. This database is only used while digging. +// An extract of this database is used for searching. +// This is because digging requires a different index +// than searching. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: DocumentDB.cc,v 1.34 2004/05/28 13:15:12 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "DocumentDB.h" +#include "Database.h" +#include "HtURLCodec.h" +#include "IntObject.h" +#include "HtZlibCodec.h" + +#include <stdio.h> +#include <stdlib.h> +#include <ctype.h> + +#ifndef _MSC_VER /* _WIN32 */ +#include <unistd.h> +#endif + +#ifdef HAVE_STD +#include <iostream> +#include <fstream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <iostream.h> +#include <fstream.h> +#endif /* HAVE_STD */ + +#include <errno.h> + +//***************************************************************************** +// DocumentDB::DocumentDB() +// +DocumentDB::DocumentDB() +{ + isopen = 0; + isread = 0; + + // The first document number (NEXT_DOC_ID_RECORD) is used to + // store the nextDocID number itself 
into. We avoid using + // an all-0 key for this, mostly for being superstitious + // about letting in bugs. + nextDocID = NEXT_DOC_ID_RECORD + 1; +} + + +//***************************************************************************** +// DocumentDB::~DocumentDB() +// +DocumentDB::~DocumentDB() +{ + Close(); +} + + +//***************************************************************************** +// int DocumentDB::Open(char *filename, char *indexname, char *headname) +// We will attempt to open up an existing document database. If it +// doesn't exist, we'll create a new one. If we are succesful in +// opening the database, we need to look for our special record +// which contains the next document ID to use. +// There may also be an URL -> DocID index database to take +// care of, as well as a DocID -> DocHead excerpt database. +// +int DocumentDB::Open(const String& filename, const String& indexfilename, const String& headname) +{ + // If the database is already open, we'll close it + // We might be opening this object with a new filename, so we'll be safe + Close(); + + dbf = 0; + i_dbf = 0; + h_dbf = 0; + + i_dbf = Database::getDatabaseInstance(DB_HASH); + + if (i_dbf->OpenReadWrite(indexfilename, 0666) != OK) { + cerr << "DocumentDB::Open: " << indexfilename << " " << strerror(errno) << "\n"; + return NOTOK; + } + + h_dbf = Database::getDatabaseInstance(DB_HASH); + + if (h_dbf->OpenReadWrite(headname, 0666) != OK) { + cerr << "DocumentDB::Open: " << headname << " " << strerror(errno) << "\n"; + return NOTOK; + } + + dbf = Database::getDatabaseInstance(DB_HASH); + + if (dbf->OpenReadWrite(filename, 0666) == OK) + { + String data; + int specialRecordNumber = NEXT_DOC_ID_RECORD; + String key((char *) &specialRecordNumber, + sizeof specialRecordNumber); + if (dbf->Get(key, data) == OK) + { + memcpy(&nextDocID, data.get(), sizeof nextDocID); + } + + isopen = 1; + return OK; + } + else { + cerr << "DocumentDB::Open: " << filename << " " << strerror(errno) << "\n"; + 
return NOTOK; + } +} + + +//***************************************************************************** +// int DocumentDB::Read(char *filename, char *indexname, char *headname) +// We will attempt to open up an existing document database, +// and accompanying index database and excerpt database +// +int DocumentDB::Read(const String& filename, const String& indexfilename , const String& headfilename ) +{ + // If the database is already open, we'll close it + // We might be opening this object with a new filename, so we'll be safe + Close(); + + dbf = 0; + i_dbf = 0; + h_dbf = 0; + + if (!indexfilename.empty()) + { + i_dbf = Database::getDatabaseInstance(DB_HASH); + + if (i_dbf->OpenRead(indexfilename) != OK) + return NOTOK; + } + + if (!headfilename.empty()) + { + h_dbf = Database::getDatabaseInstance(DB_HASH); + + if (h_dbf->OpenRead(headfilename) != OK) + return NOTOK; + } + + dbf = Database::getDatabaseInstance(DB_HASH); + + if (dbf->OpenRead(filename) == OK) + { + isopen = 1; + isread = 1; + return OK; + } + else + return NOTOK; +} + + +//***************************************************************************** +// int DocumentDB::Close() +// Close the database. Before we close it, we first need to update +// the special record which keeps track our nextDocID variable. 
+// +int DocumentDB::Close() +{ + if (!isopen) return OK; + + if (!isread) + { + int specialRecordNumber = NEXT_DOC_ID_RECORD; + String key((char *) &specialRecordNumber, + sizeof specialRecordNumber); + String data((char *) &nextDocID, sizeof nextDocID); + + dbf->Put(key, data); + } + + if (i_dbf) + { + i_dbf->Close(); + delete i_dbf; + i_dbf = 0; + } + if (h_dbf) + { + h_dbf->Close(); + delete h_dbf; + h_dbf = 0; + } + + dbf->Close(); + delete dbf; + dbf = 0; + isopen = 0; + isread = 0; + return OK; +} + + +//***************************************************************************** +// int DocumentDB::Add(DocumentRef &doc) +// +int DocumentDB::Add(DocumentRef &doc) +{ + int docID = doc.DocID(); + + String temp = 0; + + doc.Serialize(temp); + + String key((char *) &docID, sizeof docID); + dbf->Put(key, temp); + + if (h_dbf) + { + if (doc.DocHeadIsSet()) + { + temp = HtZlibCodec::instance()->encode(doc.DocHead()); + h_dbf->Put(key, temp); + } + } + else + // If there was no excerpt index when we write, something is wrong. + return NOTOK; + + if (i_dbf) + { + temp = doc.DocURL(); + i_dbf->Put(HtURLCodec::instance()->encode(temp), key); + return OK; + } + else + // If there was no index when we write, something is wrong. 
+ return NOTOK; +} + + +//***************************************************************************** +// int DocumentDB::ReadExcerpt(DocumentRef &ref) +// We will attempt to access the excerpt for this ref +// +int DocumentDB::ReadExcerpt(DocumentRef &ref) +{ + String data; + int docID = ref.DocID(); + String key((char *) &docID, sizeof docID); + + if (!h_dbf) + return NOTOK; + if (h_dbf->Get(key, data) == NOTOK) + return NOTOK; + + ref.DocHead((char*)HtZlibCodec::instance()->decode(data)); + + return OK; +} + +//***************************************************************************** +// DocumentRef *DocumentDB::operator [] (int docID) +// +DocumentRef *DocumentDB::operator [] (int docID) +{ + String data; + String key((char *) &docID, sizeof docID); + + if (dbf->Get(key, data) == NOTOK) + return 0; + + DocumentRef *ref = new DocumentRef; + ref->Deserialize(data); + return ref; +} + + +//***************************************************************************** +// DocumentRef *DocumentDB::operator [] (const String& u) +// +DocumentRef *DocumentDB::operator [] (const String& u) +{ + String data; + String docIDstr; + + // If there is no index db, then just give up + // (do *not* construct a list and traverse it). 
+ if (i_dbf == 0) + return 0; + else + { + String url(u); + + if (i_dbf->Get(HtURLCodec::instance()->encode(url), docIDstr) == NOTOK) + return 0; + } + + if (dbf->Get(docIDstr, data) == NOTOK) + return 0; + + DocumentRef *ref = new DocumentRef; + ref->Deserialize(data); + return ref; +} + +//***************************************************************************** +// int DocumentDB::Exists(int docID) +// +int DocumentDB::Exists(int docID) +{ + String key((char *) &docID, sizeof docID); + return dbf->Exists(key); +} + +//***************************************************************************** +// int DocumentDB::Delete(int docID) +// +int DocumentDB::Delete(int docID) +{ + String key((char*) &docID, sizeof docID); + String data; + + if (i_dbf == 0 || dbf->Get(key, data) == NOTOK) + return NOTOK; + + DocumentRef *ref = new DocumentRef; + ref->Deserialize(data); + String url = ref->DocURL(); + delete ref; + + // We have to be really careful about deleting by URL, we might + // have a newer "edition" with the same URL and different DocID + String docIDstr; + String encodedURL = HtURLCodec::instance()->encode(url); + if (i_dbf->Get(encodedURL, docIDstr) == NOTOK) + return NOTOK; + + // Only delete if we have a match between what we want to delete + // and what's in the database + if (key == docIDstr && i_dbf->Delete(encodedURL) == NOTOK) + return NOTOK; + + if (h_dbf == 0 || h_dbf->Delete(key) == NOTOK) + return NOTOK; + + return dbf->Delete(key); +} + +//***************************************************************************** +// int DocumentDB::DumpDB(char *filename, int verbose) +// Create an extract from our database which can be used by an +// external application. The extract will consist of lines with fields +// separated by tabs. 
+// +// The extract will likely not be sorted by anything in particular +// +int DocumentDB::DumpDB(const String& filename, int verbose) +{ + DocumentRef *ref; + List *descriptions, *anchors; + char *strkey; + String data; + FILE *fl; + String docKey(sizeof(int)); + + if((fl = fopen(filename, "w")) == 0) { + perror(form("DocumentDB::DumpDB: opening %s for writing", + (const char*)filename)); + return NOTOK; + } + + dbf->Start_Get(); + while ((strkey = dbf->Get_Next())) + { + int docID; + memcpy(&docID, strkey, sizeof docID); + + docKey = 0; + docKey.append((char *) &docID, sizeof docID); + + dbf->Get(docKey, data); + + if (docID != NEXT_DOC_ID_RECORD) + { + ref = new DocumentRef; + ref->Deserialize(data); + if (h_dbf) + { + h_dbf->Get(docKey,data); + ref->DocHead((char*)HtZlibCodec::instance()->decode(data)); + } + fprintf(fl, "%d", ref->DocID()); + fprintf(fl, "\tu:%s", ref->DocURL()); + fprintf(fl, "\tt:%s", ref->DocTitle()); + fprintf(fl, "\ta:%d", ref->DocState()); + fprintf(fl, "\tm:%d", (int) ref->DocTime()); + fprintf(fl, "\ts:%d", ref->DocSize()); + fprintf(fl, "\tH:%s", ref->DocHead()); + fprintf(fl, "\th:%s", ref->DocMetaDsc()); + fprintf(fl, "\tl:%d", (int) ref->DocAccessed()); + fprintf(fl, "\tL:%d", ref->DocLinks()); + fprintf(fl, "\tb:%d", ref->DocBackLinks()); + fprintf(fl, "\tc:%d", ref->DocHopCount()); + fprintf(fl, "\tg:%d", ref->DocSig()); + fprintf(fl, "\te:%s", ref->DocEmail()); + fprintf(fl, "\tn:%s", ref->DocNotification()); + fprintf(fl, "\tS:%s", ref->DocSubject()); + fprintf(fl, "\td:"); + descriptions = ref->Descriptions(); + String *description; + descriptions->Start_Get(); + int first = 1; + while ((description = (String *) descriptions->Get_Next())) + { + if (!first) + fprintf(fl, "\001"); + first = 0; + fprintf(fl, "%s", description->get()); + } + fprintf(fl, "\tA:"); + anchors = ref->DocAnchors(); + String *anchor; + anchors->Start_Get(); + first = 1; + while ((anchor = (String *) anchors->Get_Next())) + { + if (!first) + fprintf(fl, 
"\001"); + first = 0; + fprintf(fl, "%s", anchor->get()); + } + fprintf(fl, "\n"); + delete ref; + } + } + + fclose(fl); + + return OK; +} + +//***************************************************************************** +// int DocumentDB::LoadDB(const String &filename, int verbose) +// Load an extract to our database from an ASCII file +// The extract will consist of lines with fields separated by tabs. +// The lines need not be sorted in any fashion. +// +int DocumentDB::LoadDB(const String& filename, int verbose) +{ + FILE *input; + String docKey(sizeof(int)); + DocumentRef ref; + StringList descriptions, anchors; + char *token, field; + String data; + + if((input = fopen(filename, "r")) == 0) { + perror(form("DocumentDB::LoadDB: opening %s for reading", + (const char*)filename)); + return NOTOK; + } + + while (data.readLine(input)) + { + token = strtok(data, "\t"); + if (token == NULL) + continue; + + ref.DocID(atoi(token)); + + if (verbose) + cout << "\t loading document ID: " << ref.DocID() << endl; + + while ( (token = strtok(0, "\t")) ) + { + field = *token; + token += 2; + + if (verbose > 2) + cout << "\t field: " << field; + + switch(field) + { + case 'u': // URL + ref.DocURL(token); + break; + case 't': // Title + ref.DocTitle(token); + break; + case 'a': // State + ref.DocState(atoi(token)); + break; + case 'm': // Modified + ref.DocTime(atoi(token)); + break; + case 's': // Size + ref.DocSize(atoi(token)); + break; + case 'H': // Head + ref.DocHead(token); + break; + case 'h': // Meta Description + ref.DocMetaDsc(token); + break; + case 'l': // Accessed + ref.DocAccessed(atoi(token)); + break; + case 'L': // Links + ref.DocLinks(atoi(token)); + break; + case 'b': // BackLinks + ref.DocBackLinks(atoi(token)); + break; + case 'c': // HopCount + ref.DocHopCount(atoi(token)); + break; + case 'g': // Signature + ref.DocSig(atoi(token)); + break; + case 'e': // E-mail + ref.DocEmail(token); + break; + case 'n': // Notification + ref.DocNotification(token); 
+ break; + case 'S': // Subject + ref.DocSubject(token); + break; + case 'd': // Descriptions + descriptions.Create(token, '\001'); + ref.Descriptions(descriptions); + break; + case 'A': // Anchors + anchors.Create(token, '\001'); + ref.DocAnchors(anchors); + break; + default: + break; + } + + } + + + // We must be careful if the document already exists + // So we'll delete the old document and add the new one + if (Exists(ref.DocID())) + { + Delete(ref.DocID()); + } + Add(ref); + + // If we add a record with an ID past nextDocID, update it + if (ref.DocID() > nextDocID) + nextDocID = ref.DocID() + 1; + + descriptions.Destroy(); + anchors.Destroy(); + } + + fclose(input); + return OK; +} + +//***************************************************************************** +// List *DocumentDB::URLs() +// Return a list of all the URLs in the database +// Only available when there's an URL -> DocID index db handy. +// +List *DocumentDB::URLs() +{ + List *list = new List; + char *coded_key; + + if (i_dbf == 0) + return 0; + + i_dbf->Start_Get(); + while ((coded_key = i_dbf->Get_Next())) + { + String *key = new String(HtURLCodec::instance()->decode(coded_key)); + list->Add(key); + } + return list; +} + + +//***************************************************************************** +// List *DocumentDB::DocIDs() +// Return a list of all the DocIDs in the database +// +List *DocumentDB::DocIDs() +{ + List *list = new List; + char *key; + + dbf->Start_Get(); + while ((key = dbf->Get_Next())) + { + int docID; + memcpy (&docID, key, sizeof docID); + + if (docID != NEXT_DOC_ID_RECORD) + list->Add(new IntObject(docID)); + } + return list; +} + +//***************************************************************************** +// private +// int readLine(FILE *in, String &line) +// +int readLine(FILE *in, String &line) +{ + char buffer[2048]; + int length; + + line = 0; + while (fgets(buffer, sizeof(buffer), in)) + { + length = strlen(buffer); + if (buffer[length - 1] == '\n') + 
{ + // + // A full line has been read. Return it. + // + line << buffer; + line.chop('\n'); + return 1; + } + else + { + // + // Only a partial line was read. Append it to the line + // and read some more. + // + line << buffer; + } + } + return line.length() > 0; +} + +// End of DocumentDB.cc diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/DocumentDB.h b/debian/htdig/htdig-3.2.0b6/htcommon/DocumentDB.h new file mode 100644 index 00000000..51ade173 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/DocumentDB.h @@ -0,0 +1,96 @@ +// +// DocumentDB.h +// +// DocumentDB: This class is the interface to the database of document +// references. This database is only used while digging. +// An extract of this database is used for searching. +// This is because digging requires a different index +// than searching. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: DocumentDB.h,v 1.14 2004/05/28 13:15:12 lha Exp $ +// + +#ifndef _DocumentDB_h_ +#define _DocumentDB_h_ + +#include "DocumentRef.h" +#include "List.h" +#include "Database.h" +#include "IntObject.h" + +/* This is where the running document counter is stored. + The first real document number is the next. 
*/ +#define NEXT_DOC_ID_RECORD 1 + + +class DocumentDB +{ +public: + // + // Construction/Destruction + // + DocumentDB(); + ~DocumentDB(); + + + // + // Standard database operations + // + int Open(const String& filename, const String& indexfilename, const String& headname); + int Read(const String& filename, const String& indexfilename = 0, const String& headfilename = 0); + int Close(); + + int Add(DocumentRef &); + // These do not read in the excerpt + DocumentRef *operator [] (int DocID); + DocumentRef *operator [] (const String& url); + // You must call this to read the excerpt + int ReadExcerpt(DocumentRef &); + int Exists(int DocID); + int Delete(int DocID); + + // + // The database keeps track of document ids. Here is a way to get + // the next document id. + // + int NextDocID() {return nextDocID++;} + + // And here's a way to increment NextDocID after adding lots of records + // (for example when merging databases!) + void IncNextDocID (int next) {nextDocID += next;} + + // + // We will need to be able to iterate over the complete database. + // + + // This returns a list of all the URLs, as String * + List *URLs(); + + // This returns a list of all the DocIDs, as IntObject * + List *DocIDs(); + + // Dump the database out to an ASCII text file + int DumpDB(const String& filename, int verbose = 0); + + // Read in the database from an ASCII text file + // (created by DumpDB) + int LoadDB(const String& filename, int verbose = 0); + +private: + Database *dbf; + Database *i_dbf; + Database *h_dbf; + int isopen; + int isread; + int nextDocID; +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/DocumentRef.cc b/debian/htdig/htdig-3.2.0b6/htcommon/DocumentRef.cc new file mode 100644 index 00000000..97900cd3 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/DocumentRef.cc @@ -0,0 +1,547 @@ +// +// DocumentRef.cc +// +// DocumentRef: Reference to an indexed document. 
Keeps track of all +// information stored on the document, either by the dig +// or temporary search information. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: DocumentRef.cc,v 1.53 2004/05/28 13:15:12 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "DocumentRef.h" +#include "good_strtok.h" +#include "WordRecord.h" +#include "HtConfiguration.h" +#include "HtURLCodec.h" +#include "WordType.h" +#include "HtWordReference.h" +#include <stdlib.h> +#include <ctype.h> + +#ifdef HAVE_STD +#include <fstream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <fstream.h> +#endif /* HAVE_STD */ + +// extern HtConfiguration config; + +//***************************************************************************** +// DocumentRef::DocumentRef() +// +DocumentRef::DocumentRef() +{ + Clear(); +} + + +//***************************************************************************** +// DocumentRef::~DocumentRef() +// +DocumentRef::~DocumentRef() +{ +} + + +//***************************************************************************** +// void DocumentRef::Clear() +// +void DocumentRef::Clear() +{ + docID = 0; + docURL = 0; + docTime = 0; + docAccessed = 0; + docHead = 0; + docHeadIsSet = 0; + docMetaDsc = 0; + docTitle = 0; + descriptions.Destroy(); + docState = Reference_normal; + docSize = 0; + docLinks = 0; + docBackLinks = 0; + docAnchors.Destroy(); + docHopCount = 0; + docSig = 0; + docEmail = 0; + docNotification = 0; + docSubject = 0; + docScore = 0; + docAnchor = 0; +} + +//***************************************************************************** +// void DocumentRef::DocState(int s) +// +void DocumentRef::DocState(int s) +{ + // You can't 
easily do this with a cast, so we'll use a switch + switch(s) + { + case 0: + docState = Reference_normal; + break; + case 1: + docState = Reference_not_found; + break; + case 2: + docState = Reference_noindex; + break; + case 3: + docState = Reference_obsolete; + break; + } +} + + +enum +{ + DOC_ID, // 0 + DOC_TIME, // 1 + DOC_ACCESSED, // 2 + DOC_STATE, // 3 + DOC_SIZE, // 4 + DOC_LINKS, // 5 + DOC_IMAGESIZE, // 6 -- No longer used + DOC_HOPCOUNT, // 7 + DOC_URL, // 8 + DOC_HEAD, // 9 + DOC_TITLE, // 10 + DOC_DESCRIPTIONS, // 11 + DOC_ANCHORS, // 12 + DOC_EMAIL, // 13 + DOC_NOTIFICATION, // 14 + DOC_SUBJECT, // 15 + DOC_STRING, // 16 + DOC_METADSC, // 17 + DOC_BACKLINKS, // 18 + DOC_SIG // 19 +}; + +// Must be powers of two never reached by the DOC_... enums. +#define CHARSIZE_MARKER_BIT 64 +#define SHORTSIZE_MARKER_BIT 128 + +//***************************************************************************** +// void DocumentRef::Serialize(String &s) +// Convert all the data in the object to a string. +// The data is in the string is tagged with +// +void DocumentRef::Serialize(String &s) +{ + int length; + String *str; + +// +// The following macros make the serialization process a little easier +// to follow. Note that if an object to be serialized has the default +// value for this class, it it NOT serialized. This means that +// storage will be saved... 
+// +#define addnum(id, out, var) \ + if (var != 0) \ + { \ + if (var <= (unsigned char) ~1) \ + { \ + unsigned char _tmp = var; \ + out << (char) (id | CHARSIZE_MARKER_BIT); \ + out.append((char *) &_tmp, sizeof(_tmp)); \ + } \ + else if (var <= (unsigned short int) ~1) \ + { \ + unsigned short int _tmp = var; \ + out << (char) (id | SHORTSIZE_MARKER_BIT); \ + out.append((char *) &_tmp, sizeof(_tmp)); \ + } \ + else \ + { \ + out << (char) id; \ + out.append((char *) &var, sizeof(var)); \ + } \ + } + +#define addstring(id, out, str) \ + if (str.length()) \ + { \ + length = str.length(); \ + if (length <= (unsigned char) ~1) \ + { \ + unsigned char _tmp = length; \ + out << (char) (id | CHARSIZE_MARKER_BIT); \ + out.append((char *) &_tmp, sizeof(_tmp)); \ + } \ + else if (length <= (unsigned short int) ~1) \ + { \ + unsigned short int _tmp = length; \ + out << (char) (id | SHORTSIZE_MARKER_BIT); \ + out.append((char *) &_tmp, sizeof(_tmp)); \ + } \ + else \ + { \ + out << (char) id; \ + out.append((char *) &length, sizeof(length)); \ + } \ + out.append(str); \ + } + +// To keep compatibility with old databases, don't bother +// with long lists at all. Bloat the size for long strings with +// one char to just keep a ~1 marker since we don't know the +// endianness; we don't know where to put a endian-safe +// size-marker, and we probably rather want the full char to +// keep the length. Only strings shorter than (unsigned char) ~1 +// will be "optimized"; trying to optimize strings that fit in +// (unsigned short) does not seem to give anything substantial. 
+#define addlist(id, out, list) \ + if (list.Count()) \ + { \ + length = list.Count(); \ + if (length <= (unsigned short int) ~1) \ + { \ + if (length <= (unsigned char) ~1) \ + { \ + unsigned char _tmp = length; \ + out << (char) (id | CHARSIZE_MARKER_BIT); \ + out.append((char *) &_tmp, sizeof(_tmp)); \ + } \ + else \ + { \ + unsigned short int _tmp = length; \ + out << (char) (id | SHORTSIZE_MARKER_BIT); \ + out.append((char *) &_tmp, sizeof(_tmp)); \ + } \ + list.Start_Get(); \ + while ((str = (String *) list.Get_Next())) \ + { \ + length = str->length(); \ + if (length < (unsigned char) ~1) \ + { \ + unsigned char _tmp = length; \ + out.append((char*) &_tmp, sizeof(_tmp)); \ + } \ + else \ + { \ + unsigned char _tmp = ~1; \ + out.append((char*) &_tmp, sizeof(_tmp)); \ + out.append((char*) &length, sizeof(length)); \ + } \ + out.append(*str); \ + } \ + } \ + else \ + { \ + out << (char) id; \ + out.append((char *) &length, sizeof(length)); \ + list.Start_Get(); \ + while ((str = (String *) list.Get_Next())) \ + { \ + length = str->length(); \ + out.append((char*) &length, sizeof(length)); \ + out.append(*str); \ + } \ + } \ + } + + addnum(DOC_ID, s, docID); + addnum(DOC_TIME, s, docTime); + addnum(DOC_ACCESSED, s, docAccessed); + addnum(DOC_STATE, s, docState); + addnum(DOC_SIZE, s, docSize); + addnum(DOC_LINKS, s, docLinks); + addnum(DOC_BACKLINKS, s, docBackLinks); + addnum(DOC_HOPCOUNT, s, docHopCount); + addnum(DOC_SIG, s, docSig); + + // Use a temporary since the addstring macro will evaluate + // this multiple times. 
+ String tmps = HtURLCodec::instance()->encode(docURL); + addstring(DOC_URL, s, tmps); + // This is done in the DocumentDB code through the excerpt database + // addstring(DOC_HEAD, s, docHead); + addstring(DOC_METADSC, s, docMetaDsc); + addstring(DOC_TITLE, s, docTitle); + + addlist(DOC_DESCRIPTIONS, s, descriptions); + addlist(DOC_ANCHORS, s, docAnchors); + + addstring(DOC_EMAIL, s, docEmail); + addstring(DOC_NOTIFICATION, s, docNotification); + addstring(DOC_SUBJECT, s, docSubject); +} + + +//***************************************************************************** +// void DocumentRef::Deserialize(String &stream) +// Extract the contents of our private variables from the given +// character string. The character string is expected to have been +// created using the Serialize member. +// +void DocumentRef::Deserialize(String &stream) +{ + Clear(); + char *s = stream.get(); + char *end = s + stream.length(); + int length; + int count; + int i; + int x; + int throwaway; // As the name sounds--used for old fields + String *str; + +// There is a problem with getting a numeric value into a +// numeric unknown type that may be an enum (the other way +// around is simply by casting (int)). +// Supposedly the enum incarnates as a simple type, so we can +// just check the size and copy the bits. +#define MEMCPY_ASSIGN(to, from, type) \ + do { \ + type _tmp = (type) (from); \ + memcpy((char *) &(to), (char *) &_tmp, sizeof(to)); \ + } while (0) + +#define NUM_ASSIGN(to, from) \ + do { \ + if (sizeof(to) == sizeof(unsigned long int)) \ + MEMCPY_ASSIGN(to, from, unsigned long int); \ + else if (sizeof(to) == sizeof(unsigned int)) \ + MEMCPY_ASSIGN(to, from, unsigned int); \ + else if (sizeof(to) == sizeof(unsigned short int)) \ + MEMCPY_ASSIGN(to, from, unsigned short int); \ + else if (sizeof(to) == sizeof(unsigned char)) \ + MEMCPY_ASSIGN(to, from, unsigned char); \ + /* else fatal error here? 
*/ \ + } while (0) + +#define getnum(type, in, var) \ + if (type & CHARSIZE_MARKER_BIT) \ + { \ + NUM_ASSIGN(var, *(unsigned char *) in); \ + in += sizeof(unsigned char); \ + } \ + else if (type & SHORTSIZE_MARKER_BIT) \ + { \ + unsigned short int _tmp0; \ + memcpy((char *) &_tmp0, (char *) (in), sizeof(unsigned short)); \ + NUM_ASSIGN(var, _tmp0); \ + in += sizeof(unsigned short int); \ + } \ + else \ + { \ + memcpy((char *) &var, in, sizeof(var)); \ + in += sizeof(var); \ + } + +#define getstring(type, in, str) \ + getnum(type, in, length); \ + str = 0; \ + str.append(in, length); \ + in += length + +#define getlist(type, in, list) \ + getnum(type, in, count); \ + if (type & (CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT)) \ + { \ + for (i = 0; i < count; i++) \ + { \ + unsigned char _tmp = *(unsigned char *) in; \ + in += sizeof(_tmp); \ + if (_tmp < (unsigned char) ~1) \ + length = _tmp; \ + else \ + getnum(~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT), in, \ + length); \ + str = new String; \ + str->append(in, length); \ + list.Add(str); \ + in += length; \ + } \ + } \ + else \ + { \ + for (i = 0; i < count; i++) \ + { \ + getnum(~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT), in, \ + length); \ + str = new String; \ + str->append(in, length); \ + list.Add(str); \ + in += length; \ + } \ + } + + while (s < end) + { + x = (unsigned char) *s++; + switch (x & ~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT)) + { + case DOC_ID: + getnum(x, s, docID); + break; + case DOC_TIME: + getnum(x, s, docTime); + break; + case DOC_ACCESSED: + getnum(x, s, docAccessed); + break; + case DOC_STATE: + getnum(x, s, docState); + break; + case DOC_SIZE: + getnum(x, s, docSize); + break; + case DOC_IMAGESIZE: // No longer used + getnum(x, s, throwaway); + break; + case DOC_LINKS: + getnum(x, s, docLinks); + break; + case DOC_HOPCOUNT: + getnum(x, s, docHopCount); + break; + case DOC_BACKLINKS: + getnum(x, s, docBackLinks); + break; + case DOC_SIG: + getnum(x, s, docSig); + break; + case 
DOC_URL: + { + // Use a temporary since the addstring macro will evaluate + // this multiple times. + String tmps; + getstring(x, s, tmps); + + docURL = HtURLCodec::instance()->decode(tmps); + } + break; + case DOC_HEAD: + getstring(x, s, docHead); docHeadIsSet = 1; + break; + case DOC_METADSC: + getstring(x, s, docMetaDsc); + break; + case DOC_TITLE: + getstring(x, s, docTitle); + break; + case DOC_DESCRIPTIONS: + getlist(x, s, descriptions); + break; + case DOC_ANCHORS: + getlist(x, s, docAnchors); + break; + case DOC_EMAIL: + getstring(x, s, docEmail); + break; + case DOC_NOTIFICATION: + getstring(x, s, docNotification); + break; + case DOC_SUBJECT: + getstring(x, s, docSubject); + break; + case DOC_STRING: + // This is just a debugging string. Ignore it. + break; + default: + cerr << "BAD TAG IN SERIALIZED DATA: " << x << endl; + return; + } + } +} + + +//***************************************************************************** +// void DocumentRef::AddDescription(char *d, HtWordList &words) +// +void DocumentRef::AddDescription(const char *d, HtWordList &words) +{ + if (!d || !*d) + return; + + while (isspace(*d)) + d++; + + if (!d || !*d) + return; + + String desc = d; + desc.chop(" \t"); + + // Add the description text to the word database with proper factor + // Do this first because we may have reached the max_description limit + // This also ensures we keep the proper weight on descriptions + // that occur many times + + // Parse words. + char *p = desc; + HtConfiguration* config= HtConfiguration::config(); + static int minimum_word_length = config->Value("minimum_word_length", 3); + static int max_descriptions = config->Value("max_descriptions", 5); + + String word; + HtWordReference wordRef; + wordRef.Flags(FLAG_LINK_TEXT); + wordRef.DocID(docID); + + while (*p) + { + // Reset contents before adding chars each round. 
+ word = 0; + + while (*p && HtIsWordChar(*p)) + word << *p++; + + HtStripPunctuation(word); + + if (word.length() >= minimum_word_length) { + // The wordlist takes care of lowercasing; just add it. + wordRef.Location((p - (char*)desc) - word.length()); + wordRef.Word(word); + words.Replace(wordRef); + } + + while (*p && !HtIsStrictWordChar(*p)) + p++; + } + + // And let's flush the words! (nice comment hu :-) + words.Flush(); + + // Now are we at the max_description limit? + if (descriptions.Count() >= max_descriptions) + return; + + descriptions.Start_Get(); + String *description; + while ((description = (String *) descriptions.Get_Next())) + { + if (mystrcasecmp(description->get(), (char*)desc) == 0) + return; + } + descriptions.Add(new String(desc)); +} + + +//***************************************************************************** +// void DocumentRef::AddAnchor(char *a) +// +void DocumentRef::AddAnchor(const char *a) +{ + if (a) + docAnchors.Add(new String(a)); +} + + diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/DocumentRef.h b/debian/htdig/htdig-3.2.0b6/htcommon/DocumentRef.h new file mode 100644 index 00000000..446ff6f7 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/DocumentRef.h @@ -0,0 +1,171 @@ +// +// DocumentRef.h +// +// DocumentRef: Reference to an indexed document. Keeps track of all +// information stored on the document, either by the dig +// or temporary search information. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: DocumentRef.h,v 1.29 2004/05/28 13:15:12 lha Exp $ +// + +#ifndef _DocumentRef_h_ +#define _DocumentRef_h_ + +#include "htString.h" +#include "List.h" +#include "HtWordList.h" + +#include <time.h> + +enum ReferenceState +{ + Reference_normal, + Reference_not_found, + Reference_noindex, + Reference_obsolete +}; + +class DocumentRef : public Object +{ + public: + // + // Construction/Destruction + // + DocumentRef(); + ~DocumentRef(); + + // + // A DocumentRef can read itself from a character string and + // convert itself into a character string + // + void Serialize(String &s); + void Deserialize(String &s); + + // + // Access to the members + // + int DocID() {return docID;} + char *DocURL() {return docURL;} + time_t DocTime() {return docTime;} + char *DocTitle() {return docTitle;} + char *DocAuthor() {return docAuthor;} + char *DocHead() {return docHead;} + int DocHeadIsSet() {return docHeadIsSet;} + char *DocMetaDsc() {return docMetaDsc;} + time_t DocAccessed() {return docAccessed;} + int DocLinks() {return docLinks;} + int DocBackLinks() {return docBackLinks;} + List *Descriptions() {return &descriptions;} + ReferenceState DocState() {return docState;} + int DocSize() {return docSize;} + List *DocAnchors() {return &docAnchors;} + double DocScore() {return docScore;} + int DocSig() {return docSig;} + int DocAnchor() {return docAnchor;} + int DocHopCount() {return docHopCount;} + char *DocEmail() {return docEmail;} + char *DocNotification() {return docNotification;} + char *DocSubject() {return docSubject;} + + void DocID(int d) {docID = d;} + void DocURL(const char *u) {docURL = u;} + void DocTime(time_t t) {docTime = t;} + void DocTitle(const char *t) {docTitle 
= t;} + void DocAuthor(const char *a) {docAuthor = a;} + void DocHead(const char *h) {docHeadIsSet = 1; docHead = h;} + void DocMetaDsc(const char *md) {docMetaDsc = md;} + void DocAccessed(time_t t) {docAccessed = t;} + void DocLinks(int l) {docLinks = l;} + void DocBackLinks(int l) {docBackLinks = l;} + void Descriptions(List &l) {descriptions = l;} + void AddDescription(const char *d, HtWordList &words); + void DocState(ReferenceState s) {docState = s;} + void DocState(int s); + void DocSize(int s) {docSize = s;} + void DocSig(int s) {docSig = s;} + void DocAnchors(List &l) {docAnchors = l;} + void AddAnchor(const char *a); + void DocScore(double s) {docScore = s;} + void DocAnchor(int a) {docAnchor = a;} + void DocHopCount(int h) {docHopCount = h;} + void DocEmail(const char *e) {docEmail = e;} + void DocNotification(const char *n) {docNotification = n;} + void DocSubject(const char *s) {docSubject = s;} + + void Clear(); // Reset everything + + protected: + // + // These values will be stored when serializing + // + + // This is the index number of the document in the database. + int docID; + // This is the URL of the document. + String docURL; + // This is the time specified in the document's header + // Usually that's the last modified time, for servers that return it. + time_t docTime; + // This is the time that the last retrieval occurred. + time_t docAccessed; + // This is the stored excerpt of the document, just text. + String docHead; + // This indicates if the stored excerpt of the document has been set. + int docHeadIsSet; + // This is the document-specified description. + // For HTML, that's the META description tag. + String docMetaDsc; + // This is the title of the document. + String docTitle; + // This is the author of the document, as specified in meta information + String docAuthor; + // This is a list of Strings, the text of links pointing to this document. + // (e.g. 
<a href="docURL">description</a> + List descriptions; + // This is the state of the document--modified, normal, etc. + ReferenceState docState; + // This is the size of the original document. + int docSize; + // This is a count of the links in the document (outgoing links). + int docLinks; + // This is a count of the links to the document (incoming links). + int docBackLinks; + // This is a list of the anchors in the document (i.e. <A NAME=...) + List docAnchors; + // This is a count of the number of hops from start_urls to here. + int docHopCount; + // This is a signature of the document. (e.g. md5sum, checksum...) + // This is currently unused. + long int docSig; + + // + // The following values are for the email notification of expiration + // + + // This is the email destination for htnotify. + String docEmail; + // This is the date that htnotify should use as comparison. + String docNotification; + // This is the subject of the email sent out by htnotify. + String docSubject; + + // + // This is used for searching and is not stored in the database + // + + // This is the current score of this document. + double docScore; + // This is the nearest anchor for the search word. + int docAnchor; + +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/HtConfiguration.cc b/debian/htdig/htdig-3.2.0b6/htcommon/HtConfiguration.cc new file mode 100644 index 00000000..ad3ce4f6 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/HtConfiguration.cc @@ -0,0 +1,271 @@ +// +// HtConfiguration.cc +// +// HtConfiguration: extends Configuration class +// to implement Apache-style config. 
Uses parser +// generated by Bison from conf_parser.yxx +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: HtConfiguration.cc,v 1.10 2004/05/28 13:15:12 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <stdio.h> +#include "HtConfiguration.h" +#include <stdlib.h> + +#ifdef HAVE_STD +#include <fstream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <fstream.h> +#endif /* HAVE_STD */ + +#include <stdlib.h> +#include <ctype.h> +#include <locale.h> + + +//******************************************************************** +// Add complex entry to the configuration +// +void +HtConfiguration::Add(const char *name, const char *value, Configuration *aList) { + + if (strcmp("url",name)==0) { //add URL entry + URL tmpUrl(strdup(value)); + Dictionary *paths= NULL; + if ( (paths=(Dictionary *)dcUrls[tmpUrl.host()]) ) { + paths->Add(tmpUrl.path(),aList); + } else { + paths=new Dictionary(); + paths->Add(tmpUrl.path(),aList); + dcUrls.Add(tmpUrl.host(),paths); + } + } else { + + Object *treeEntry=dcBlocks[name]; + if (treeEntry!=NULL) { + ((Dictionary *)treeEntry)->Add(value,aList); + } else { + treeEntry=new Dictionary(16); + ((Dictionary *)treeEntry)->Add(value,aList); + dcBlocks.Add(name, treeEntry); + } + } +} + +//********************************************************************* +const String HtConfiguration::Find(const char *blockName,const char *name,const char *value) const +{ + if (!(blockName && name && value) ) + return String(); + union { + void *ptr; + Object *obj; + Dictionary *dict; + HtConfiguration *conf; + } tmpPtr; + String chr; + + if (strcmp("url",blockName)==0) { // URL needs special compare + URL paramUrl(name); // split URL 
to compare separatly host and path + chr=Find(¶mUrl,value); + if (chr[0]!=0) { + return chr; + } + } + else { // end "server" + tmpPtr.obj=dcBlocks.Find(blockName); + if (tmpPtr.ptr) { + tmpPtr.obj = tmpPtr.dict->Find(name); + if (tmpPtr.ptr) { + chr = tmpPtr.conf->Find(value); + if (chr[0] != 0) + return chr; + } + } + } + + // If this parameter is defined in global then return it + chr=Find(value); + if (chr[0]!=0) { + return chr; + } +#ifdef DEBUG + cerr << "Could not find configuration option " << blockName<<":" + <<name<<":"<<value<< "\n"; +#endif + return String(); +} + +//********************************************************************* +// +const String HtConfiguration::Find(URL *aUrl, const char *value) const +{ + if (!aUrl) + return String(); + Dictionary *tmpPtr=(Dictionary *)dcUrls.Find( aUrl->host() ); + if (tmpPtr) { // We've got such host in config + tmpPtr->Start_Get(); + // Try to find best matched URL + // + struct candidate { + Object *obj; + unsigned int len; + String value; + } candidate; + candidate.len=0; + String returnValue; + // Begin competition: which URL is better? + // + // TODO: move this loop into Dictionary + // (or create Dictionary::FindBest ?) + // or make url list sorted ? + // or implement abstract Dictionary::Compare? + const char *strParamUrl=(const char *)aUrl->path(); + char* confUrl= NULL; + bool found(false); + while ((confUrl=tmpPtr->Get_Next()) ) { + if (strncmp(confUrl,strParamUrl,strlen(confUrl))==0 + && (strlen(confUrl)>=candidate.len)) { + // it seems this URL match better + candidate.obj=tmpPtr->Find(confUrl); + + // Let's see if it exists + if (((HtConfiguration *)candidate.obj)->Exists(value)) + { + // yes, it has! We've got new candidate. 
+ candidate.value=((HtConfiguration *)candidate.obj)->Find(value); + returnValue=candidate.value; + candidate.len=candidate.value.length(); + found = true; + } + } + } + + if (found) + return ParsedString(returnValue).get(dcGlobalVars); + + } + return Find(value); +} + + +//********************************************************************* +int HtConfiguration::Value(const char *blockName, const char *name, + const char *value, int default_value ) { +int retValue=default_value; +String tmpStr=Find(blockName,name,value); + if (tmpStr[0]!=0) { + retValue=atoi(tmpStr.get()); + } +return retValue; +} + +//********************************************************************* +double HtConfiguration::Double(const char *blockName, const char *name, + const char *value, double default_value ) { +double retValue=default_value; +String tmpStr=Find(blockName,name,value); + if (tmpStr[0]!=0) { + retValue=atof(tmpStr.get()); + } +return retValue; +} + +//********************************************************************* +int HtConfiguration::Boolean(const char *blockName, const char *name, + const char *value, int default_value ) { +int retValue=default_value; +String tmpStr=Find(blockName,name,value); + if (tmpStr[0]!=0) { + if (mystrcasecmp((char*)tmpStr, "true") == 0 || + mystrcasecmp((char*)tmpStr, "yes") == 0 || + mystrcasecmp((char*)tmpStr, "1") == 0) + retValue = 1; + else if (mystrcasecmp((char*)tmpStr, "false") == 0 || + mystrcasecmp((char*)tmpStr, "no") == 0 || + mystrcasecmp((char*)tmpStr, "0") == 0) + retValue = 0; + + } +return retValue; +} + +//********************************************************************* +//********************************************************************* +int HtConfiguration::Value(URL *aUrl, const char *value, + int default_value ) { +int retValue=default_value; +String tmpStr=Find(aUrl,value); + if (tmpStr[0]!=0) { + retValue=atoi(tmpStr.get()); + } +return retValue; +} + 
+//********************************************************************* +double HtConfiguration::Double(URL *aUrl,const char *value, + double default_value ) { +double retValue=default_value; +String tmpStr=Find(aUrl,value); + if (tmpStr[0]!=0) { + retValue=atof(tmpStr.get()); + } +return retValue; +} + +//********************************************************************* +int HtConfiguration::Boolean(URL *aUrl,const char *value, + int default_value ) { +int retValue=default_value; +String tmpStr=Find(aUrl,value); + if (tmpStr[0]!=0) { + if (mystrcasecmp((char*)tmpStr, "true") == 0 || + mystrcasecmp((char*)tmpStr, "yes") == 0 || + mystrcasecmp((char*)tmpStr, "1") == 0) + retValue = 1; + else if (mystrcasecmp((char*)tmpStr, "false") == 0 || + mystrcasecmp((char*)tmpStr, "no") == 0 || + mystrcasecmp((char*)tmpStr, "0") == 0) + retValue = 0; + + } +return retValue; +} + +//********************************************************************* +// +int +HtConfiguration::Read(const String& filename) +{ +extern FILE* yyin; +extern int yyparse(void*); +if ((yyin=fopen(filename,"r"))==NULL) + return NOTOK; + +FileName=filename; // need to be before yyparse() because is used in it +yyparse(this); +fclose(yyin); +return OK; +} + +HtConfiguration* HtConfiguration::_config= NULL; + +HtConfiguration* const HtConfiguration::config() { + if(_config == NULL) { + _config= new HtConfiguration(); + } + return _config; +} diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/HtConfiguration.h b/debian/htdig/htdig-3.2.0b6/htcommon/HtConfiguration.h new file mode 100644 index 00000000..accb379a --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/HtConfiguration.h @@ -0,0 +1,95 @@ +// +// HtConfiguration.h +// +// HtConfiguration: extends Configuration class +// to implement Apache-style config. 
Uses parser +// generated by Bison from conf_parser.yxx +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: HtConfiguration.h,v 1.5 2004/05/28 13:15:12 lha Exp $ +// + + +#ifndef _HtConfiguration_h_ +#define _HtConfiguration_h_ + +#include"Configuration.h" +#include "ParsedString.h" +#include "URL.h" + +class HtConfiguration : public Configuration +{ + public: + const String Find(const String& name) const {return(Configuration::Find(name));} + int Value(const String& name, int default_value = 0) const + {return(Configuration::Value (name,default_value));} + double Double(const String& name, double default_value = 0) const + {return(Configuration::Double (name,default_value));} + int Boolean(const String& name, int default_value = 0) const + {return(Configuration::Boolean(name,default_value));} + void Add(const String& str){Configuration::Add(str);} + void Add(const String& name, const String& value) + {Configuration::Add(name,value);} + void AddParsed(const String& name, const String& value) + {Configuration::AddParsed(name,value);} + + void Add(const char *name, const char *value, + Configuration *aList); + const String Find(URL *aUrl, const char *value) const; + const String Find(const char *blockName, const char *name, const char *value) const; + int Value(const char *blockName, const char *name, const char *value, + int default_value = 0); + double Double(const char *blockName, const char *name, const char *value, + double default_value = 0); + int Boolean(const char *blockName, const char *name, const char *value, + int default_value = 0); + int Value(URL *aUrl,const char *value,int default_value = 0); + double Double(URL *aUrl,const char *value,double default_value = 0); + int Boolean(URL *aUrl,const char 
*value,int default_value = 0); + inline + String ParseString(const char*) const; // parse ${var} string + String getFileName() const { return FileName; } + + // + // We need some way of reading in the database from a configuration file + // ... this uses the parser + virtual int Read(const String& filename); + + protected: + Dictionary dcBlocks; + Dictionary dcUrls; + String FileName; // config's file name + + public: + HtConfiguration():Configuration() + {;} + + HtConfiguration(const HtConfiguration& config) : + Configuration(config), + dcBlocks(config.dcBlocks), + dcUrls(config.dcUrls) + { + ; + } + + static HtConfiguration* const config(); + + private: + static HtConfiguration* _config; +}; + +//******************************************************************** +// +inline +String HtConfiguration::ParseString(const char *str) const { + return ParsedString(str).get(dcGlobalVars); +} + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/HtSGMLCodec.cc b/debian/htdig/htdig-3.2.0b6/htcommon/HtSGMLCodec.cc new file mode 100644 index 00000000..23518119 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/HtSGMLCodec.cc @@ -0,0 +1,124 @@ +// +// HtSGMLCodec.cc +// +// HtSGMLCodec: A Specialized HtWordCodec class to convert between SGML +// ISO 8859-1 entities and high-bit characters. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: HtSGMLCodec.cc,v 1.6 2004/06/01 18:25:01 angusgb Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "HtSGMLCodec.h" +#include "HtConfiguration.h" + +// Constructor: parses the appropriate parameters using the +// encapsulated HtWordCodec class. +// Only used in privacy. 
+HtSGMLCodec::HtSGMLCodec() +{ + HtConfiguration* config= HtConfiguration::config(); + int translate_latin1 = config->Boolean("translate_latin1", 1); + StringList *myTextFromList = new StringList(); // For &foo; + StringList *myNumFromList = new StringList(); // For &#nnn; + StringList *myToList = new StringList(); + String myTextFromString(770); // Full text list + + // Is this really the best way to do this? + if (!translate_latin1 ) + { + myTextFromString = " "; + } + else + { + // this set has been slightly modified in order to manage the € entity + // the resulting charset is therefore a ISO-8859-1 partially moved to ISO-8859-15 + myTextFromString = " |¡|¢|£|€|¥|¦|§|"; + myTextFromString << "¨|©|ª|«|¬|­|®|¯|°|"; + myTextFromString << "±|²|³|´|µ|¶|·|¸|"; + myTextFromString << "¹|º|»|¼|½|¾|¿|À|"; + myTextFromString << "Á|Â|Ã|Ä|Å|Æ|Ç|È|"; + myTextFromString << "É|Ê|Ë|Ì|Í|Î|Ï|Ð|"; + myTextFromString << "Ñ|Ò|Ó|Ô|Õ|Ö|×|Ø|"; + myTextFromString << "Ù|Ú|Û|Ü|Ý|Þ|ß|à|"; + myTextFromString << "á|â|ã|ä|å|æ|ç|è|"; + myTextFromString << "é|ê|ë|ì|í|î|ï|ð|"; + myTextFromString << "ñ|ò|ó|ô|õ|ö|÷|ø|"; + myTextFromString << "ù|ú|û|ü|ý|þ|ÿ"; + } + + myTextFromList->Create(myTextFromString, '|'); + + for (int i = 160; i <= 255; i++) + { + String temp = 0; + temp << (char) i; + myToList->Add(temp); + + temp = 0; + temp << "&#" << i << ";"; + myNumFromList->Add(temp); + if (!translate_latin1 ) + break; + } + + // Now let's take care of the low-bit characters with encodings. 
+ myTextFromList->Add("""); + myToList->Add("\""); + myNumFromList->Add("""); + + myTextFromList->Add("&"); + myToList->Add("&"); + myNumFromList->Add("&"); + + myTextFromList->Add("<"); + myToList->Add("<"); + myNumFromList->Add("<"); + + myTextFromList->Add(">"); + myToList->Add(">"); + myNumFromList->Add(">"); + + myTextWordCodec = new HtWordCodec(myTextFromList, myToList, '|'); + myNumWordCodec = new HtWordCodec(myNumFromList, myToList, '|'); +} + + +HtSGMLCodec::~HtSGMLCodec() +{ + delete myTextWordCodec; + delete myNumWordCodec; +} + + +// Supposedly used as HtSGMLCodec::instance()->ErrMsg() +// to check if HtWordCodec liked what was fed. +String& HtSGMLCodec::ErrMsg() +{ + return myErrMsg; +} + + +// Canonical singleton interface. +HtSGMLCodec * +HtSGMLCodec::instance() +{ + static HtSGMLCodec *_instance = 0; + + if (_instance == 0) + { + _instance = new HtSGMLCodec(); + } + + return _instance; +} + +// End of HtSGMLCodec.cc diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/HtSGMLCodec.h b/debian/htdig/htdig-3.2.0b6/htcommon/HtSGMLCodec.h new file mode 100644 index 00000000..2fef9f90 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/HtSGMLCodec.h @@ -0,0 +1,63 @@ +// +// HtSGMLCodec.h +// +// HtSGMLCodec: A Specialized HtWordCodec class to convert between SGML +// ISO 8859-1 entities and high-bit characters. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: HtSGMLCodec.h,v 1.4 2004/05/28 13:15:12 lha Exp $ +// +#ifndef __HtSGMLCodec_h +#define __HtSGMLCodec_h + +#include "HtWordCodec.h" + +// Container for a HtWordCodec (not subclassed from it due to +// portability-problems using initializers). +// Not for subclassing. 
+class HtSGMLCodec +{ +public: + static HtSGMLCodec *instance(); + virtual ~HtSGMLCodec(); + + // Similar to the HtWordCodec class. Each string may contain + // zero or more of words from the lists. Here we need to run + // it through two codecs because we might have two different forms + inline String encode(const String &uncoded) const + { return myTextWordCodec->encode(myNumWordCodec->encode(uncoded)); } + + // But we only want to decode into one form i.e. &foo; NOT &#nnn; + String decode(const String &coded) const + { return myTextWordCodec->decode(coded); } + + // If an error was discovered during the parsing of + // entities, this returns an error message + String& ErrMsg(); + + // egcs-1.1 (and some earlier versions) always erroneously + // warns (even without warning flags) about classic singleton + // constructs ("only defines private constructors and has no + // friends"). Rather than adding autoconf tests to shut these + // versions up with -Wno-ctor-dtor-privacy, we fake normal + // conformism for it here (the minimal effort). + friend void my_friend_Harvey__a_faked_friend_function(); + +private: + // Hide default-constructor, copy-constructor and assignment + // operator, making this a singleton. + HtSGMLCodec(); + HtSGMLCodec(const HtSGMLCodec &); + void operator= (const HtSGMLCodec &); + + HtWordCodec *myTextWordCodec; // For &foo; + HtWordCodec *myNumWordCodec; // For &#foo; + String myErrMsg; +}; + +#endif /* __HtSGMLCodec_h */ diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/HtURLCodec.cc b/debian/htdig/htdig-3.2.0b6/htcommon/HtURLCodec.cc new file mode 100644 index 00000000..16a68c8e --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/HtURLCodec.cc @@ -0,0 +1,66 @@ +// +// HtURLCodec.cc +// +// HtURLCodec: Specialized HtWordCodec which just caters to the +// needs of "url_part_aliases" and "common_url_parts". +// Used for coding URLs when they are on disk; the key and the +// href field in db.docdb. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: HtURLCodec.cc,v 1.4 2004/05/28 13:15:12 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "HtURLCodec.h" +#include "defaults.h" // For "config" + +// Constructor: parses the appropriate parameters using the +// encapsulated HtWordCodec class. +// Only used in privacy. +HtURLCodec::HtURLCodec() +{ + HtConfiguration* config= HtConfiguration::config(); + StringList l1(config->Find("url_part_aliases"), " \t"); + StringList l2(config->Find("common_url_parts"), " \t"); + + myWordCodec = new HtWordCodec(l1, l2, myErrMsg); +} + + +HtURLCodec::~HtURLCodec() +{ + delete myWordCodec; +} + + +// Supposedly used as HtURLCodec::instance()->ErrMsg() +// to check if HtWordCodec liked what was fed. +String& HtURLCodec::ErrMsg() +{ + return myErrMsg; +} + + +// Canonical singleton interface. +HtURLCodec * +HtURLCodec::instance() +{ + static HtURLCodec *_instance = 0; + + if (_instance == 0) + { + _instance = new HtURLCodec(); + } + + return _instance; +} + +// End of HtURLCodec.cc diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/HtURLCodec.h b/debian/htdig/htdig-3.2.0b6/htcommon/HtURLCodec.h new file mode 100644 index 00000000..a6aa2804 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/HtURLCodec.h @@ -0,0 +1,64 @@ +// +// HtURLCodec.h +// +// HtURLCodec: Specialized HtWordCodec which just caters to the +// needs of "url_part_aliases" and "common_url_parts". +// Used for coding URLs when they are on disk; the key and the +// href field in db.docdb. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: HtURLCodec.h,v 1.4 2004/05/28 13:15:12 lha Exp $ +// + +#ifndef __HtURLCodec_h +#define __HtURLCodec_h + +#include "HtWordCodec.h" + +// Container for a HtWordCodec (not subclassed from it due to +// portability-problems using initializers). +// Not for subclassing. +class HtURLCodec +{ +public: + static HtURLCodec *instance(); + virtual ~HtURLCodec(); + + // Same as in the HtWordCodec class. Each string may contain + // zero or more of words from the lists. + inline String encode(const String &uncoded) const + { return myWordCodec->encode(uncoded); } + + String decode(const String &coded) const + { return myWordCodec->decode(coded); } + + // If an error was discovered during the parsing of + // url_part_aliases or common_url_parts, this member gives a + // nonempty String with an error message. + String& ErrMsg(); + + // egcs-1.1 (and some earlier versions) always erroneously + // warns (even without warning flags) about classic singleton + // constructs ("only defines private constructors and has no + // friends"). Rather than adding autoconf tests to shut these + // versions up with -Wno-ctor-dtor-privacy, we fake normal + // conformism for it here (the minimal effort). + friend void my_friend_Harvey__a_faked_friend_function(); + +private: + // Hide default-constructor, copy-constructor and assignment + // operator, making this a singleton. 
+ HtURLCodec(); + HtURLCodec(const HtURLCodec &); + void operator= (const HtURLCodec &); + + HtWordCodec *myWordCodec; + String myErrMsg; +}; + +#endif /* __HtURLCodec_h */ diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/HtURLRewriter.cc b/debian/htdig/htdig-3.2.0b6/htcommon/HtURLRewriter.cc new file mode 100644 index 00000000..76c9faa1 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/HtURLRewriter.cc @@ -0,0 +1,59 @@ +// +// HtURLRewriter.cc +// +// HtURLRewriter: Container for a HtRegexReplaceList (not subclassed from it due to +// portability-problems using initializers). +// Not for subclassing. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 2000-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: HtURLRewriter.cc,v 1.4 2004/05/28 13:15:12 lha Exp $ +// + +#include "HtURLRewriter.h" +#include "defaults.h" // For "config" + +// Constructor: parses the appropriate parameters using the +// encapsulated RegexReplaceList class. +// Only used in privacy. +HtURLRewriter::HtURLRewriter() +{ + HtConfiguration* config= HtConfiguration::config(); + StringList list(config->Find("url_rewrite_rules"), " \t"); + + myRegexReplace = new HtRegexReplaceList(list); +} + + +HtURLRewriter::~HtURLRewriter() +{ + delete myRegexReplace; +} + +// Supposedly used as HtURLRewriter::instance()->ErrMsg() +// to check if RegexReplaceList liked what was fed. +const String& HtURLRewriter::ErrMsg() +{ + return myRegexReplace->lastError(); +} + + +// Canonical singleton interface. 
+HtURLRewriter * +HtURLRewriter::instance() +{ + static HtURLRewriter *_instance = 0; + + if (_instance == 0) + { + _instance = new HtURLRewriter(); + } + + return _instance; +} + +// End of HtURLRewriter.cc diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/HtURLRewriter.h b/debian/htdig/htdig-3.2.0b6/htcommon/HtURLRewriter.h new file mode 100644 index 00000000..d0197c07 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/HtURLRewriter.h @@ -0,0 +1,52 @@ +// +// HtURLRewriter.h +// +// HtURLRewriter: Container for a HtRegexReplaceList (not subclassed from it due to +// portability-problems using initializers). +// Not for subclassing. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 2000-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: HtURLRewriter.h,v 1.4 2004/05/28 13:15:12 lha Exp $ +// +#ifndef __HtURLRewriter_h +#define __HtURLRewriter_h + +#include "HtRegexReplaceList.h" + +class HtURLRewriter +{ +public: + static HtURLRewriter *instance(); + virtual ~HtURLRewriter(); + + inline int replace(String &src) { return myRegexReplace->replace(src); } + + // If an error was discovered during the parsing of + // config directives, this member gives a + // nonempty String with an error message. + const String& ErrMsg(); + + // egcs-1.1 (and some earlier versions) always erroneously + // warns (even without warning flags) about classic singleton + // constructs ("only defines private constructors and has no + // friends"). Rather than adding autoconf tests to shut these + // versions up with -Wno-ctor-dtor-privacy, we fake normal + // conformism for it here (the minimal effort). + friend void my_friend_Harvey__a_faked_friend_function(); + +private: + // Hide default-constructor, copy-constructor and assignment + // operator, making this a singleton. 
+ HtURLRewriter(); + HtURLRewriter(const HtURLRewriter &); + void operator= (const HtURLRewriter &); + + HtRegexReplaceList *myRegexReplace; +}; + +#endif /* __HtURLRewriter_h */ diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/HtWordList.cc b/debian/htdig/htdig-3.2.0b6/htcommon/HtWordList.cc new file mode 100644 index 00000000..566898c1 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/HtWordList.cc @@ -0,0 +1,209 @@ +// +// HtWordList.cc +// +// HtWordList: Specialized WordList class that can hold a list +// of words waiting to be inserted in the database. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: HtWordList.cc,v 1.7 2004/05/28 13:15:12 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "HtWordList.h" +#include "HtWordReference.h" +#include "WordRecord.h" +#include "WordType.h" +#include "HtConfiguration.h" +#include "htString.h" + +#include <stdio.h> +#include <stdlib.h> +#include <ctype.h> + +#ifdef HAVE_STD +#include <iostream> +#include <fstream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <iostream.h> +#include <fstream.h> +#endif /* HAVE_STD */ + +#include <errno.h> + +//***************************************************************************** +// HtWordList::~HtWordList() +// +HtWordList::~HtWordList() +{ + delete words; +} + +//***************************************************************************** +// +HtWordList::HtWordList(const HtConfiguration& config_arg) : + WordList(config_arg) +{ + words = new List; +} + +//***************************************************************************** +// +void HtWordList::Replace(const WordReference& arg) +{ + // + // New word. 
Create a new reference for it and cache it in the object. + // + words->Add(new WordReference(arg)); +} + +//***************************************************************************** +// void HtWordList::Flush() +// Dump the current list of words to the database. After +// the words have been dumped, the list will be destroyed to make +// room for the words of the next document. +// +void HtWordList::Flush() +{ + HtWordReference *wordRef; + + // Provided for backwards compatibility + if (!isopen) + Open(config["word_db"], O_RDWR); + + words->Start_Get(); + while ((wordRef = (HtWordReference *) words->Get_Next())) + { + if (wordRef->Word().length() == 0) { + cerr << "HtWordList::Flush: unexpected empty word\n"; + continue; + } + + Override(*wordRef); + } + + // Cleanup + words->Destroy(); +} + +//***************************************************************************** +// void HtWordList::Skip() +// The current document has disappeared or been modified. +// We do not need to store these words. +// +void HtWordList::Skip() +{ + words->Destroy(); +} + +// +// Callback data dedicated to Dump and dump_word communication +// +class DumpWordData : public Object +{ +public: + DumpWordData(FILE* fl_arg) { fl = fl_arg; } + + FILE* fl; +}; + +//***************************************************************************** +// +// Write the ascii representation of a word occurence. 
Helper +// of WordList::Dump +// +static int dump_word(WordList *, WordDBCursor &, const WordReference *word, Object &data) +{ + const HtWordReference *word_tmp = (const HtWordReference *)word; + + DumpWordData &info = (DumpWordData &)data; + + word_tmp->Dump(info.fl); + + return OK; +} + +//***************************************************************************** +// int HtWordList::Dump(char* filename) +// +// Write an ascii version of the word database in <filename> +// +int HtWordList::Dump(const String& filename) +{ + FILE *fl; + + if (!isopen) { + cerr << "WordList::Dump: database must be opened first\n"; + return NOTOK; + } + + if((fl = fopen(filename, "w")) == 0) { + perror(form("WordList::Dump: opening %s for writing", (const char*)filename)); + return NOTOK; + } + + HtWordReference::DumpHeader(fl); + DumpWordData data(fl); + WordCursor* search = Cursor(dump_word, &data); + search->Walk(); + delete search; + + fclose(fl); + + return OK; +} + +//***************************************************************************** +// int HtWordList::Load(char* filename) +// +// Read in an ascii version of the word database in <filename> +// +int HtWordList::Load(const String& filename) +{ + FILE *fl; + String data; + HtWordReference *next; + + if (!isopen) { + cerr << "WordList::Load: database must be opened first\n"; + return NOTOK; + } + + if((fl = fopen(filename, "r")) == 0) { + perror(form("WordList::Load: opening %s for reading", (const char*)filename)); + return NOTOK; + } + + if (HtWordReference::LoadHeader(fl) != OK) + { + cerr << "WordList::Load: header is not correct\n"; + return NOTOK; + } + + while (data.readLine(fl)) + { + next = new HtWordReference; + if (next->Load(data) != OK) + { + delete next; + continue; + } + + words->Add(next); + } + + Flush(); + fclose(fl); + + return OK; +} diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/HtWordList.h b/debian/htdig/htdig-3.2.0b6/htcommon/HtWordList.h new file mode 100644 index 00000000..1fd60789 --- 
/dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/HtWordList.h @@ -0,0 +1,69 @@ +// +// HtWordList.h +// +// HtWordList: Specialized WordList class that can hold a list +// of words waiting to be inserted in the database. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: HtWordList.h,v 1.6 2004/05/28 13:15:12 lha Exp $ +// + +#ifndef _HtWordList_h_ +#define _HtWordList_h_ + +#include <fcntl.h> +#include <stdlib.h> + +#include"HtConfiguration.h" +#include "WordList.h" + +class HtWordList : public WordList +{ +public: + // + // Construction/Destruction + // + HtWordList(const Configuration & config_arg) : WordList(config_arg) + { + cerr << "HtWordList::HtWordList(Configuration) is not valid" << endl; + abort(); + } + HtWordList(const HtConfiguration& config_arg); + virtual ~HtWordList(); + + // + // Update/add a word, perform sanity checking and + // fill information. 
+ // + void Replace(const WordReference& wordRef); + + // + // Skip this document -- ignore all words stored in the object + // from this document + // + void Skip(); + + // + // Flush the words stored in the object to the database + // + void Flush(); + + // Write an ascii version of the word database in <filename> + int Dump(const String& filename); + + // Read in an ascii version of the word database in <filename> + int Load(const String& filename); + +private: + + List *words; +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/HtWordReference.cc b/debian/htdig/htdig-3.2.0b6/htcommon/HtWordReference.cc new file mode 100644 index 00000000..3b603855 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/HtWordReference.cc @@ -0,0 +1,94 @@ +// +// HtWordReference.cc +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: HtWordReference.cc,v 1.5 2004/05/28 13:15:12 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "HtWordReference.h" +#include <stdio.h> +#include <stdlib.h> + +// +// Explain the format of data output of the Dump function +// +int HtWordReference::DumpHeader(FILE *fl) +{ + fprintf(fl, "#word\tdocument id\tflags\tlocation\tanchor\n"); + return OK; +} + +// +// Ascii representation of a word occurence. 
+// +int HtWordReference::Dump(FILE *fl) const +{ + fprintf(fl, "%s\t%d\t%d\t%d\t%d\n", + (char*)Word(), + DocID(), + Flags(), + Location(), + Anchor()); + return OK; +} + +// +// Check the header of the file +// +int HtWordReference::LoadHeader(FILE *fl) +{ + String header; + header.readLine(fl); + if (mystrcasecmp("#word\tdocument id\tflags\tlocation\tanchor", header.get()) == 0) + return OK; + else + return NOTOK; +} + +// +// Ascii representation of a word occurence. +// +int HtWordReference::Load(const String& s) +{ + String data(s); + char *token; + + // Format is "%s\t%d\t%d\t%d\t%d + + token = strtok(data, "\t"); + if (!token) + return NOTOK; + Word(token); + + token = strtok(0, "\t"); + if (!token) + return NOTOK; + DocID(atoi(token)); + + token = strtok(0, "\t"); + if (!token) + return NOTOK; + Flags(atoi(token)); + + token = strtok(0, "\t"); + if (!token) + return NOTOK; + Location(atoi(token)); + + token = strtok(0, "\t"); + if (!token) + return NOTOK; + Anchor(atoi(token)); + + return OK; +} + diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/HtWordReference.h b/debian/htdig/htdig-3.2.0b6/htcommon/HtWordReference.h new file mode 100644 index 00000000..2b7a6db4 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/HtWordReference.h @@ -0,0 +1,93 @@ +// +// HtWordReference.h +// +// HtWordReference: Reference to a word, derived from WordReference and +// implementing explicit accessors. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: HtWordReference.h,v 1.6 2004/05/28 13:15:12 lha Exp $ +// +#ifndef _HtWordReference_h_ +#define _HtWordReference_h_ + +#include "WordReference.h" +#include <stdio.h> + +// +// Flags +// (If extra flags added, also update htsearch.cc:colonPrefix +// +#define FLAG_TEXT 0 +#define FLAG_CAPITAL 1 +#define FLAG_TITLE 2 +#define FLAG_HEADING 4 +#define FLAG_KEYWORDS 8 +#define FLAG_DESCRIPTION 16 +#define FLAG_AUTHOR 32 +#define FLAG_LINK_TEXT 64 +#define FLAG_URL 128 + +// For field-restricted search, at least one of these flags must be set +// in document. (255 = OR of the above...) +#define FLAGS_MATCH_ONE (255 | FLAG_PLAIN) + +// The following are not stored in the database, but are used by WeightWord +#define FLAG_PLAIN 4096 +#define FLAG_EXACT 8192 +#define FLAG_HIDDEN 16384 +#define FLAG_IGNORE 32768 +// The remainder are undefined + +class HtWordReference : public WordReference +{ +public: + // + // Construction/Destruction + // + HtWordReference() {} + HtWordReference(const String& key, const String& record) : + WordReference(key, record) { } + HtWordReference(const String& word) : + WordReference(word) { } + HtWordReference(String word, unsigned int docid, unsigned int flags, unsigned int location, unsigned int anchor) { + Word(word); + DocID(docid); + Location(location); + Anchor(anchor); + Flags(flags); + } + + ~HtWordReference() {} + + // + // Accessors + // + String Word() const { return key.GetWord(); } + void Word(const String& arg) { key.SetWord(arg); } + unsigned int DocID() const { return key.Get( 1 ); } + void DocID(const unsigned int arg) { key.Set( 1, arg); } + unsigned int Flags() const { return key.Get( 2 ); } + void Flags(const unsigned int arg) 
{ key.Set( 2, arg); } + unsigned int Location() const { return key.Get( 3 ); } + void Location(const unsigned int arg) { key.Set( 3, arg); } + unsigned int Anchor() const { return record.info.data; } + void Anchor(const unsigned int arg) { record.info.data = arg; } + + // + // Dumping/Loading + // + int Dump(FILE *fl) const; + static int DumpHeader(FILE *fl); + int Load(const String& s); + static int LoadHeader(FILE *fl); +}; + + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/HtZlibCodec.cc b/debian/htdig/htdig-3.2.0b6/htcommon/HtZlibCodec.cc new file mode 100644 index 00000000..ac48877f --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/HtZlibCodec.cc @@ -0,0 +1,136 @@ +// +// HtZlibCodec.cc +// +// HtZlibCodec: Provide a generic access to the zlib compression routines. +// If zlib is not present, encode and decode are simply +// assignment functions. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: HtZlibCodec.cc,v 1.4 2004/05/28 13:15:12 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "HtZlibCodec.h" +#include "defaults.h" // For "config" + +#if defined(HAVE_LIBZ) && defined(HAVE_ZLIB_H) +#include <zlib.h> +#endif + + +HtZlibCodec::HtZlibCodec() +{ +} + +HtZlibCodec::~HtZlibCodec() +{ +} + +String HtZlibCodec::encode(const String &str) const +{ + String s = str; +#if defined(HAVE_LIBZ) && defined(HAVE_ZLIB_H) + HtConfiguration* config= HtConfiguration::config(); + static int cf=config->Value("compression_level",0); + if (cf) { + // + // Now compress s into c_s + // + unsigned char c_buffer[16384]; + String c_s; + z_stream c_stream; /* compression stream */ + c_stream.zalloc=(alloc_func)0; + c_stream.zfree=(free_func)0; + 
c_stream.opaque=(voidpf)0; + // Get compression factor, default to best + if (cf<-1) cf=-1; else if (cf>9) cf=9; + int err=deflateInit(&c_stream,cf); + if (err!=Z_OK) return 0; + int len=s.length(); + c_stream.next_in=(Bytef*)(char *)s; + c_stream.avail_in=len; + while (err==Z_OK && c_stream.total_in!=(uLong)len) { + c_stream.next_out=c_buffer; + c_stream.avail_out=sizeof(c_buffer); + err=deflate(&c_stream,Z_NO_FLUSH); + c_s.append((char *)c_buffer,c_stream.next_out-c_buffer); + } + // Finish the stream + for (;;) { + c_stream.next_out=c_buffer; + c_stream.avail_out=sizeof(c_buffer); + err=deflate(&c_stream,Z_FINISH); + c_s.append((char *)c_buffer,c_stream.next_out-c_buffer); + if (err==Z_STREAM_END) break; + //CHECK_ERR(err, "deflate"); + } + err=deflateEnd(&c_stream); + s=c_s; + } +#endif // HAVE_LIBZ && HAVE_ZLIB_H + return s; +} + + +String HtZlibCodec::decode(const String &str) const +{ + String s = str; +#if defined(HAVE_LIBZ) && defined(HAVE_ZLIB_H) + HtConfiguration* config= HtConfiguration::config(); + static int cf=config->Value("compression_level",0); + if (cf) { + String c_s; + // Decompress stream + unsigned char c_buffer[16384]; + z_stream d_stream; + d_stream.zalloc=(alloc_func)0; + d_stream.zfree=(free_func)0; + d_stream.opaque=(voidpf)0; + + unsigned int len=s.length(); + d_stream.next_in=(Bytef*)(char *)s; + d_stream.avail_in=len; + + int err=inflateInit(&d_stream); + if (err!=Z_OK) return 1; + + while (err==Z_OK && d_stream.total_in<len) { + d_stream.next_out=c_buffer; + d_stream.avail_out=sizeof(c_buffer); + err=inflate(&d_stream,Z_NO_FLUSH); + c_s.append((char *)c_buffer,d_stream.next_out-c_buffer); + if (err==Z_STREAM_END) break; + } + + err=inflateEnd(&d_stream); + s=c_s; + } +#endif // HAVE_LIBZ && HAVE_ZLIB_H + return s; +} + + +// Canonical singleton interface. 
+HtZlibCodec * +HtZlibCodec::instance() +{ + static HtZlibCodec *_instance = 0; + + if (_instance == 0) + { + _instance = new HtZlibCodec(); + } + + return _instance; +} + + +// End of HtZlibCodec.cc diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/HtZlibCodec.h b/debian/htdig/htdig-3.2.0b6/htcommon/HtZlibCodec.h new file mode 100644 index 00000000..c4355c97 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/HtZlibCodec.h @@ -0,0 +1,51 @@ +// +// HtZlibCodec.h +// +// HtZlibCodec: Provide a generic access to the zlib compression routines. +// If zlib is not present, encode and decode are simply +// assignment functions. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: HtZlibCodec.h,v 1.4 2004/05/28 13:15:12 lha Exp $ +// +// +#ifndef __HtZlibCodec_h +#define __HtZlibCodec_h + +#include "htString.h" +#include "HtCodec.h" + +class HtZlibCodec : public HtCodec +{ +public: + static HtZlibCodec *instance(); + ~HtZlibCodec(); + + // Code what's in this string. + String encode(const String &) const; + + // Decode what's in this string. + String decode(const String &) const; + + // egcs-1.1 (and some earlier versions) always erroneously + // warns (even without warning flags) about classic singleton + // constructs ("only defines private constructors and has no + // friends"). Rather than adding autoconf tests to shut these + // versions up with -Wno-ctor-dtor-privacy, we fake normal + // conformism for it here (the minimal effort). + friend void my_friend_Harvey__a_faked_friend_function(); + +private: + // Hide default-constructor, copy-constructor and assignment + // operator, making this a singleton. 
+ HtZlibCodec(); + HtZlibCodec(const HtZlibCodec &); + void operator= (const HtZlibCodec &); +}; + +#endif /* __HtZlibCodec_h */ diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/Makefile.am b/debian/htdig/htdig-3.2.0b6/htcommon/Makefile.am new file mode 100644 index 00000000..0a487c94 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/Makefile.am @@ -0,0 +1,44 @@ + +include $(top_srcdir)/Makefile.config + +# +# Do not output #line because it confuses the dependencies +# generator of GCC if configure run out of source tree. +# Comment these to ease debugging. +# +AM_LFLAGS = -L +AM_YFLAGS = -l -d + +EXTRA_DIST=conf_lexer.cxx conf_parser.cxx + +pkglib_LTLIBRARIES = libcommon.la + +libcommon_la_SOURCES = DocumentDB.cc DocumentRef.cc \ + HtWordReference.cc HtWordList.cc defaults.cc \ + HtURLCodec.cc URL.cc URLTrans.cc \ + HtZlibCodec.cc cgi.cc HtSGMLCodec.cc \ + HtConfiguration.cc HtURLRewriter.cc \ + conf_lexer.lxx conf_parser.yxx + +libcommon_la_LDFLAGS = -release $(HTDIG_MAJOR_VERSION).$(HTDIG_MINOR_VERSION).$(HTDIG_MICRO_VERSION) ${extra_ldflags} + +noinst_HEADERS = DocumentDB.h \ + DocumentRef.h \ + HtWordReference.h \ + HtWordList.h \ + HtURLCodec.h \ + HtSGMLCodec.h \ + URL.h \ + cgi.h \ + HtZlibCodec.h \ + defaults.h \ + HtConfiguration.h \ + HtURLRewriter.h \ + conf_parser.h + +LOCAL_DEFINES= -DBIN_DIR=\"$(bindir)\" \ + -DCOMMON_DIR=\"$(COMMON_DIR)\" \ + -DCONFIG_DIR=\"$(CONFIG_DIR)\" \ + -DDATABASE_DIR=\"$(DATABASE_DIR)\" \ + -DIMAGE_URL_PREFIX=\"$(IMAGE_URL_PREFIX)\" + diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/Makefile.in b/debian/htdig/htdig-3.2.0b6/htcommon/Makefile.in new file mode 100644 index 00000000..6ff19250 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/Makefile.in @@ -0,0 +1,570 @@ +# Makefile.in generated by automake 1.7.9 from Makefile.am. +# @configure_input@ + +# Copyright 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003 +# Free Software Foundation, Inc. 
+# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +# +# To compile with profiling do the following: +# +# make CFLAGS=-g CXXFLAGS=-g PROFILING=-p all +# + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ +pkgdatadir = $(datadir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +top_builddir = .. + +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +INSTALL = @INSTALL@ +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +host_triplet = @host@ +ACLOCAL = @ACLOCAL@ +ALLOCA = @ALLOCA@ +AMDEP_FALSE = @AMDEP_FALSE@ +AMDEP_TRUE = @AMDEP_TRUE@ +AMTAR = @AMTAR@ +APACHE = @APACHE@ +APACHE_MODULES = @APACHE_MODULES@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CGIBIN_DIR = @CGIBIN_DIR@ +COMMON_DIR = @COMMON_DIR@ +CONFIG_DIR = @CONFIG_DIR@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DATABASE_DIR = @DATABASE_DIR@ +DEFAULT_CONFIG_FILE = @DEFAULT_CONFIG_FILE@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +ECHO = @ECHO@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +F77 = @F77@ +FFLAGS = @FFLAGS@ +FIND = @FIND@ +GUNZIP = @GUNZIP@ +HAVE_SSL = 
@HAVE_SSL@ +HTDIG_MAJOR_VERSION = @HTDIG_MAJOR_VERSION@ +HTDIG_MICRO_VERSION = @HTDIG_MICRO_VERSION@ +HTDIG_MINOR_VERSION = @HTDIG_MINOR_VERSION@ +IMAGE_DIR = @IMAGE_DIR@ +IMAGE_URL_PREFIX = @IMAGE_URL_PREFIX@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +MAINT = @MAINT@ +MAINTAINER_MODE_FALSE = @MAINTAINER_MODE_FALSE@ +MAINTAINER_MODE_TRUE = @MAINTAINER_MODE_TRUE@ +MAKEINFO = @MAKEINFO@ +MV = @MV@ +OBJEXT = @OBJEXT@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PERL = @PERL@ +RANLIB = @RANLIB@ +RRDTOOL = @RRDTOOL@ +SEARCH_DIR = @SEARCH_DIR@ +SEARCH_FORM = @SEARCH_FORM@ +SED = @SED@ +SENDMAIL = @SENDMAIL@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +TAR = @TAR@ +TESTS_FALSE = @TESTS_FALSE@ +TESTS_TRUE = @TESTS_TRUE@ +TIME = @TIME@ +TIMEV = @TIMEV@ +USER = @USER@ +VERSION = @VERSION@ +YACC = @YACC@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_F77 = @ac_ct_F77@ +ac_ct_RANLIB = @ac_ct_RANLIB@ +ac_ct_STRIP = @ac_ct_STRIP@ +am__fastdepCC_FALSE = @am__fastdepCC_FALSE@ +am__fastdepCC_TRUE = @am__fastdepCC_TRUE@ +am__fastdepCXX_FALSE = @am__fastdepCXX_FALSE@ +am__fastdepCXX_TRUE = @am__fastdepCXX_TRUE@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +datadir = @datadir@ +exec_prefix = @exec_prefix@ +extra_ldflags = @extra_ldflags@ +host = @host@ +host_alias = 
@host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +oldincludedir = @oldincludedir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +subdirs = @subdirs@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ + +AUTOMAKE_OPTIONS = foreign no-dependencies + +INCLUDES = -DDEFAULT_CONFIG_FILE=\"$(DEFAULT_CONFIG_FILE)\" \ + -I$(top_srcdir)/include -I$(top_srcdir)/htlib \ + -I$(top_srcdir)/htnet -I$(top_srcdir)/htcommon \ + -I$(top_srcdir)/htword \ + -I$(top_srcdir)/db -I$(top_builddir)/db \ + $(LOCAL_DEFINES) $(PROFILING) + + +HTLIBS = $(top_builddir)/htnet/libhtnet.la \ + $(top_builddir)/htcommon/libcommon.la \ + $(top_builddir)/htword/libhtword.la \ + $(top_builddir)/htlib/libht.la \ + $(top_builddir)/htcommon/libcommon.la \ + $(top_builddir)/htword/libhtword.la \ + $(top_builddir)/db/libhtdb.la \ + $(top_builddir)/htlib/libht.la + + +# +# Do not output #line because it confuses the dependencies +# generator of GCC if configure run out of source tree. +# Comment these to ease debugging. 
+# +AM_LFLAGS = -L +AM_YFLAGS = -l -d + +EXTRA_DIST = conf_lexer.cxx conf_parser.cxx + +pkglib_LTLIBRARIES = libcommon.la + +libcommon_la_SOURCES = DocumentDB.cc DocumentRef.cc \ + HtWordReference.cc HtWordList.cc defaults.cc \ + HtURLCodec.cc URL.cc URLTrans.cc \ + HtZlibCodec.cc cgi.cc HtSGMLCodec.cc \ + HtConfiguration.cc HtURLRewriter.cc \ + conf_lexer.lxx conf_parser.yxx + + +libcommon_la_LDFLAGS = -release $(HTDIG_MAJOR_VERSION).$(HTDIG_MINOR_VERSION).$(HTDIG_MICRO_VERSION) ${extra_ldflags} + +noinst_HEADERS = DocumentDB.h \ + DocumentRef.h \ + HtWordReference.h \ + HtWordList.h \ + HtURLCodec.h \ + HtSGMLCodec.h \ + URL.h \ + cgi.h \ + HtZlibCodec.h \ + defaults.h \ + HtConfiguration.h \ + HtURLRewriter.h \ + conf_parser.h + + +LOCAL_DEFINES = -DBIN_DIR=\"$(bindir)\" \ + -DCOMMON_DIR=\"$(COMMON_DIR)\" \ + -DCONFIG_DIR=\"$(CONFIG_DIR)\" \ + -DDATABASE_DIR=\"$(DATABASE_DIR)\" \ + -DIMAGE_URL_PREFIX=\"$(IMAGE_URL_PREFIX)\" + +subdir = htcommon +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs +CONFIG_HEADER = $(top_builddir)/include/config.h +CONFIG_CLEAN_FILES = +LTLIBRARIES = $(pkglib_LTLIBRARIES) + +libcommon_la_LIBADD = +am_libcommon_la_OBJECTS = DocumentDB.lo DocumentRef.lo \ + HtWordReference.lo HtWordList.lo defaults.lo HtURLCodec.lo \ + URL.lo URLTrans.lo HtZlibCodec.lo cgi.lo HtSGMLCodec.lo \ + HtConfiguration.lo HtURLRewriter.lo conf_lexer.lo \ + conf_parser.lo +libcommon_la_OBJECTS = $(am_libcommon_la_OBJECTS) + +DEFAULT_INCLUDES = -I. 
-I$(srcdir) -I$(top_builddir)/include +depcomp = +am__depfiles_maybe = +CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) +LTCXXCOMPILE = $(LIBTOOL) --mode=compile $(CXX) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CXXFLAGS) $(CXXFLAGS) +CXXLD = $(CXX) +CXXLINK = $(LIBTOOL) --mode=link $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +LEXCOMPILE = $(LEX) $(LFLAGS) $(AM_LFLAGS) +LTLEXCOMPILE = $(LIBTOOL) --mode=compile $(LEX) $(LFLAGS) $(AM_LFLAGS) +YACCCOMPILE = $(YACC) $(YFLAGS) $(AM_YFLAGS) +LTYACCCOMPILE = $(LIBTOOL) --mode=compile $(YACC) $(YFLAGS) $(AM_YFLAGS) +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) \ + $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +CCLD = $(CC) +LINK = $(LIBTOOL) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +DIST_SOURCES = $(libcommon_la_SOURCES) +HEADERS = $(noinst_HEADERS) + +DIST_COMMON = $(noinst_HEADERS) $(srcdir)/Makefile.in \ + $(top_srcdir)/Makefile.config Makefile.am conf_lexer.cxx \ + conf_parser.cxx conf_parser.h +SOURCES = $(libcommon_la_SOURCES) + +all: all-am + +.SUFFIXES: +.SUFFIXES: .cc .cxx .lo .lxx .o .obj .yxx +$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/Makefile.config $(top_srcdir)/configure.in $(ACLOCAL_M4) + cd $(top_srcdir) && \ + $(AUTOMAKE) --foreign htcommon/Makefile +Makefile: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe) +pkglibLTLIBRARIES_INSTALL = $(INSTALL) +install-pkglibLTLIBRARIES: $(pkglib_LTLIBRARIES) + @$(NORMAL_INSTALL) + $(mkinstalldirs) $(DESTDIR)$(pkglibdir) + @list='$(pkglib_LTLIBRARIES)'; for p in $$list; do \ + if test -f $$p; then \ + f="`echo 
$$p | sed -e 's|^.*/||'`"; \ + echo " $(LIBTOOL) --mode=install $(pkglibLTLIBRARIES_INSTALL) $(INSTALL_STRIP_FLAG) $$p $(DESTDIR)$(pkglibdir)/$$f"; \ + $(LIBTOOL) --mode=install $(pkglibLTLIBRARIES_INSTALL) $(INSTALL_STRIP_FLAG) $$p $(DESTDIR)$(pkglibdir)/$$f; \ + else :; fi; \ + done + +uninstall-pkglibLTLIBRARIES: + @$(NORMAL_UNINSTALL) + @list='$(pkglib_LTLIBRARIES)'; for p in $$list; do \ + p="`echo $$p | sed -e 's|^.*/||'`"; \ + echo " $(LIBTOOL) --mode=uninstall rm -f $(DESTDIR)$(pkglibdir)/$$p"; \ + $(LIBTOOL) --mode=uninstall rm -f $(DESTDIR)$(pkglibdir)/$$p; \ + done + +clean-pkglibLTLIBRARIES: + -test -z "$(pkglib_LTLIBRARIES)" || rm -f $(pkglib_LTLIBRARIES) + @list='$(pkglib_LTLIBRARIES)'; for p in $$list; do \ + dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \ + test "$$dir" = "$$p" && dir=.; \ + echo "rm -f \"$${dir}/so_locations\""; \ + rm -f "$${dir}/so_locations"; \ + done +conf_parser.h: conf_parser.cxx + @if test ! -f $@; then \ + rm -f conf_parser.cxx; \ + $(MAKE) conf_parser.cxx; \ + else :; fi +libcommon.la: $(libcommon_la_OBJECTS) $(libcommon_la_DEPENDENCIES) + $(CXXLINK) -rpath $(pkglibdir) $(libcommon_la_LDFLAGS) $(libcommon_la_OBJECTS) $(libcommon_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) core *.core + +distclean-compile: + -rm -f *.tab.c + +.cc.o: + $(CXXCOMPILE) -c -o $@ `test -f '$<' || echo '$(srcdir)/'`$< + +.cc.obj: + $(CXXCOMPILE) -c -o $@ `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` + +.cc.lo: + $(LTCXXCOMPILE) -c -o $@ `test -f '$<' || echo '$(srcdir)/'`$< + +.cxx.o: + $(CXXCOMPILE) -c -o $@ `test -f '$<' || echo '$(srcdir)/'`$< + +.cxx.obj: + $(CXXCOMPILE) -c -o $@ `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` + +.cxx.lo: + $(LTCXXCOMPILE) -c -o $@ `test -f '$<' || echo '$(srcdir)/'`$< + +.lxx.cxx: + $(LEXCOMPILE) `test -f $< || echo '$(srcdir)/'`$< + sed '/^#/ s|$(LEX_OUTPUT_ROOT)\.c|$@|' $(LEX_OUTPUT_ROOT).c >$@ + rm -f $(LEX_OUTPUT_ROOT).c + 
+.yxx.cxx: + $(YACCCOMPILE) `test -f '$<' || echo '$(srcdir)/'`$< + if test -f y.tab.h; then \ + to=`echo "$*_H" | sed \ + -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ + -e 's/[^ABCDEFGHIJKLMNOPQRSTUVWXYZ]/_/g'`; \ + sed "/^#/ s/Y_TAB_H/$$to/g" y.tab.h >$*.ht; \ + rm -f y.tab.h; \ + if cmp -s $*.ht $*.h; then \ + rm -f $*.ht ;\ + else \ + mv $*.ht $*.h; \ + fi; \ + fi + if test -f y.output; then \ + mv y.output $*.output; \ + fi + sed '/^#/ s|y\.tab\.c|$@|' y.tab.c >$@t && mv $@t $@ + rm -f y.tab.c + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +distclean-libtool: + -rm -f libtool +uninstall-info-am: + +ETAGS = etags +ETAGSFLAGS = + +CTAGS = ctags +CTAGSFLAGS = + +tags: TAGS + +ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + mkid -fID $$unique + +TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(ETAGS_ARGS)$$tags$$unique" \ + || $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$tags $$unique + +ctags: CTAGS +CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(CTAGS_ARGS)$$tags$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$tags $$unique + 
+GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && cd $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) $$here + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) + +top_distdir = .. +distdir = $(top_distdir)/$(PACKAGE)-$(VERSION) + +distdir: $(DISTFILES) + $(mkinstalldirs) $(distdir)/.. + @srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's|.|.|g'`; \ + list='$(DISTFILES)'; for file in $$list; do \ + case $$file in \ + $(srcdir)/*) file=`echo "$$file" | sed "s|^$$srcdirstrip/||"`;; \ + $(top_srcdir)/*) file=`echo "$$file" | sed "s|^$$topsrcdirstrip/|$(top_builddir)/|"`;; \ + esac; \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + dir=`echo "$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test "$$dir" != "$$file" && test "$$dir" != "."; then \ + dir="/$$dir"; \ + $(mkinstalldirs) "$(distdir)$$dir"; \ + else \ + dir=''; \ + fi; \ + if test -d $$d/$$file; then \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \ + fi; \ + cp -pR $$d/$$file $(distdir)$$dir || exit 1; \ + else \ + test -f $(distdir)/$$file \ + || cp -p $$d/$$file $(distdir)/$$file \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) $(HEADERS) + +installdirs: + $(mkinstalldirs) $(DESTDIR)$(pkglibdir) +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + `test -z '$(STRIP)' || \ + echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -rm -f 
$(CONFIG_CLEAN_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." + -rm -f conf_lexer.cxx + -rm -f conf_parser.cxx + -rm -f conf_parser.h +clean: clean-am + +clean-am: clean-generic clean-libtool clean-pkglibLTLIBRARIES \ + mostlyclean-am + +distclean: distclean-am + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-libtool distclean-tags + +dvi: dvi-am + +dvi-am: + +info: info-am + +info-am: + +install-data-am: + +install-exec-am: install-pkglibLTLIBRARIES + +install-info: install-info-am + +install-man: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: uninstall-info-am uninstall-pkglibLTLIBRARIES + +.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ + clean-libtool clean-pkglibLTLIBRARIES ctags distclean \ + distclean-compile distclean-generic distclean-libtool \ + distclean-tags distdir dvi dvi-am info info-am install \ + install-am install-data install-data-am install-exec \ + install-exec-am install-info install-info-am install-man \ + install-pkglibLTLIBRARIES install-strip installcheck \ + installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-compile \ + mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ + tags uninstall uninstall-am uninstall-info-am \ + uninstall-pkglibLTLIBRARIES + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. 
+.NOEXPORT: diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/Makefile.win32 b/debian/htdig/htdig-3.2.0b6/htcommon/Makefile.win32 new file mode 100644 index 00000000..62aef30c --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/Makefile.win32 @@ -0,0 +1,25 @@ + +TARGET = $(LIBDIR)/libcommon$(LIBSFX) + +# ---------------------------------------------------------------------------- +# add new library members to this list + +# ---------------------------------------------------------------------------- + +include ../Makedefs.win32 + +CXXSRC = DocumentDB.cc DocumentRef.cc HtWordReference.cc \ + HtWordList.cc defaults.cc HtURLCodec.cc URL.cc URLTrans.cc \ + HtZlibCodec.cc cgi.cc HtSGMLCodec.cc HtConfiguration.cc \ + HtURLRewriter.cc + +LXXSRC = conf_lexer.lxx +YXXSRC = conf_parser.yxx + +CPPFLAGS += -DYY_NEVER_INTERACTIVE -DHAVE_CONFIG_H -I../db -I../htlib -I../htword + +$(TARGET): $(OBJDIRDEP) $(LIBDIRDEP) $(OBJS) + $(AR) $(ARFLAGS) $(OBJS) + +include ../Makerules.win32 + diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/URL.cc b/debian/htdig/htdig-3.2.0b6/htcommon/URL.cc new file mode 100644 index 00000000..9ccbe5d5 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/URL.cc @@ -0,0 +1,936 @@ +// +// URL.cc +// +// URL: A URL parsing class, implementing as closely as possible the standard +// laid out in RFC2396 (e.g. http://www.faqs.org/rfcs/rfc2396.html) +// including support for multiple services. 
(schemes in the RFC) +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: URL.cc,v 1.16 2004/06/04 08:51:01 angusgb Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "URL.h" +#include "QuotedStringList.h" +#include "Dictionary.h" +#include "HtConfiguration.h" +#include "StringMatch.h" +#include "StringList.h" +#include "HtURLRewriter.h" + +#include <string.h> +#include <stdlib.h> +#include <stdio.h> + +#ifdef HAVE_STD +#include <fstream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <fstream.h> +#endif /* HAVE_STD */ + +#include <sys/types.h> +#include <ctype.h> + +#ifndef _MSC_VER /* _WIN32 */ +#include <sys/socket.h> +#include <netdb.h> +#include <arpa/inet.h> +#endif + +#define NNTP_DEFAULT_PORT 119 + +static Dictionary *slashCount = 0; + +//***************************************************************************** +// URL::URL() +// Default Constructor +// +URL::URL() +: _url(0), + _path(0), + _service(0), + _host(0), + _port(0), + _normal(0), + _hopcount(0), + _signature(0), + _user(0) +{ +} + + +//***************************************************************************** +// URL::URL(const URL& rhs) +// Copy constructor +// +URL::URL(const URL& rhs) +: _url(rhs._url), + _path(rhs._path), + _service(rhs._service), + _host(rhs._host), + _port(rhs._port), + _normal(rhs._normal), + _hopcount(rhs._hopcount), + _signature(rhs._signature), + _user(rhs._user) +{ +} + + +//***************************************************************************** +// URL::URL(const String &nurl) +// Construct a URL from a String (obviously parses the string passed in) +// +URL::URL(const String &nurl) +: _url(0), + _path(0), + _service(0), + _host(0), + 
_port(0), + _normal(0), + _hopcount(0), + _signature(0), + _user(0) +{ + parse(nurl); +} + + +//***************************************************************************** +// Assignment operator +const URL &URL::operator = (const URL &rhs) +{ + if (this == &rhs) + return *this; + + // Copy the attributes + _url = rhs._url; + _path = rhs._path; + _service = rhs._service; + _host = rhs._host; + _port = rhs._port; + _normal = rhs._normal; + _hopcount = rhs._hopcount; + _signature = rhs._signature; + _user = rhs._user; + + return *this; +} + +//***************************************************************************** +// URL::URL(const String &url, const URL &parent) +// Parse a reference given a parent url. This is needed to resolve relative +// references which do NOT have a full url. +// +URL::URL(const String &url, const URL &parent) +: _url(0), + _path(0), + _service(parent._service), + _host(parent._host), + _port(parent._port), + _normal(parent._normal), + _hopcount(parent._hopcount + 1), // Since this is one hop *after* the parent, we should account for this + _signature(parent._signature), + _user(parent._user) +{ + HtConfiguration* config= HtConfiguration::config(); + int allowspace = config->Boolean("allow_space_in_url", 0); + String temp; + const char *urp = url.get(); + while (*urp) + { + if (*urp == ' ' && temp.length() > 0 && allowspace) + { + // Replace space character with %20 if there's more non-space + // characters to come... + const char *s = urp+1; + while (*s && isspace(*s)) + s++; + if (*s) + temp << "%20"; + } + else if (!isspace(*urp)) + temp << *urp; + urp++; + } + char* ref = temp; + + // + // Strip any optional anchor from the reference. If, however, the + // reference contains CGI parameters after the anchor, the parameters + // will be moved left to replace the anchor. The overall effect is that + // the anchor is removed. + // Thanks goes to David Filiatrault <[email protected]> for suggesting + // this removal process. 
+ // + char *anchor = strchr(ref, '#'); + char *params = strchr(ref, '?'); + if (anchor) + { + *anchor = '\0'; + if (params) + { + if (anchor < params) + { + while (*params) + { + *anchor++ = *params++; + } + *anchor = '\0'; + } + } + } + + // + // If, after the removal of a possible '#' we have nothing left, + // we just want to use the base URL (we're on the same page but + // different anchors) + // + if (!*ref) + { + // We've already copied much of the info + _url = parent._url; + _path = parent._path; + // Since this is on the same page, we want the same hopcount + _hopcount = parent._hopcount; + return; + } + + // OK, now we need to work out what type of child URL this is + char *p = ref; + while (isalpha(*p)) // Skip through the service portion + p++; + int hasService = (*p == ':'); + // Why single out http? Shouldn't others be the same? + // Child URL of the form https:/child or ftp:child called "full" + // How about using slashes()? + if (hasService && ((strncmp(ref, "http://", 7) == 0) || + (strncmp(ref, "http:", 5) != 0))) + { + // + // No need to look at the parent url since this is a complete url... + // + parse(ref); + } + else if (strncmp(ref, "//", 2) == 0) + { + // look at the parent url's _service, to make this is a complete url... + String fullref(parent._service); + fullref << ':' << ref; + parse((char*)fullref); + } + else + { + if (hasService) + ref = p + 1; // Relative URL, skip "http:" + + if (*ref == '/') + { + // + // The reference is on the same server as the parent, but + // an absolute path was given... + // + _path = ref; + + // + // Get rid of loop-causing constructs in the path + // + normalizePath(); + } + else + { + // + // The reference is relative to the parent + // + + _path = parent._path; + int i = _path.indexOf('?'); + if (i >= 0) + { + _path.chop(_path.length() - i); + } + + // + // Remove any leading "./" sequences which could get us into + // recursive loops. 
+ // + while (strncmp(ref, "./", 2) == 0) + ref += 2; + + if (_path.last() == '/') + { + // + // Parent was a directory. Easy enough: just append + // the current ref to it + // + _path << ref; + } + else + { + // + // Parent was a file. We need to strip the last part + // of the path before we add the reference to it. + // + String temp = _path; + p = strrchr((char*)temp, '/'); + if (p) + { + p[1] = '\0'; + _path = temp.get(); + _path << ref; + } + else + { + // + // Something must be wrong since there were no '/' + // found in the parent url. + // + // We do nothing here. The new url is the parent. + // + } + } + + // + // Get rid of loop-causing constructs in the path + // + normalizePath(); + } + + // + // Build the url. (Note, the host name has NOT been normalized!) + // No need for this if we have called URL::parse. + // + constructURL(); + } +} + + +//***************************************************************************** +// void URL::rewrite() +// +void URL::rewrite() +{ + if (HtURLRewriter::instance()->replace(_url) > 0) + parse(_url.get()); +} + + +//***************************************************************************** +// void URL::parse(const String &u) +// Given a URL string, extract the service, host, port, and path from it. +// +void URL::parse(const String &u) +{ + HtConfiguration* config= HtConfiguration::config(); + int allowspace = config->Boolean("allow_space_in_url", 0); + String temp; + const char *urp = u.get(); + while (*urp) + { + if (*urp == ' ' && temp.length() > 0 && allowspace) + { + // Replace space character with %20 if there's more non-space + // characters to come... + const char *s = urp+1; + while (*s && isspace(*s)) + s++; + if (*s) + temp << "%20"; + } + else if (!isspace(*urp)) + temp << *urp; + urp++; + } + char *nurl = temp; + + // + // Ignore any part of the URL that follows the '#' since this is just + // an index into a document. 
+ // + char *p = strchr(nurl, '#'); + if (p) + *p = '\0'; + + // Some members need to be reset. If not, the caller would + // have used URL::URL(char *ref, URL &parent) + // (which may call us, if the URL is found to be absolute). + _normal = 0; + _signature = 0; + _user = 0; + + // + // Extract the service + // + p = strchr(nurl, ':'); + if (p) + { + _service = strtok(nurl, ":"); + p = strtok(0, "\n"); + } + else + { + _service = "http"; + p = strtok(nurl, "\n"); + } + _service.lowercase(); + + // + // Extract the host + // + if (!p || strncmp(p, "//", 2) != 0) + { + // No host specified, it's all a path. + _host = 0; + _port = 0; + _url = 0; + if (p) // if non-NULL, skip (some) leading slashes in path + { + int i; + for (i = slashes (_service); i > 0 && *p == '/'; i--) + p++; + if (i) // if fewer slashes than specified for protocol don't + // delete any. -> Backwards compatible (necessary??) + p -= slashes (_service) - i; + } + _path = p; + if (strcmp((char*)_service, "file") == 0 || slashes (_service) < 2) + _host = "localhost"; + } + else + { + p += 2; + + // + // p now points to the host + // + char *q = strchr(p, ':'); + char *slash = strchr(p, '/'); + + _path = "/"; + if (strcmp((char*)_service, "file") == 0) + { + // These should be of the form file:/// (i.e. no host) + // if there is a file://host/path then strip the host + if (strncmp(p, "/", 1) != 0) + { + p = strtok(p, "/"); + _path << strtok(0, "\n"); + } + else + _path << strtok(p+1, "\n"); // _path is "/" - don't double + _host = "localhost"; + _port = 0; + } + else if (q && ((slash && slash > q) || !slash)) + { + _host = strtok(p, ":"); + p = strtok(0, "/"); + if (p) + _port = atoi(p); + if (!p || _port <= 0) + _port = DefaultPort(); + // + // The rest of the input string is the path. + // + _path << strtok(0, "\n"); + + } + else + { + _host = strtok(p, "/"); + _host.chop(" \t"); + _port = DefaultPort(); + + // + // The rest of the input string is the path. 
+ // + _path << strtok(0, "\n"); + + } + + // Check to see if host contains a user@ portion + int atMark = _host.indexOf('@'); + if (atMark != -1) + { + _user = _host.sub(0, atMark); + _host = _host.sub(atMark + 1); + } + } + + // + // Get rid of loop-causing constructs in the path + // + normalizePath(); + + // + // Build the url. (Note, the host name has NOT been normalized!) + // + constructURL(); +} + + +//***************************************************************************** +// void URL::normalizePath() +// Called from: URL(const String &url, const URL &parent) +// +void URL::normalizePath() +{ + // + // Rewrite the path to be the minimal. + // Remove "//", "/../" and "/./" components + // + HtConfiguration* config= HtConfiguration::config(); + + int i, limit; + int leadingdotdot = 0; + String newPath; + int pathend = _path.indexOf('?'); // Don't mess up query strings. + if (pathend < 0) + pathend = _path.length(); + + // + // get rid of "//" first, or "/foo//../" will become "/foo/" not "/" + // Some database lookups interpret empty paths (// != /), so give + // the use the option to turn this off. + // + if (!config->Boolean ("allow_double_slash")) + while ((i = _path.indexOf("//")) >= 0 && i < pathend) + { + newPath = _path.sub(0, i).get(); + newPath << _path.sub(i + 1).get(); + _path = newPath; + pathend = _path.indexOf('?'); + if (pathend < 0) + pathend = _path.length(); + } + + // + // Next get rid of redundant "/./". This could cause infinite + // loops. 
Moreover, "/foo/./../" should become "/", not "/foo/" + // + while ((i = _path.indexOf("/./")) >= 0 && i < pathend) + { + newPath = _path.sub(0, i).get(); + newPath << _path.sub(i + 2).get(); + _path = newPath; + pathend = _path.indexOf('?'); + if (pathend < 0) + pathend = _path.length(); + } + if ((i = _path.indexOf("/.")) >= 0 && i == pathend-2) + { + newPath = _path.sub(0, i+1).get(); // keep trailing slash + newPath << _path.sub(i + 2).get(); + _path = newPath; + pathend--; + } + + // + // Now that "empty" path components are gone, remove ("/../"). + // + while ((i = _path.indexOf("/../")) >= 0 && i < pathend) + { + if ((limit = _path.lastIndexOf('/', i - 1)) >= 0) + { + newPath = _path.sub(0, limit).get(); + newPath << _path.sub(i + 3).get(); + _path = newPath; + } + else + { + _path = _path.sub(i + 3).get(); + leadingdotdot++; + } + pathend = _path.indexOf('?'); + if (pathend < 0) + pathend = _path.length(); + } + if ((i = _path.indexOf("/..")) >= 0 && i == pathend-3) + { + if ((limit = _path.lastIndexOf('/', i - 1)) >= 0) + newPath = _path.sub(0, limit+1).get(); // keep trailing slash + else + { + newPath = '/'; + leadingdotdot++; + } + newPath << _path.sub(i + 3).get(); + _path = newPath; + pathend = _path.indexOf('?'); + if (pathend < 0) + pathend = _path.length(); + } + // The RFC gives us a choice of what to do when we have .. left and + // we're at the top level. By principle of least surprise, we'll just + // toss any "leftovers" Otherwise, we'd have a loop here to add them. 
+ + // Finally change all "%7E" to "~" for sanity + while ((i = _path.indexOf("%7E")) >= 0 && i < pathend) + { + newPath = _path.sub(0, i).get(); + newPath << "~"; + newPath << _path.sub(i + 3).get(); + _path = newPath; + pathend = _path.indexOf('?'); + if (pathend < 0) + pathend = _path.length(); + } + + // If the server *isn't* case sensitive, we want to lowercase the path + if (!config->Boolean("case_sensitive", 1)) + _path.lowercase(); + + // And don't forget to remove index.html or similar file. +// if (strcmp((char*)_service, "file") != 0) (check is now internal) + removeIndex(_path, _service); +} + +//***************************************************************************** +// void URL::dump() +// +void URL::dump() +{ + cout << "service = " << _service.get() << endl; + cout << "user = " << _user.get() << endl; + cout << "host = " << _host.get() << endl; + cout << "port = " << _port << endl; + cout << "path = " << _path << endl; + cout << "url = " << _url << endl; +} + + +//***************************************************************************** +// void URL::path(const String &newpath) +// +void URL::path(const String &newpath) +{ + HtConfiguration* config= HtConfiguration::config(); + _path = newpath; + if (!config->Boolean("case_sensitive",1)) + _path.lowercase(); + constructURL(); +} + + +//***************************************************************************** +// void URL::removeIndex(String &path, String &service) +// Attempt to remove the remove_default_doc from the end of a URL path if +// the service allows that. (File, ftp don't. Do others?) +// This needs to be done to normalize the paths and make .../ the +// same as .../index.html +// Called from: URL::normalize() from URL::signature() [redundant?] 
+// URL::normalizePath() +// +void URL::removeIndex(String &path, String &service) +{ + HtConfiguration* config= HtConfiguration::config(); + static StringMatch *defaultdoc = 0; + + if (strcmp((char*)_service, "file") == 0 || + strcmp((char*)_service, "ftp") == 0) + return; + + if (path.length() == 0 || strchr((char*)path, '?')) + return; + + int filename = path.lastIndexOf('/') + 1; + if (filename == 0) + return; + + if (! defaultdoc) + { + StringList l(config->Find("remove_default_doc"), " \t"); + defaultdoc = new StringMatch(); + defaultdoc->IgnoreCase(); + defaultdoc->Pattern(l.Join('|')); + } + int which, length; + if (defaultdoc->hasPattern() && + defaultdoc->CompareWord((char*)path.sub(filename), which, length) && + filename+length == path.length()) + path.chop(path.length() - filename); +} + + +//***************************************************************************** +// void URL::normalize() +// Make sure that URLs are always in the same format. +// +void URL::normalize() +{ + HtConfiguration* config= HtConfiguration::config(); + static int hits = 0, misses = 0; + + if (_service.length() == 0 || _normal) + return; + + +// if (strcmp((char*)_service, "http") != 0) + // if service specifies "doesn't specify an IP host", don't normalize it + if (slashes (_service) != 2) + return; + +// if (strcmp ((char*)_service, "http") == 0) (check is now internal) + removeIndex(_path, _service); + + // + // Convert a hostname to an IP address + // + _host.lowercase(); + + if (!config->Boolean("allow_virtual_hosts", 1)) + { + static Dictionary hostbyname; + unsigned long addr; + struct hostent *hp; + + String *ip = (String *) hostbyname[_host]; + if (ip) + { + memcpy((char *) &addr, ip->get(), ip->length()); + hits++; + } + else + { + addr = inet_addr(_host.get()); + if (addr == 0xffffffff) + { + hp = gethostbyname(_host.get()); + if (hp == NULL) + { + return; + } + memcpy((char *)&addr, (char *)hp->h_addr, hp->h_length); + ip = new String((char *) &addr, 
hp->h_length); + hostbyname.Add(_host, ip); + misses++; + } + } + + static Dictionary machines; + String key; + key << int(addr); + String *realname = (String *) machines[key]; + if (realname) + _host = realname->get(); + else + machines.Add(key, new String(_host)); + } + ServerAlias(); + + // + // Reconstruct the url + // + constructURL(); + _normal = 1; + _signature = 0; +} + + +//***************************************************************************** +// const String &URL::signature() +// Return a string which uniquely identifies the server the current +// URL is refering to. +// This is the first portion of a url: service://user@host:port/ +// (in short this is the URL pointing to the root of this server) +// +const String &URL::signature() +{ + if (_signature.length()) + return _signature; + + if (!_normal) + normalize(); + _signature = _service; + _signature << "://"; + if (_user.length()) + _signature << _user << '@'; + _signature << _host; + _signature << ':' << _port << '/'; + return _signature; +} + +//***************************************************************************** +// void URL::ServerAlias() +// Takes care of the server aliases, which attempt to simplify virtual +// host problems +// +void URL::ServerAlias() +{ + HtConfiguration* config= HtConfiguration::config(); + static Dictionary *serveraliases= 0; + + if (! serveraliases) + { + String l= config->Find("server_aliases"); + String from, *to; + serveraliases = new Dictionary(); + char *p = strtok(l, " \t"); + char *salias= NULL; + while (p) + { + salias = strchr(p, '='); + if (! 
salias) + { + p = strtok(0, " \t"); + continue; + } + *salias++= '\0'; + from = p; + from.lowercase(); + if (from.indexOf(':') == -1) + from.append(":80"); + to= new String(salias); + to->lowercase(); + if (to->indexOf(':') == -1) + to->append(":80"); + serveraliases->Add(from.get(), to); + // fprintf (stderr, "Alias: %s->%s\n", from.get(), to->get()); + p = strtok(0, " \t"); + } + } + + String *al= 0; + int newport; + int delim; + String serversig = _host; + serversig << ':' << _port; + if ((al= (String *) serveraliases->Find(serversig))) + { + delim= al->indexOf(':'); + // fprintf(stderr, "\nOld URL: %s->%s\n", (char *) serversig, (char *) *al); + _host= al->sub(0,delim).get(); + sscanf((char*)al->sub(delim+1), "%d", &newport); + _port= newport; + // fprintf(stderr, "New URL: %s:%d\n", (char *) _host, _port); + } +} + +//***************************************************************************** +// int URL::slash(const String &protocol) +// Returns number of slashes folowing the service name for protocol +// +int +URL::slashes(const String &protocol) +{ + if (!slashCount) + { + HtConfiguration* config= HtConfiguration::config(); + slashCount = new Dictionary(); + + slashCount->Add (String("mailto"), new String("0")); + slashCount->Add (String("news"), new String("0")); + slashCount->Add (String("http"), new String("2")); + slashCount->Add (String("ftp"), new String("2")); + // file:/// has three, but the last counts as part of the path... + slashCount->Add (String("file"), new String("2")); + + QuotedStringList qsl(config->Find("external_protocols"), " \t"); + String from; + int i; + int sep,colon; + + for (i = 0; qsl[i]; i += 2) + { + from = qsl[i]; + sep = from.indexOf("->"); + if (sep != -1) + from = from.sub(0, sep).get(); // "get" aids portability... + + colon = from.indexOf(":"); + // if service specified as "help:/" or "man:", note trailing slashes + // Default is 2. 
+ if (colon != -1) + { + int i; + char count [2]; + for (i = colon+1; from[i] == '/'; i++) + ; + count [0] = i - colon + '0' - 1; + count [1] = '\0'; + from = from.sub(0,colon).get(); + slashCount->Add (from, new String (count)); + } else + slashCount->Add (from, new String ("2")); + } + } + + // Default to two slashes for unknown protocols + String *count = (String *)slashCount->Find(protocol); + return count ? (count->get()[0] - '0') : 2; +} + +//***************************************************************************** +// void URL::constructURL() +// Constructs the _url member from everything else +// Also ensures the port number is correct for the service +// Called from URL::URL(const String &url, const URL &parent) +// URL::parse(const String &u) +// URL::path(const String &newpath) +// URL::normalize() +// +void URL::constructURL() +{ + if (strcmp((char*)_service, "file") != 0 && _host.length() == 0) { + _url = ""; + return; + } + + _url = _service; + _url << ":"; + + // Add correct number of slashes after service name + int i; + for (i = slashes (_service); i > 0; i--) + { + _url << "/"; + } + + if (slashes (_service) == 2) // services specifying a particular + { // IP host must begin "service://" + if (strcmp((char*)_service, "file") != 0) + { + if (_user.length()) + _url << _user << '@'; + _url << _host; + } + + if (_port != DefaultPort() && _port != 0) // Different than the default port + _url << ':' << _port; + } + + _url << _path; +} + + +/////// + // Get the default port for the recognised service +/////// + +int URL::DefaultPort() +{ + if (strcmp((char*)_service, "http") == 0) + return 80; + else if (strcmp((char*)_service, "https") == 0) + return 443; + else if (strcmp((char*)_service, "ftp") == 0) + return 21; + else if (strcmp((char*)_service, "gopher") == 0) + return 70; + else if (strcmp((char*)_service, "file") == 0) + return 0; + else if (strcmp((char*)_service, "news") == 0) + return NNTP_DEFAULT_PORT; + else return 80; +} diff --git 
a/debian/htdig/htdig-3.2.0b6/htcommon/URL.h b/debian/htdig/htdig-3.2.0b6/htcommon/URL.h new file mode 100644 index 00000000..4cea16ee --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/URL.h @@ -0,0 +1,100 @@ +// +// URL.h +// +// URL: A URL parsing class, implementing as closely as possible the standard +// laid out in RFC2396 (e.g. http://www.faqs.org/rfcs/rfc2396.html) +// including support for multiple schemes. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: URL.h,v 1.8 2004/05/28 13:15:12 lha Exp $ +// + +#ifndef _URL_h_ +#define _URL_h_ + +#include "htString.h" + +class URL +{ +public: + URL(); + URL(const String &url); + URL(const URL& rhs); + URL(const String &ref, const URL &parent); + + void parse(const String &url); + + const String &host() const {return _host;} + void host(const String &h) {_host = h;} + + int port() const {return _port;} + void port(const int p) {_port = p;} + int DefaultPort(); + + const String &service() const {return _service;} + void service(const String &s) {_service = s;} + + const String &path() const {return _path;} + void path(const String &p); + + int hopcount() const {return _hopcount;} + void hopcount(int h) {_hopcount = h;} + + const String &user() const {return _user;} + void user(const String &u) {_user = u;} + + const String &get() const {return _url;} + void dump(); + void normalize(); + void rewrite(); + const String &signature(); + + const URL &operator = (const URL &rhs); + +private: + String _url; + String _path; + String _service; + String _host; + int _port; + int _normal; + int _hopcount; + String _signature; + String _user; + + void removeIndex(String &, String &); + void normalizePath(); + void ServerAlias(); + void constructURL(); + // Number of 
slashes following service specifier. eg service("http")=2 + static int slashes(const String &); +}; + + +// Unreserved punctuation allowed unencoded in URLs. We use a more restricted +// list of unreserved characters than allowed by RFC 2396 (which revises and +// replaces RFC 1738), because it can't hurt to encode any of these +// characters, and they can pose problems in some contexts. RFC 2396 says +// that only alphanumerics, the unreserved characters "-_.!~*'(),", and +// reserved characters used for their reserved purposes may be used +// unencoded within a URL. We encode reserved characters because we now +// encode URL parameter values individually before piecing together the whole +// query string using reserved characters. + +#define UNRESERVED "-_.!~*" + +//String &encodeURL(String &, char *valid = "?_@.=&/:"); +//String &encodeURL(String &, char *reserved = ";/?:@&=+$,"); +// char *unreserved = "-_.!~*'()"); +String &encodeURL(String &, char *valid = (char *)UNRESERVED); + +String &decodeURL(String &); + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/URLTrans.cc b/debian/htdig/htdig-3.2.0b6/htcommon/URLTrans.cc new file mode 100644 index 00000000..82515177 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/URLTrans.cc @@ -0,0 +1,93 @@ +// +// URLTrans.cc +// +// URLTrans: Helper functions for the implementation of the URL class. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: URLTrans.cc,v 1.5 2004/05/28 13:15:12 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "URL.h" +#include "htString.h" +#include "lib.h" + +#include <ctype.h> + + +//***************************************************************************** +// String &decodeURL(String &str) +// Convert the given URL string to a normal string. This means that +// all escaped characters are converted to their normal values. The +// escape character is '%' and is followed by 2 hex digits +// representing the octet. +// +String &decodeURL(String &str) +{ + String temp; + char *p; + + for (p = str; p && *p; p++) + { + if (*p == '%') + { + // + // 2 hex digits follow... + // + int value = 0; + for (int i = 0; p[1] && i < 2; i++) + { + p++; + value <<= 4; + if (isdigit(*p)) + value += *p - '0'; + else + value += toupper(*p) - 'A' + 10; + } + temp << char(value); + } + else + temp << *p; + } + str = temp; + return (str); +} + + +//***************************************************************************** +// String &encodeURL(String &str, char *valid) +// Convert a normal string to a URL 'safe' string. This means that +// all characters not explicitly mentioned in the URL BNF will be +// escaped. The escape character is '%' and is followed by 2 hex +// digits representing the octet. 
+// +String &encodeURL(String &str, char *valid) +{ + String temp; + static char *digits = "0123456789ABCDEF"; + char *p; + + for (p = str; p && *p; p++) + { + if (isascii(*p) && (isdigit(*p) || isalpha(*p) || strchr(valid, *p))) + temp << *p; + else + { + temp << '%'; + temp << digits[(*p >> 4) & 0x0f]; + temp << digits[*p & 0x0f]; + } + } + str = temp; + return (str); +} + + diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/cgi.cc b/debian/htdig/htdig-3.2.0b6/htcommon/cgi.cc new file mode 100644 index 00000000..557d90ee --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/cgi.cc @@ -0,0 +1,213 @@ +// +// cgi.cc +// +// cgi: Parse cgi arguments and put them in a dictionary. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: cgi.cc,v 1.9 2004/05/28 13:15:12 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#ifdef _MSC_VER /* _WIN32 */ +#include <io.h> +#endif + +#include "cgi.h" +#include "htString.h" +#include "Dictionary.h" +#include "good_strtok.h" +#include "StringList.h" +#include "URL.h" + +#include <stdlib.h> + +#ifndef _MSC_VER /* _WIN32 */ +#include <unistd.h> +#endif + +#ifdef HAVE_STD +#include <fstream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <fstream.h> +#endif /* HAVE_STD */ + +//***************************************************************************** +// cgi::cgi() +// +cgi::cgi() +{ + init(""); +} + + +//***************************************************************************** +// cgi::cgi(char *s) +// +cgi::cgi(char *s) +{ + init(s); +} + + +//***************************************************************************** +// void cgi::init(char *s) +// +void +cgi::init(char *s) +{ + pairs = new Dictionary; + + int i; 
+ String method(getenv("REQUEST_METHOD")); + + if ((!s || !*s) && method.length() == 0) + { + // + // Interactive mode + // + query = 1; + return; + } + query = 0; + String results; + + if (s && *s && method.length() == 0) + { + results = s; + } + else if (strcmp((char*)method, "GET") == 0) + { + results = getenv("QUERY_STRING"); + } + else + { + int n; + char *buf; + + buf = getenv("CONTENT_LENGTH"); + if (!buf || !*buf || (n = atoi(buf)) <= 0) + return; // null query + buf = new char[n + 1]; + int r, i = 0; + while (i < n && (r = read(0, buf+i, n-i)) > 0) + i += r; + buf[i] = '\0'; + results = buf; + delete [] buf; + } + + // + // Now we need to split the line up into name/value pairs + // + StringList list(results, "&;"); + + // + // Each name/value pair now needs to be added to the dictionary + // + for (i = 0; i < list.Count(); i++) + { + char *name = good_strtok(list[i], '='); + String value(good_strtok(NULL, '\n')); + value.replace('+', ' '); + decodeURL(value); + String *str = (String *) pairs->Find(name); + if (str) + { + // + // Entry was already there. Append it to the string. + // + str->append('\001'); + str->append(value); + } + else + { + // + // New entry. 
Add a new string + // + pairs->Add(name, new String(value)); + } + } +} + + +//***************************************************************************** +// cgi::~cgi() +// +cgi::~cgi() +{ + delete pairs; +} + + +//***************************************************************************** +// char *cgi::operator [] (char *name) +// +char *cgi::operator [] (char *name) +{ + return get(name); +} + + +//***************************************************************************** +// char *cgi::get(char *name) +// +char *cgi::get(char *name) +{ + String *str = (String *) (*pairs)[name]; + if (str) + return str->get(); + else + { + if (query) + { + char buffer[1000]; + cerr << "Enter value for " << name << ": "; + cin.getline(buffer, sizeof(buffer)); + pairs->Add(name, new String(buffer)); + str = (String *) (*pairs)[name]; + return str->get(); + } + return 0; + } +} + + +//***************************************************************************** +// int cgi::exists(char *name) +// +int +cgi::exists(char *name) +{ + return pairs->Exists(name); +} + +//***************************************************************************** +// char *cgi::path() +// +char *cgi::path() +{ + static char buffer[1000] = ""; + + if (query) + { + if (*buffer) + return buffer; + cerr << "Enter PATH_INFO: "; + cin.getline(buffer, sizeof(buffer)); + return buffer; + } + return getenv("PATH_INFO"); +} + + diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/cgi.h b/debian/htdig/htdig-3.2.0b6/htcommon/cgi.h new file mode 100644 index 00000000..c1232f05 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/cgi.h @@ -0,0 +1,40 @@ +// +// cgi.h +// +// cgi: Parse cgi arguments and put them in a dictionary. 
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: cgi.h,v 1.4 2004/05/28 13:15:12 lha Exp $
+//
+
+#ifndef _cgi_h_
+#define _cgi_h_
+
+class Dictionary;
+
+class cgi
+{
+public:
+    cgi(); // parse from QUERY_STRING / POST body (or go interactive)
+    cgi(char *s); // parse the given query string instead of the environment
+    ~cgi();
+
+    char *operator [] (char *); // same as get()
+    char *get(char *); // value for name, or NULL; multiple values are joined with '\001'
+    int exists(char *); // non-zero iff the named parameter was parsed
+    char *path(); // PATH_INFO from the environment (prompted for in interactive mode)
+
+private:
+    Dictionary *pairs; // name -> String* map of parsed parameters
+    int query; // 1 = interactive mode (no CGI input): prompt on stdin for missing values
+    void init(char *s); // shared constructor body: splits on '&'/';' and URL-decodes values
+};
+
+#endif
+
+
diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/conf_lexer.cxx b/debian/htdig/htdig-3.2.0b6/htcommon/conf_lexer.cxx
new file mode 100644
index 00000000..5ac1ceaf
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htcommon/conf_lexer.cxx
@@ -0,0 +1,2006 @@
+#line 2 "conf_lexer.cxx"
+
+#line 4 "conf_lexer.cxx"
+
+#define YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 5
+#define YY_FLEX_SUBMINOR_VERSION 31
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+/* First, we deal with platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. 
*/ + +#if defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L +#include <inttypes.h> +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; +#endif /* ! C99 */ + +/* Limits of integral types. */ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#endif /* ! FLEXINT_H */ + +#ifdef __cplusplus + +/* The "const" storage-class-modifier is valid. */ +#define YY_USE_CONST + +#else /* ! __cplusplus */ + +#if __STDC__ + +#define YY_USE_CONST + +#endif /* __STDC__ */ +#endif /* ! __cplusplus */ + +#ifdef YY_USE_CONST +#define yyconst const +#else +#define yyconst +#endif + +/* Returned upon end-of-file. */ +#define YY_NULL 0 + +/* Promotes a possibly negative, possibly signed char to an unsigned + * integer for use as an array index. If the signed char is negative, + * we want to instead treat it as an 8-bit unsigned char, hence the + * double cast. + */ +#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c) + +/* Enter a start condition. This macro really ought to take a parameter, + * but we do it the disgusting crufty way forced on us by the ()-less + * definition of BEGIN. 
+ */ +#define BEGIN (yy_start) = 1 + 2 * + +/* Translate the current start state into a value that can be later handed + * to BEGIN to return to the state. The YYSTATE alias is for lex + * compatibility. + */ +#define YY_START (((yy_start) - 1) / 2) +#define YYSTATE YY_START + +/* Action number for EOF rule of a given start state. */ +#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1) + +/* Special action meaning "start processing a new file". */ +#define YY_NEW_FILE yyrestart(yyin ) + +#define YY_END_OF_BUFFER_CHAR 0 + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#define YY_BUF_SIZE 16384 +#endif + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +extern int yyleng; + +extern FILE *yyin, *yyout; + +#define EOB_ACT_CONTINUE_SCAN 0 +#define EOB_ACT_END_OF_FILE 1 +#define EOB_ACT_LAST_MATCH 2 + + /* Note: We specifically omit the test for yy_rule_can_match_eol because it requires + * access to the local variable yy_act. Since yyless() is a macro, it would break + * existing scanners that call yyless() from OUTSIDE yylex. + * One obvious solution it to make yy_act a global. I tried that, and saw + * a 5% performance hit in a non-yylineno scanner, because yy_act is + * normally declared as a register variable-- so it is not worth it. + */ + #define YY_LESS_LINENO(n) \ + do { \ + int yyl;\ + for ( yyl = n; yyl < yyleng; ++yyl )\ + if ( yytext[yyl] == '\n' )\ + --yylineno;\ + }while(0) + +/* Return all but the first "n" matched characters back to the input stream. */ +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. 
*/ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + *yy_cp = (yy_hold_char); \ + YY_RESTORE_YY_MORE_OFFSET \ + (yy_c_buf_p) = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \ + YY_DO_BEFORE_ACTION; /* set up yytext again */ \ + } \ + while ( 0 ) + +#define unput(c) yyunput( c, (yytext_ptr) ) + +/* The following is because we cannot portably get our hands on size_t + * (without autoconf's help, which isn't available because we want + * flex-generated scanners to compile on their own). + */ + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef unsigned int yy_size_t; +#endif + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + yy_size_t yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. 
+ */ + int yy_fill_buffer; + + int yy_buffer_status; + +#define YY_BUFFER_NEW 0 +#define YY_BUFFER_NORMAL 1 + /* When an EOF's been seen but there's still some text to process + * then we mark the buffer as YY_EOF_PENDING, to indicate that we + * shouldn't try reading from the input source any more. We might + * still have a bunch of tokens to match, though, because of + * possible backing-up. + * + * When we actually see the EOF, we change the status to "new" + * (via yyrestart()), so that the user can continue scanning by + * just pointing yyin at a new input file. + */ +#define YY_BUFFER_EOF_PENDING 2 + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +/* Stack of input buffers. */ +static size_t yy_buffer_stack_top = 0; /**< index of top of stack. */ +static size_t yy_buffer_stack_max = 0; /**< capacity of stack. */ +static YY_BUFFER_STATE * yy_buffer_stack = 0; /**< Stack as an array. */ + +/* We provide macros for accessing buffer states in case in the + * future we want to put the buffer states in a more general + * "scanner state". + * + * Returns the top of the stack, or NULL. + */ +#define YY_CURRENT_BUFFER ( (yy_buffer_stack) \ + ? (yy_buffer_stack)[(yy_buffer_stack_top)] \ + : NULL) + +/* Same as previous macro, but useful when we know that the buffer stack is not + * NULL or when we need an lvalue. For internal use only. + */ +#define YY_CURRENT_BUFFER_LVALUE (yy_buffer_stack)[(yy_buffer_stack_top)] + +/* yy_hold_char holds the character lost when yytext is formed. */ +static char yy_hold_char; +static int yy_n_chars; /* number of characters read into yy_ch_buf */ +int yyleng; + +/* Points to current character in buffer. */ +static char *yy_c_buf_p = (char *) 0; +static int yy_init = 1; /* whether we need to initialize */ +static int yy_start = 0; /* start state number */ + +/* Flag which is used to allow yywrap()'s to do buffer switches + * instead of setting up a fresh yyin. A bit of a hack ... 
+ */ +static int yy_did_buffer_switch_on_eof; + +void yyrestart (FILE *input_file ); +void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ); +YY_BUFFER_STATE yy_create_buffer (FILE *file,int size ); +void yy_delete_buffer (YY_BUFFER_STATE b ); +void yy_flush_buffer (YY_BUFFER_STATE b ); +void yypush_buffer_state (YY_BUFFER_STATE new_buffer ); +void yypop_buffer_state (void ); + +static void yyensure_buffer_stack (void ); +static void yy_load_buffer_state (void ); +static void yy_init_buffer (YY_BUFFER_STATE b,FILE *file ); + +#define YY_FLUSH_BUFFER yy_flush_buffer(YY_CURRENT_BUFFER ) + +YY_BUFFER_STATE yy_scan_buffer (char *base,yy_size_t size ); +YY_BUFFER_STATE yy_scan_string (yyconst char *yy_str ); +YY_BUFFER_STATE yy_scan_bytes (yyconst char *bytes,int len ); + +void *yyalloc (yy_size_t ); +void *yyrealloc (void *,yy_size_t ); +void yyfree (void * ); + +#define yy_new_buffer yy_create_buffer + +#define yy_set_interactive(is_interactive) \ + { \ + if ( ! YY_CURRENT_BUFFER ){ \ + yyensure_buffer_stack (); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer(yyin,YY_BUF_SIZE ); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \ + } + +#define yy_set_bol(at_bol) \ + { \ + if ( ! 
YY_CURRENT_BUFFER ){\ + yyensure_buffer_stack (); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer(yyin,YY_BUF_SIZE ); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \ + } + +#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol) + +/* Begin user sect3 */ + +#define yywrap(n) 1 +#define YY_SKIP_YYWRAP + +typedef unsigned char YY_CHAR; + +FILE *yyin = (FILE *) 0, *yyout = (FILE *) 0; + +typedef int yy_state_type; + +extern int yylineno; + +int yylineno = 1; + +extern char *yytext; +#define yytext_ptr yytext + +static yy_state_type yy_get_previous_state (void ); +static yy_state_type yy_try_NUL_trans (yy_state_type current_state ); +static int yy_get_next_buffer (void ); +static void yy_fatal_error (yyconst char msg[] ); + +/* Done after the current pattern has been matched and before the + * corresponding action - sets up yytext. + */ +#define YY_DO_BEFORE_ACTION \ + (yytext_ptr) = yy_bp; \ + yyleng = (size_t) (yy_cp - yy_bp); \ + (yy_hold_char) = *yy_cp; \ + *yy_cp = '\0'; \ + (yy_c_buf_p) = yy_cp; + +#define YY_NUM_RULES 22 +#define YY_END_OF_BUFFER 23 +/* This struct is not used in this scanner, + but its presence is necessary. 
*/ +struct yy_trans_info + { + flex_int32_t yy_verify; + flex_int32_t yy_nxt; + }; +static yyconst flex_int16_t yy_accept[63] = + { 0, + 0, 0, 16, 16, 0, 0, 0, 0, 0, 0, + 23, 21, 3, 20, 11, 5, 8, 8, 3, 2, + 19, 21, 3, 18, 14, 14, 10, 14, 17, 6, + 7, 9, 3, 8, 8, 3, 2, 19, 0, 0, + 1, 3, 14, 14, 0, 14, 14, 10, 15, 17, + 9, 8, 12, 13, 13, 8, 8, 8, 8, 0, + 4, 0 + } ; + +static yyconst flex_int32_t yy_ec[256] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, + 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 5, 6, 7, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 8, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 10, 5, 11, + 5, 12, 5, 5, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 5, 14, 5, 5, 13, 5, 13, 13, 15, 16, + + 17, 13, 13, 13, 18, 13, 13, 19, 13, 20, + 13, 13, 13, 13, 13, 13, 21, 13, 13, 13, + 13, 13, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5 + } ; + +static yyconst flex_int32_t yy_meta[22] = + { 0, + 1, 2, 3, 1, 4, 4, 4, 5, 5, 4, + 6, 6, 5, 4, 5, 5, 5, 5, 5, 5, + 5 + } ; + +static yyconst flex_int16_t yy_base[73] = + { 0, + 0, 20, 27, 41, 55, 59, 61, 0, 78, 89, + 87, 183, 84, 183, 183, 183, 0, 65, 100, 82, + 0, 80, 80, 183, 64, 107, 23, 32, 0, 183, + 183, 0, 75, 0, 61, 0, 0, 0, 72, 68, + 183, 68, 54, 35, 61, 121, 62, 25, 183, 0, + 0, 35, 183, 47, 0, 31, 35, 9, 38, 64, + 183, 183, 135, 141, 147, 20, 153, 156, 162, 165, + 171, 176 + } ; + +static yyconst flex_int16_t yy_def[73] = + { 0, + 62, 1, 63, 63, 64, 64, 1, 7, 65, 65, + 62, 62, 62, 62, 62, 62, 66, 66, 62, 19, + 20, 67, 62, 62, 68, 69, 68, 68, 70, 62, + 62, 71, 62, 66, 66, 19, 20, 
20, 67, 67, + 62, 62, 68, 68, 72, 69, 46, 68, 62, 70, + 71, 66, 62, 72, 46, 66, 66, 66, 66, 62, + 62, 0, 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62 + } ; + +static yyconst flex_int16_t yy_nxt[205] = + { 0, + 12, 13, 14, 12, 12, 12, 12, 12, 12, 15, + 16, 12, 17, 12, 17, 17, 17, 18, 17, 17, + 17, 19, 20, 21, 34, 59, 22, 12, 23, 24, + 12, 48, 26, 48, 49, 27, 44, 53, 44, 60, + 28, 12, 23, 24, 12, 44, 26, 61, 44, 27, + 58, 57, 54, 56, 28, 12, 13, 12, 12, 12, + 13, 12, 12, 24, 53, 60, 54, 44, 30, 42, + 41, 12, 31, 61, 41, 52, 33, 44, 17, 13, + 12, 42, 41, 38, 35, 33, 62, 62, 12, 12, + 13, 12, 62, 62, 62, 62, 62, 62, 62, 12, + + 12, 36, 37, 38, 62, 62, 39, 45, 45, 62, + 45, 62, 62, 62, 62, 62, 62, 62, 62, 62, + 47, 45, 45, 62, 45, 62, 55, 62, 62, 62, + 62, 62, 62, 62, 47, 25, 25, 25, 25, 25, + 25, 29, 29, 29, 29, 29, 29, 32, 32, 32, + 32, 32, 32, 40, 40, 40, 40, 40, 40, 43, + 43, 43, 46, 46, 62, 46, 46, 46, 50, 50, + 50, 51, 62, 62, 51, 51, 45, 45, 62, 45, + 45, 45, 11, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, + + 62, 62, 62, 62 + } ; + +static yyconst flex_int16_t yy_chk[205] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 66, 58, 2, 3, 3, 3, + 3, 27, 3, 48, 28, 3, 27, 44, 48, 59, + 3, 4, 4, 4, 4, 28, 4, 59, 44, 4, + 57, 56, 54, 52, 4, 5, 5, 5, 5, 6, + 6, 6, 6, 7, 47, 60, 45, 43, 7, 42, + 40, 7, 7, 60, 39, 35, 33, 25, 7, 9, + 9, 23, 22, 20, 18, 13, 11, 0, 9, 9, + 10, 10, 0, 0, 0, 0, 0, 0, 0, 10, + + 10, 19, 19, 19, 0, 0, 19, 26, 26, 0, + 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 26, 46, 46, 0, 46, 0, 46, 0, 0, 0, + 0, 0, 0, 0, 46, 63, 63, 63, 63, 63, + 63, 64, 64, 64, 64, 64, 64, 65, 65, 65, + 65, 65, 65, 67, 67, 67, 67, 67, 67, 68, + 68, 68, 69, 69, 0, 69, 69, 69, 70, 70, + 70, 71, 0, 0, 71, 71, 72, 72, 0, 72, + 72, 72, 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, + + 62, 62, 62, 62 + } ; + +/* Table of booleans, true if rule could match eol. 
*/ +static yyconst flex_int32_t yy_rule_can_match_eol[23] = + { 0, +1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, + 1, 1, 0, }; + +static yy_state_type yy_last_accepting_state; +static char *yy_last_accepting_cpos; + +extern int yy_flex_debug; +int yy_flex_debug = 0; + +/* The intent behind this definition is that it'll catch + * any uses of REJECT which flex missed. + */ +#define REJECT reject_used_but_not_detected +#define yymore() yymore_used_but_not_detected +#define YY_MORE_ADJ 0 +#define YY_RESTORE_YY_MORE_OFFSET +char *yytext; +#line 1 "conf_lexer.lxx" +/* +// +// conf_lexer.lxx +// +// This lexical parser is used to parse ht://Dig config +// files. +// +// Note: The resulting .cxx file produces warnings of unused +// labels. As at 2003-06-02, these have been manually +// removed, but they will reappear when (f)lex is re-run. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: conf_lexer.cxx,v 1.11 2004/06/10 14:48:38 angusgb Exp $ +// +*/ +#line 22 "conf_lexer.lxx" +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + + + + +#line 34 "conf_lexer.lxx" +#ifdef HAVE_STRINGS_H +#include <strings.h> +#endif + +#include "HtConfiguration.h" +//#include "Dictionary.h" +#include "conf_parser.h" +#define MAX_INCLUDE_DEPTH 10 +YY_BUFFER_STATE include_stack[MAX_INCLUDE_DEPTH]; +String *name_stack[MAX_INCLUDE_DEPTH]; +int lineno_stack[MAX_INCLUDE_DEPTH]; +int include_stack_ptr = 0; +#line 571 "conf_lexer.cxx" + +#define INITIAL 0 +#define t_right 1 +#define incl 2 +#define bracket 3 +#define br_string 4 + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. We include it way + * down here because we want the user's section 1 to have been scanned first. 
+ * The user has a chance to override it with an option. + */ +#include <unistd.h> +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +/* Macros after this point can all be overridden by user definitions in + * section 1. + */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int yywrap (void ); +#else +extern int yywrap (void ); +#endif +#endif + +#ifndef yytext_ptr +static void yy_flex_strncpy (char *,yyconst char *,int ); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (yyconst char * ); +#endif + +#ifndef YY_NO_INPUT + +#ifdef __cplusplus +static int yyinput (void ); +#else +static int input (void ); +#endif + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#define YY_READ_BUF_SIZE 8192 +#endif + +/* Copy whatever the last rule matched to the standard output. */ +#ifndef ECHO +/* This used to be an fputs(), but since the string might contain NUL's, + * we now use fwrite(). + */ +#define ECHO (void) fwrite( yytext, yyleng, 1, yyout ) +#endif + +/* Gets input and stuffs it into "buf". number of characters read, or YY_NULL, + * is returned in "result". 
+ */ +#ifndef YY_INPUT +#define YY_INPUT(buf,result,max_size) \ + if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \ + { \ + int c = '*'; \ + size_t n; \ + for ( n = 0; n < max_size && \ + (c = getc( yyin )) != EOF && c != '\n'; ++n ) \ + buf[n] = (char) c; \ + if ( c == '\n' ) \ + buf[n++] = (char) c; \ + if ( c == EOF && ferror( yyin ) ) \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + result = n; \ + } \ + else \ + { \ + errno=0; \ + while ( (result = fread(buf, 1, max_size, yyin))==0 && ferror(yyin)) \ + { \ + if( errno != EINTR) \ + { \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + break; \ + } \ + errno=0; \ + clearerr(yyin); \ + } \ + }\ +\ + +#endif + +/* No semi-colon after return; correct usage is to write "yyterminate();" - + * we don't want an extra ';' after the "return" because that will cause + * some compilers to complain about unreachable statements. + */ +#ifndef yyterminate +#define yyterminate() return YY_NULL +#endif + +/* Number of entries by which start-condition stack grows. */ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Report a fatal error. */ +#ifndef YY_FATAL_ERROR +#define YY_FATAL_ERROR(msg) yy_fatal_error( msg ) +#endif + +/* end tables serialization structures and prototypes */ + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. + */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int yylex (void); + +#define YY_DECL int yylex (void) +#endif /* !YY_DECL */ + +/* Code executed at the beginning of each rule, after yytext and yyleng + * have been set up. + */ +#ifndef YY_USER_ACTION +#define YY_USER_ACTION +#endif + +/* Code executed at the end of each rule. */ +#ifndef YY_BREAK +#define YY_BREAK break; +#endif + +#define YY_RULE_SETUP \ + if ( yyleng > 0 ) \ + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = \ + (yytext[yyleng - 1] == '\n'); \ + YY_USER_ACTION + +/** The main scanner function which does all the work. 
+ */ +YY_DECL +{ + register yy_state_type yy_current_state; + register char *yy_cp, *yy_bp; + register int yy_act; + +#line 52 "conf_lexer.lxx" + + +#line 730 "conf_lexer.cxx" + + if ( (yy_init) ) + { + (yy_init) = 0; + +#ifdef YY_USER_INIT + YY_USER_INIT; +#endif + + if ( ! (yy_start) ) + (yy_start) = 1; /* first start state */ + + if ( ! yyin ) + yyin = stdin; + + if ( ! yyout ) + yyout = stdout; + + if ( ! YY_CURRENT_BUFFER ) { + yyensure_buffer_stack (); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer(yyin,YY_BUF_SIZE ); + } + + yy_load_buffer_state( ); + } + + while ( 1 ) /* loops until end-of-file is reached */ + { + yy_cp = (yy_c_buf_p); + + /* Support of yytext. */ + *yy_cp = (yy_hold_char); + + /* yy_bp points to the position in yy_ch_buf of the start of + * the current run. + */ + yy_bp = yy_cp; + + yy_current_state = (yy_start); + yy_current_state += YY_AT_BOL(); +yy_match: + do + { + register YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)]; + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 63 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + ++yy_cp; + } + while ( yy_base[yy_current_state] != 183 ); + +yy_find_action: + yy_act = yy_accept[yy_current_state]; + if ( yy_act == 0 ) + { /* have to back up */ + yy_cp = (yy_last_accepting_cpos); + yy_current_state = (yy_last_accepting_state); + yy_act = yy_accept[yy_current_state]; + } + + YY_DO_BEFORE_ACTION; + + if ( yy_act != YY_END_OF_BUFFER && yy_rule_can_match_eol[yy_act] ) + { + int yyl; + for ( yyl = 0; yyl < yyleng; ++yyl ) + if ( yytext[yyl] == '\n' ) + + yylineno++; +; + } + +do_action: /* This label is used only to access EOF actions. 
*/ + + switch ( yy_act ) + { /* beginning of action switch */ + case 0: /* must back up */ + /* undo the effects of YY_DO_BEFORE_ACTION */ + *yy_cp = (yy_hold_char); + yy_cp = (yy_last_accepting_cpos); + yy_current_state = (yy_last_accepting_state); + goto yy_find_action; + +case 1: +/* rule 1 can match eol */ +YY_RULE_SETUP +#line 54 "conf_lexer.lxx" +/* Ignore comments */ + YY_BREAK +case 2: +/* rule 2 can match eol */ +YY_RULE_SETUP +#line 55 "conf_lexer.lxx" +/* Ignore empty lines */ + YY_BREAK +case 3: +YY_RULE_SETUP +#line 56 "conf_lexer.lxx" +/* Ignore spaces */ + YY_BREAK +case 4: +YY_RULE_SETUP +#line 57 "conf_lexer.lxx" +BEGIN(incl); + YY_BREAK +case 5: +YY_RULE_SETUP +#line 58 "conf_lexer.lxx" +{ BEGIN(bracket); return(T_LEFT_BR); } + YY_BREAK +case 6: +YY_RULE_SETUP +#line 59 "conf_lexer.lxx" +return(T_SLASH); + YY_BREAK +case 7: +YY_RULE_SETUP +#line 60 "conf_lexer.lxx" +return(T_RIGHT_BR); + YY_BREAK +case 8: +YY_RULE_SETUP +#line 61 "conf_lexer.lxx" +{ + //yylval.str = (char *)malloc(yyleng+1); + yylval.str = new char[yyleng+1]; + strcpy(yylval.str,yytext); + return(T_KEYWORD); + } + YY_BREAK +case 9: +YY_RULE_SETUP +#line 67 "conf_lexer.lxx" +{ + BEGIN(bracket); + yylval.str = new char[yyleng+1]; + strcpy(yylval.str,yytext); + return(T_STRING); + } + YY_BREAK +case 10: +YY_RULE_SETUP +#line 74 "conf_lexer.lxx" +{ + //yylval.str = (char*)malloc(yyleng+1); + yylval.str = new char[yyleng+1]; + strcpy(yylval.str,yytext); + return(T_NUMBER); + } + YY_BREAK +case 11: +YY_RULE_SETUP +#line 81 "conf_lexer.lxx" +{ + if (YY_START==bracket) + BEGIN(br_string); + else + BEGIN(t_right); + return(T_DELIMITER); + } + YY_BREAK +case 12: +/* rule 12 can match eol */ +YY_RULE_SETUP +#line 89 "conf_lexer.lxx" +{ + //yylval.str = (char *)malloc(yyleng+1-2); + yylval.str = new char[yyleng+1-2]; + //strcpy(yylval.str,yytext); + memcpy(yylval.str,yytext,yyleng-2); + yylval.str[yyleng-2]='\0'; + return(T_STRING); + } + YY_BREAK +case 13: +YY_RULE_SETUP +#line 98 
"conf_lexer.lxx" +{ + yylval.str = new char[yyleng+1]; + strcpy(yylval.str,yytext); + return(T_STRING); + } + YY_BREAK +case 14: +YY_RULE_SETUP +#line 104 "conf_lexer.lxx" +{ + //yylval.str = (char *)malloc(yyleng+1); + yylval.str = new char[yyleng+1]; + strcpy(yylval.str,yytext); + return(T_STRING); + } + YY_BREAK +case 15: +/* rule 15 can match eol */ +YY_RULE_SETUP +#line 110 "conf_lexer.lxx" + + YY_BREAK + +/* Ignore newline after "\" */ + +case 16: +YY_RULE_SETUP +#line 114 "conf_lexer.lxx" + + YY_BREAK + +/* Ignore spaces */ + +case 17: +YY_RULE_SETUP +#line 118 "conf_lexer.lxx" +{ /* got the include file name */ + if ( include_stack_ptr >= MAX_INCLUDE_DEPTH ) + { + fprintf(stderr,"Includes nested too deeply\n"); + // exit(1); // Seems too harsh! + return(T_NEWLINE); + } + include_stack[include_stack_ptr++] = + YY_CURRENT_BUFFER; + + HtConfiguration* config= HtConfiguration::config(); + // handle ${var} in file name + String ParsedFilename = + config->ParseString(yytext); + + if (ParsedFilename[0] != '/') + { // Given file name not fully qualified + // so strip dir. name from current one + String str; + if (include_stack_ptr > 1) + str = *name_stack[include_stack_ptr-2]; + else // still at top level config + str = config->getFileName(); + int len = str.lastIndexOf('/') + 1; + if (len > 0) + { // Current name has directory path + // component, so use it for new name + str.chop(str.length() - len); + str << ParsedFilename; + ParsedFilename = str; + } + } + + yyin = fopen( ParsedFilename.get(), "r" ); + + if ( ! yyin ) { + fprintf(stderr,"can't find file: %s\n",yytext); + // exit(1); // Seems too harsh! 
+ include_stack_ptr--; + return(T_NEWLINE); + } + name_stack[include_stack_ptr-1] = + new String(ParsedFilename.get()); + lineno_stack[include_stack_ptr-1] = yylineno; + yylineno = 1; + yy_switch_to_buffer(yy_create_buffer(yyin,YY_BUF_SIZE ) ); + + BEGIN(INITIAL); + } + YY_BREAK +case YY_STATE_EOF(INITIAL): +case YY_STATE_EOF(t_right): +case YY_STATE_EOF(incl): +case YY_STATE_EOF(bracket): +case YY_STATE_EOF(br_string): +#line 168 "conf_lexer.lxx" +{ + if ( include_stack_ptr <= 0 ) + { + static int termnext = 0; + // fix to allow unterminated final line + if (++termnext <= 1) + return(T_NEWLINE); + termnext = 0; // in case we're called again + yyterminate(); + } + else + { + delete name_stack[include_stack_ptr-1]; + yylineno = lineno_stack[include_stack_ptr-1]; + yy_delete_buffer(YY_CURRENT_BUFFER ); + yy_switch_to_buffer(include_stack[--include_stack_ptr] ); + } + } + YY_BREAK +case 18: +/* rule 18 can match eol */ +YY_RULE_SETUP +#line 189 "conf_lexer.lxx" +{ + BEGIN(INITIAL); + return(T_NEWLINE); + } + YY_BREAK +case 19: +/* rule 19 can match eol */ +YY_RULE_SETUP +#line 193 "conf_lexer.lxx" + + YY_BREAK +case 20: +/* rule 20 can match eol */ +YY_RULE_SETUP +#line 195 "conf_lexer.lxx" + + YY_BREAK +case 21: +/* rule 21 can match eol */ +YY_RULE_SETUP +#line 196 "conf_lexer.lxx" +{ + HtConfiguration* config= HtConfiguration::config(); + String str; + if (include_stack_ptr > 0) + str = *name_stack[include_stack_ptr-1]; + else // still at top level config + str = config->getFileName(); + fprintf(stderr,"Unknown char in file %s line %d: %s\n",str.get(),yylineno,yytext); + // exit(1); // Seems too harsh! + } + YY_BREAK +case 22: +YY_RULE_SETUP +#line 206 "conf_lexer.lxx" +ECHO; + YY_BREAK +#line 1070 "conf_lexer.cxx" + + case YY_END_OF_BUFFER: + { + /* Amount of text matched not including the EOB char. */ + int yy_amount_of_matched_text = (int) (yy_cp - (yytext_ptr)) - 1; + + /* Undo the effects of YY_DO_BEFORE_ACTION. 
*/ + *yy_cp = (yy_hold_char); + YY_RESTORE_YY_MORE_OFFSET + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW ) + { + /* We're scanning a new file or input source. It's + * possible that this happened because the user + * just pointed yyin at a new source and called + * yylex(). If so, then we have to assure + * consistency between YY_CURRENT_BUFFER and our + * globals. Here is the right place to do so, because + * this is the first action (other than possibly a + * back-up) that will match for the new input source. + */ + (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL; + } + + /* Note that here we test for yy_c_buf_p "<=" to the position + * of the first EOB in the buffer, since yy_c_buf_p will + * already have been incremented past the NUL character + * (since all states make transitions on EOB to the + * end-of-buffer state). Contrast this with the test + * in input(). + */ + if ( (yy_c_buf_p) <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] ) + { /* This was really a NUL. */ + yy_state_type yy_next_state; + + (yy_c_buf_p) = (yytext_ptr) + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( ); + + /* Okay, we're now positioned to make the NUL + * transition. We couldn't have + * yy_get_previous_state() go ahead and do it + * for us because it doesn't know how to deal + * with the possibility of jamming (and we don't + * want to build jamming into it because then it + * will run more slowly). + */ + + yy_next_state = yy_try_NUL_trans( yy_current_state ); + + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + + if ( yy_next_state ) + { + /* Consume the NUL. 
*/ + yy_cp = ++(yy_c_buf_p); + yy_current_state = yy_next_state; + goto yy_match; + } + + else + { + yy_cp = (yy_c_buf_p); + goto yy_find_action; + } + } + + else switch ( yy_get_next_buffer( ) ) + { + case EOB_ACT_END_OF_FILE: + { + (yy_did_buffer_switch_on_eof) = 0; + + if ( yywrap( ) ) + { + /* Note: because we've taken care in + * yy_get_next_buffer() to have set up + * yytext, we can now set up + * yy_c_buf_p so that if some total + * hoser (like flex itself) wants to + * call the scanner after we return the + * YY_NULL, it'll still work - another + * YY_NULL will get returned. + */ + (yy_c_buf_p) = (yytext_ptr) + YY_MORE_ADJ; + + yy_act = YY_STATE_EOF(YY_START); + goto do_action; + } + + else + { + if ( ! (yy_did_buffer_switch_on_eof) ) + YY_NEW_FILE; + } + break; + } + + case EOB_ACT_CONTINUE_SCAN: + (yy_c_buf_p) = + (yytext_ptr) + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( ); + + yy_cp = (yy_c_buf_p); + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + goto yy_match; + + case EOB_ACT_LAST_MATCH: + (yy_c_buf_p) = + &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)]; + + yy_current_state = yy_get_previous_state( ); + + yy_cp = (yy_c_buf_p); + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + goto yy_find_action; + } + break; + } + + default: + YY_FATAL_ERROR( + "fatal flex scanner internal error--no action found" ); + } /* end of action switch */ + } /* end of scanning one token */ +} /* end of yylex */ + +/* yy_get_next_buffer - try to read in a new buffer + * + * Returns a code representing an action: + * EOB_ACT_LAST_MATCH - + * EOB_ACT_CONTINUE_SCAN - continue scanning from current position + * EOB_ACT_END_OF_FILE - end of file + */ +static int yy_get_next_buffer (void) +{ + register char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf; + register char *source = (yytext_ptr); + register int number_to_move, i; + int ret_val; + + if ( (yy_c_buf_p) > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] ) + YY_FATAL_ERROR( + "fatal flex scanner internal 
error--end of buffer missed" ); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 ) + { /* Don't try to fill the buffer, so this is an EOF. */ + if ( (yy_c_buf_p) - (yytext_ptr) - YY_MORE_ADJ == 1 ) + { + /* We matched a single character, the EOB, so + * treat this as a final EOF. + */ + return EOB_ACT_END_OF_FILE; + } + + else + { + /* We matched some text prior to the EOB, first + * process it. + */ + return EOB_ACT_LAST_MATCH; + } + } + + /* Try to read more data. */ + + /* First move last chars to start of buffer. */ + number_to_move = (int) ((yy_c_buf_p) - (yytext_ptr)) - 1; + + for ( i = 0; i < number_to_move; ++i ) + *(dest++) = *(source++); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING ) + /* don't do the read, it's not guaranteed to return an EOF, + * just force an EOF + */ + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars) = 0; + + else + { + size_t num_to_read = + YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1; + + while ( num_to_read <= 0 ) + { /* Not enough room in the buffer - grow it. */ + + /* just a shorter name for the current buffer */ + YY_BUFFER_STATE b = YY_CURRENT_BUFFER; + + int yy_c_buf_p_offset = + (int) ((yy_c_buf_p) - b->yy_ch_buf); + + if ( b->yy_is_our_buffer ) + { + int new_size = b->yy_buf_size * 2; + + if ( new_size <= 0 ) + b->yy_buf_size += b->yy_buf_size / 8; + else + b->yy_buf_size *= 2; + + b->yy_ch_buf = (char *) + /* Include room in for 2 EOB chars. */ + yyrealloc((void *) b->yy_ch_buf,b->yy_buf_size + 2 ); + } + else + /* Can't grow it, we don't own it. */ + b->yy_ch_buf = 0; + + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( + "fatal error - scanner input buffer overflow" ); + + (yy_c_buf_p) = &b->yy_ch_buf[yy_c_buf_p_offset]; + + num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size - + number_to_move - 1; + + } + + if ( num_to_read > YY_READ_BUF_SIZE ) + num_to_read = YY_READ_BUF_SIZE; + + /* Read in more data. 
*/ + YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]), + (yy_n_chars), num_to_read ); + + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + if ( (yy_n_chars) == 0 ) + { + if ( number_to_move == YY_MORE_ADJ ) + { + ret_val = EOB_ACT_END_OF_FILE; + yyrestart(yyin ); + } + + else + { + ret_val = EOB_ACT_LAST_MATCH; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = + YY_BUFFER_EOF_PENDING; + } + } + + else + ret_val = EOB_ACT_CONTINUE_SCAN; + + (yy_n_chars) += number_to_move; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] = YY_END_OF_BUFFER_CHAR; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] = YY_END_OF_BUFFER_CHAR; + + (yytext_ptr) = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0]; + + return ret_val; +} + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + + static yy_state_type yy_get_previous_state (void) +{ + register yy_state_type yy_current_state; + register char *yy_cp; + + yy_current_state = (yy_start); + yy_current_state += YY_AT_BOL(); + + for ( yy_cp = (yytext_ptr) + YY_MORE_ADJ; yy_cp < (yy_c_buf_p); ++yy_cp ) + { + register YY_CHAR yy_c = (*yy_cp ? 
yy_ec[YY_SC_TO_UI(*yy_cp)] : 1); + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 63 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + } + + return yy_current_state; +} + +/* yy_try_NUL_trans - try to make a transition on the NUL character + * + * synopsis + * next_state = yy_try_NUL_trans( current_state ); + */ + static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state ) +{ + register int yy_is_jam; + register char *yy_cp = (yy_c_buf_p); + + register YY_CHAR yy_c = 1; + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 63 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + yy_is_jam = (yy_current_state == 62); + + return yy_is_jam ? 0 : yy_current_state; +} + +#ifndef YY_NO_INPUT +#ifdef __cplusplus + static int yyinput (void) +#else + static int input (void) +#endif + +{ + int c; + + *(yy_c_buf_p) = (yy_hold_char); + + if ( *(yy_c_buf_p) == YY_END_OF_BUFFER_CHAR ) + { + /* yy_c_buf_p now points to the character we want to return. + * If this occurs *before* the EOB characters, then it's a + * valid NUL; if not, then we've hit the end of the buffer. + */ + if ( (yy_c_buf_p) < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] ) + /* This was really a NUL. 
*/ + *(yy_c_buf_p) = '\0'; + + else + { /* need more input */ + int offset = (yy_c_buf_p) - (yytext_ptr); + ++(yy_c_buf_p); + + switch ( yy_get_next_buffer( ) ) + { + case EOB_ACT_LAST_MATCH: + /* This happens because yy_g_n_b() + * sees that we've accumulated a + * token and flags that we need to + * try matching the token before + * proceeding. But for input(), + * there's no matching to consider. + * So convert the EOB_ACT_LAST_MATCH + * to EOB_ACT_END_OF_FILE. + */ + + /* Reset buffer status. */ + yyrestart(yyin ); + + /*FALLTHROUGH*/ + + case EOB_ACT_END_OF_FILE: + { + if ( yywrap( ) ) + return EOF; + + if ( ! (yy_did_buffer_switch_on_eof) ) + YY_NEW_FILE; +#ifdef __cplusplus + return yyinput(); +#else + return input(); +#endif + } + + case EOB_ACT_CONTINUE_SCAN: + (yy_c_buf_p) = (yytext_ptr) + offset; + break; + } + } + } + + c = *(unsigned char *) (yy_c_buf_p); /* cast for 8-bit char's */ + *(yy_c_buf_p) = '\0'; /* preserve yytext */ + (yy_hold_char) = *++(yy_c_buf_p); + + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = (c == '\n'); + if ( YY_CURRENT_BUFFER_LVALUE->yy_at_bol ) + + yylineno++; +; + + return c; +} +#endif /* ifndef YY_NO_INPUT */ + +/** Immediately switch to a different input stream. + * @param input_file A readable stream. + * + * @note This function does not reset the start condition to @c INITIAL . + */ + void yyrestart (FILE * input_file ) +{ + + if ( ! YY_CURRENT_BUFFER ){ + yyensure_buffer_stack (); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer(yyin,YY_BUF_SIZE ); + } + + yy_init_buffer(YY_CURRENT_BUFFER,input_file ); + yy_load_buffer_state( ); +} + +/** Switch to a different input buffer. + * @param new_buffer The new input buffer. + * + */ + void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ) +{ + + /* TODO. 
We should be able to replace this entire function body + * with + * yypop_buffer_state(); + * yypush_buffer_state(new_buffer); + */ + yyensure_buffer_stack (); + if ( YY_CURRENT_BUFFER == new_buffer ) + return; + + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *(yy_c_buf_p) = (yy_hold_char); + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + YY_CURRENT_BUFFER_LVALUE = new_buffer; + yy_load_buffer_state( ); + + /* We don't actually know whether we did this switch during + * EOF (yywrap()) processing, but the only time this flag + * is looked at is after yywrap() is called, so it's safe + * to go ahead and always set it. + */ + (yy_did_buffer_switch_on_eof) = 1; +} + +static void yy_load_buffer_state (void) +{ + (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + (yytext_ptr) = (yy_c_buf_p) = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos; + yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file; + (yy_hold_char) = *(yy_c_buf_p); +} + +/** Allocate and initialize an input buffer state. + * @param file A readable stream. + * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE. + * + * @return the allocated buffer state. + */ + YY_BUFFER_STATE yy_create_buffer (FILE * file, int size ) +{ + YY_BUFFER_STATE b; + + b = (YY_BUFFER_STATE) yyalloc(sizeof( struct yy_buffer_state ) ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_buf_size = size; + + /* yy_ch_buf has to be 2 characters longer than the size given because + * we need to put in 2 end-of-buffer characters. + */ + b->yy_ch_buf = (char *) yyalloc(b->yy_buf_size + 2 ); + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_is_our_buffer = 1; + + yy_init_buffer(b,file ); + + return b; +} + +/** Destroy the buffer. 
+ * @param b a buffer created with yy_create_buffer() + * + */ + void yy_delete_buffer (YY_BUFFER_STATE b ) +{ + + if ( ! b ) + return; + + if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */ + YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0; + + if ( b->yy_is_our_buffer ) + yyfree((void *) b->yy_ch_buf ); + + yyfree((void *) b ); +} + +#ifndef __cplusplus +extern int isatty (int ); +#endif /* __cplusplus */ + +/* Initializes or reinitializes a buffer. + * This function is sometimes called more than once on the same buffer, + * such as during a yyrestart() or at EOF. + */ + static void yy_init_buffer (YY_BUFFER_STATE b, FILE * file ) + +{ + int oerrno = errno; + + yy_flush_buffer(b ); + + b->yy_input_file = file; + b->yy_fill_buffer = 1; + + /* If b is the current buffer, then yy_init_buffer was _probably_ + * called from yyrestart() or through yy_get_next_buffer. + * In that case, we don't want to reset the lineno or column. + */ + if (b != YY_CURRENT_BUFFER){ + b->yy_bs_lineno = 1; + b->yy_bs_column = 0; + } + + b->yy_is_interactive = file ? (isatty( fileno(file) ) > 0) : 0; + + errno = oerrno; +} + +/** Discard all buffered characters. On the next scan, YY_INPUT will be called. + * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER. + * + */ + void yy_flush_buffer (YY_BUFFER_STATE b ) +{ + if ( ! b ) + return; + + b->yy_n_chars = 0; + + /* We always need two end-of-buffer characters. The first causes + * a transition to the end-of-buffer state. The second causes + * a jam in that state. + */ + b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR; + b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR; + + b->yy_buf_pos = &b->yy_ch_buf[0]; + + b->yy_at_bol = 1; + b->yy_buffer_status = YY_BUFFER_NEW; + + if ( b == YY_CURRENT_BUFFER ) + yy_load_buffer_state( ); +} + +/** Pushes the new state onto the stack. The new state becomes + * the current state. This function will allocate the stack + * if necessary. + * @param new_buffer The new state. 
+ * + */ +void yypush_buffer_state (YY_BUFFER_STATE new_buffer ) +{ + if (new_buffer == NULL) + return; + + yyensure_buffer_stack(); + + /* This block is copied from yy_switch_to_buffer. */ + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *(yy_c_buf_p) = (yy_hold_char); + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + /* Only push if top exists. Otherwise, replace top. */ + if (YY_CURRENT_BUFFER) + (yy_buffer_stack_top)++; + YY_CURRENT_BUFFER_LVALUE = new_buffer; + + /* copied from yy_switch_to_buffer. */ + yy_load_buffer_state( ); + (yy_did_buffer_switch_on_eof) = 1; +} + +/** Removes and deletes the top of the stack, if present. + * The next element becomes the new top. + * + */ +void yypop_buffer_state (void) +{ + if (!YY_CURRENT_BUFFER) + return; + + yy_delete_buffer(YY_CURRENT_BUFFER ); + YY_CURRENT_BUFFER_LVALUE = NULL; + if ((yy_buffer_stack_top) > 0) + --(yy_buffer_stack_top); + + if (YY_CURRENT_BUFFER) { + yy_load_buffer_state( ); + (yy_did_buffer_switch_on_eof) = 1; + } +} + +/* Allocates the stack if it does not exist. + * Guarantees space for at least one push. + */ +static void yyensure_buffer_stack (void) +{ + int num_to_alloc; + + if (!(yy_buffer_stack)) { + + /* First allocation is just for 2 elements, since we don't know if this + * scanner will even need a stack. We use 2 instead of 1 to avoid an + * immediate realloc on the next call. + */ + num_to_alloc = 1; + (yy_buffer_stack) = (struct yy_buffer_state**)yyalloc + (num_to_alloc * sizeof(struct yy_buffer_state*) + ); + + memset((yy_buffer_stack), 0, num_to_alloc * sizeof(struct yy_buffer_state*)); + + (yy_buffer_stack_max) = num_to_alloc; + (yy_buffer_stack_top) = 0; + return; + } + + if ((yy_buffer_stack_top) >= ((yy_buffer_stack_max)) - 1){ + + /* Increase the buffer to prepare for a possible push. 
*/ + int grow_size = 8 /* arbitrary grow size */; + + num_to_alloc = (yy_buffer_stack_max) + grow_size; + (yy_buffer_stack) = (struct yy_buffer_state**)yyrealloc + ((yy_buffer_stack), + num_to_alloc * sizeof(struct yy_buffer_state*) + ); + + /* zero only the new slots.*/ + memset((yy_buffer_stack) + (yy_buffer_stack_max), 0, grow_size * sizeof(struct yy_buffer_state*)); + (yy_buffer_stack_max) = num_to_alloc; + } +} + +/** Setup the input buffer state to scan directly from a user-specified character buffer. + * @param base the character buffer + * @param size the size in bytes of the character buffer + * + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE yy_scan_buffer (char * base, yy_size_t size ) +{ + YY_BUFFER_STATE b; + + if ( size < 2 || + base[size-2] != YY_END_OF_BUFFER_CHAR || + base[size-1] != YY_END_OF_BUFFER_CHAR ) + /* They forgot to leave room for the EOB's. */ + return 0; + + b = (YY_BUFFER_STATE) yyalloc(sizeof( struct yy_buffer_state ) ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in yy_scan_buffer()" ); + + b->yy_buf_size = size - 2; /* "- 2" to take care of EOB's */ + b->yy_buf_pos = b->yy_ch_buf = base; + b->yy_is_our_buffer = 0; + b->yy_input_file = 0; + b->yy_n_chars = b->yy_buf_size; + b->yy_is_interactive = 0; + b->yy_at_bol = 1; + b->yy_fill_buffer = 0; + b->yy_buffer_status = YY_BUFFER_NEW; + + yy_switch_to_buffer(b ); + + return b; +} + +/** Setup the input buffer state to scan a string. The next call to yylex() will + * scan from a @e copy of @a str. + * @param str a NUL-terminated string to scan + * + * @return the newly allocated buffer state object. + * @note If you want to scan bytes that may contain NUL values, then use + * yy_scan_bytes() instead. + */ +YY_BUFFER_STATE yy_scan_string (yyconst char * yy_str ) +{ + + return yy_scan_bytes(yy_str,strlen(yy_str) ); +} + +/** Setup the input buffer state to scan the given bytes. The next call to yylex() will + * scan from a @e copy of @a bytes. 
+ * @param bytes the byte buffer to scan + * @param len the number of bytes in the buffer pointed to by @a bytes. + * + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE yy_scan_bytes (yyconst char * bytes, int len ) +{ + YY_BUFFER_STATE b; + char *buf; + yy_size_t n; + int i; + + /* Get memory for full buffer, including space for trailing EOB's. */ + n = len + 2; + buf = (char *) yyalloc(n ); + if ( ! buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_scan_bytes()" ); + + for ( i = 0; i < len; ++i ) + buf[i] = bytes[i]; + + buf[len] = buf[len+1] = YY_END_OF_BUFFER_CHAR; + + b = yy_scan_buffer(buf,n ); + if ( ! b ) + YY_FATAL_ERROR( "bad buffer in yy_scan_bytes()" ); + + /* It's okay to grow etc. this buffer, and we should throw it + * away when we're done. + */ + b->yy_is_our_buffer = 1; + + return b; +} + +#ifndef YY_EXIT_FAILURE +#define YY_EXIT_FAILURE 2 +#endif + +static void yy_fatal_error (yyconst char* msg ) +{ + (void) fprintf( stderr, "%s\n", msg ); + exit( YY_EXIT_FAILURE ); +} + +/* Redefine yyless() so it works in section 3 code. */ + +#undef yyless +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + yytext[yyleng] = (yy_hold_char); \ + (yy_c_buf_p) = yytext + yyless_macro_arg; \ + (yy_hold_char) = *(yy_c_buf_p); \ + *(yy_c_buf_p) = '\0'; \ + yyleng = yyless_macro_arg; \ + } \ + while ( 0 ) + +/* Accessor methods (get/set functions) to struct members. */ + +/** Get the current line number. + * + */ +int yyget_lineno (void) +{ + + return yylineno; +} + +/** Get the input stream. + * + */ +FILE *yyget_in (void) +{ + return yyin; +} + +/** Get the output stream. + * + */ +FILE *yyget_out (void) +{ + return yyout; +} + +/** Get the length of the current token. + * + */ +int yyget_leng (void) +{ + return yyleng; +} + +/** Get the current token. 
+ * + */ + +char *yyget_text (void) +{ + return yytext; +} + +/** Set the current line number. + * @param line_number + * + */ +void yyset_lineno (int line_number ) +{ + + yylineno = line_number; +} + +/** Set the input stream. This does not discard the current + * input buffer. + * @param in_str A readable stream. + * + * @see yy_switch_to_buffer + */ +void yyset_in (FILE * in_str ) +{ + yyin = in_str ; +} + +void yyset_out (FILE * out_str ) +{ + yyout = out_str ; +} + +int yyget_debug (void) +{ + return yy_flex_debug; +} + +void yyset_debug (int bdebug ) +{ + yy_flex_debug = bdebug ; +} + +/* yylex_destroy is for both reentrant and non-reentrant scanners. */ +int yylex_destroy (void) +{ + + /* Pop the buffer stack, destroying each element. */ + while(YY_CURRENT_BUFFER){ + yy_delete_buffer(YY_CURRENT_BUFFER ); + YY_CURRENT_BUFFER_LVALUE = NULL; + yypop_buffer_state(); + } + + /* Destroy the stack itself. */ + yyfree((yy_buffer_stack) ); + (yy_buffer_stack) = NULL; + + return 0; +} + +/* + * Internal utility routines. + */ + +#ifndef yytext_ptr +static void yy_flex_strncpy (char* s1, yyconst char * s2, int n ) +{ + register int i; + for ( i = 0; i < n; ++i ) + s1[i] = s2[i]; +} +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (yyconst char * s ) +{ + register int n; + for ( n = 0; s[n]; ++n ) + ; + + return n; +} +#endif + +void *yyalloc (yy_size_t size ) +{ + return (void *) malloc( size ); +} + +void *yyrealloc (void * ptr, yy_size_t size ) +{ + /* The cast to (char *) in the following accommodates both + * implementations that use char* generic pointers, and those + * that use void* generic pointers. It works with the latter + * because both ANSI C and C++ allow castless assignment from + * any pointer type to void*, and deal with argument conversions + * as though doing an assignment. 
+ */ + return (void *) realloc( (char *) ptr, size ); +} + +void yyfree (void * ptr ) +{ + free( (char *) ptr ); /* see yyrealloc() for (char *) cast */ +} + +#define YYTABLES_NAME "yytables" + +#undef YY_NEW_FILE +#undef YY_FLUSH_BUFFER +#undef yy_set_bol +#undef yy_new_buffer +#undef yy_set_interactive +#undef yytext_ptr +#undef YY_DO_BEFORE_ACTION + +#ifdef YY_DECL_IS_OURS +#undef YY_DECL_IS_OURS +#undef YY_DECL +#endif +#line 206 "conf_lexer.lxx" + + + diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/conf_lexer.lxx b/debian/htdig/htdig-3.2.0b6/htcommon/conf_lexer.lxx new file mode 100644 index 00000000..d17f7e50 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/conf_lexer.lxx @@ -0,0 +1,206 @@ +/* +// +// conf_lexer.lxx +// +// This lexical parser is used to parse ht://Dig config +// files. +// +// Note: The resulting .cxx file produces warnings of unused +// labels. As at 2003-06-02, these have been manually +// removed, but they will reappear when (f)lex is re-run. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: conf_lexer.lxx,v 1.12 2004/06/10 14:48:38 angusgb Exp $ +// +*/ +%{ +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ +%} + +%option yylineno noyywrap nounput +%x t_right +%x incl +%x bracket +%x br_string + +%{ +#ifdef HAVE_STRINGS_H +#include <strings.h> +#endif + +#include "HtConfiguration.h" +//#include "Dictionary.h" +#include "conf_parser.h" +#define MAX_INCLUDE_DEPTH 10 +YY_BUFFER_STATE include_stack[MAX_INCLUDE_DEPTH]; +String *name_stack[MAX_INCLUDE_DEPTH]; +int lineno_stack[MAX_INCLUDE_DEPTH]; +int include_stack_ptr = 0; +%} + +KEYWORD [a-zA-Z_][a-zA-Z_0-9/]* +NUMBER [0-9]+ +STRING [\x21-\xff]+ +BR_STRING [^ \n\t<>]+ +%% + +^[[:space:]]*#.*\n /* Ignore 
comments */ +^[[:space:]]*\n /* Ignore empty lines */ +<*>[ \t]+ /* Ignore spaces */ +include[ \t]*: BEGIN(incl); +"<" { BEGIN(bracket); return(T_LEFT_BR); } +<bracket>"/" return(T_SLASH); +<bracket>">" return(T_RIGHT_BR); +<INITIAL,bracket>{KEYWORD} { + //yylval.str = (char *)malloc(yyleng+1); + yylval.str = new char[yyleng+1]; + strcpy(yylval.str,yytext); + return(T_KEYWORD); + } +<br_string>{BR_STRING} { + BEGIN(bracket); + yylval.str = new char[yyleng+1]; + strcpy(yylval.str,yytext); + return(T_STRING); + } + +<t_right>{NUMBER} { + //yylval.str = (char*)malloc(yyleng+1); + yylval.str = new char[yyleng+1]; + strcpy(yylval.str,yytext); + return(T_NUMBER); + } + +<INITIAL,bracket>: { + if (YY_START==bracket) + BEGIN(br_string); + else + BEGIN(t_right); + return(T_DELIMITER); + } + +<t_right>{STRING}\\\n { + //yylval.str = (char *)malloc(yyleng+1-2); + yylval.str = new char[yyleng+1-2]; + //strcpy(yylval.str,yytext); + memcpy(yylval.str,yytext,yyleng-2); + yylval.str[yyleng-2]='\0'; + return(T_STRING); + } + +<t_right>"\""[^\n]+"\"" { + yylval.str = new char[yyleng+1]; + strcpy(yylval.str,yytext); + return(T_STRING); + } + +<t_right>{STRING} { + //yylval.str = (char *)malloc(yyleng+1); + yylval.str = new char[yyleng+1]; + strcpy(yylval.str,yytext); + return(T_STRING); + } +<t_right>\\\n + { + /* Ignore newline after "\" */ + } +<t_right>[ \t]* + { + /* Ignore spaces */ + } +<incl>{STRING} { /* got the include file name */ + if ( include_stack_ptr >= MAX_INCLUDE_DEPTH ) + { + fprintf(stderr,"Includes nested too deeply\n"); + // exit(1); // Seems too harsh! + return(T_NEWLINE); + } + include_stack[include_stack_ptr++] = + YY_CURRENT_BUFFER; + + HtConfiguration* config= HtConfiguration::config(); + // handle ${var} in file name + String ParsedFilename = + config->ParseString(yytext); + + if (ParsedFilename[0] != '/') + { // Given file name not fully qualified + // so strip dir. 
name from current one + String str; + if (include_stack_ptr > 1) + str = *name_stack[include_stack_ptr-2]; + else // still at top level config + str = config->getFileName(); + int len = str.lastIndexOf('/') + 1; + if (len > 0) + { // Current name has directory path + // component, so use it for new name + str.chop(str.length() - len); + str << ParsedFilename; + ParsedFilename = str; + } + } + + yyin = fopen( ParsedFilename.get(), "r" ); + + if ( ! yyin ) { + fprintf(stderr,"can't find file: %s\n",yytext); + // exit(1); // Seems too harsh! + include_stack_ptr--; + return(T_NEWLINE); + } + name_stack[include_stack_ptr-1] = + new String(ParsedFilename.get()); + lineno_stack[include_stack_ptr-1] = yylineno; + yylineno = 1; + yy_switch_to_buffer( yy_create_buffer( yyin, YY_BUF_SIZE ) ); + + BEGIN(INITIAL); + } + +<<EOF>> { + if ( include_stack_ptr <= 0 ) + { + static int termnext = 0; + // fix to allow unterminated final line + if (++termnext <= 1) + return(T_NEWLINE); + termnext = 0; // in case we're called again + yyterminate(); + } + else + { + delete name_stack[include_stack_ptr-1]; + yylineno = lineno_stack[include_stack_ptr-1]; + yy_delete_buffer( YY_CURRENT_BUFFER ); + yy_switch_to_buffer( + include_stack[--include_stack_ptr] ); + } + } + + +<bracket,t_right>\n { + BEGIN(INITIAL); + return(T_NEWLINE); + } +^[[:space:]]+ + +\n +<*>.|\n { + HtConfiguration* config= HtConfiguration::config(); + String str; + if (include_stack_ptr > 0) + str = *name_stack[include_stack_ptr-1]; + else // still at top level config + str = config->getFileName(); + fprintf(stderr,"Unknown char in file %s line %d: %s\n",str.get(),yylineno,yytext); + // exit(1); // Seems too harsh! + } +%% diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/conf_parser.cxx b/debian/htdig/htdig-3.2.0b6/htcommon/conf_parser.cxx new file mode 100644 index 00000000..13b77ef8 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/conf_parser.cxx @@ -0,0 +1,1553 @@ +/* A Bison parser, made by GNU Bison 1.875c. 
*/ + +/* Skeleton parser for Yacc-like parsing with Bison, + Copyright (C) 1984, 1989, 1990, 2000, 2001, 2002, 2003 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +/* As a special exception, when this file is copied by Bison into a + Bison output file, you may use that output file without restriction. + This special exception was added by the Free Software Foundation + in version 1.24 of Bison. */ + +/* Written by Richard Stallman by simplifying the original so called + ``semantic'' parser. */ + +/* All symbols defined below should begin with yy or YY, to avoid + infringing on user name space. This should be done even for local + variables, as they might otherwise be expanded by user macros. + There are some unavoidable exceptions within include files to + define necessary library symbols; they are noted "INFRINGES ON + USER NAME SPACE" below. */ + +/* Identify Bison output. */ +#define YYBISON 1 + +/* Skeleton name. */ +#define YYSKELETON_NAME "yacc.c" + +/* Pure parsers. */ +#define YYPURE 0 + +/* Using locations. */ +#define YYLSP_NEEDED 0 + + + +/* Tokens. */ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + /* Put the tokens into the symbol table, so that GDB and other debuggers + know about them. 
*/ + enum yytokentype { + NUM = 258, + T_DELIMITER = 259, + T_NEWLINE = 260, + T_RIGHT_BR = 261, + T_LEFT_BR = 262, + T_SLASH = 263, + T_STRING = 264, + T_KEYWORD = 265, + T_NUMBER = 266 + }; +#endif +#define NUM 258 +#define T_DELIMITER 259 +#define T_NEWLINE 260 +#define T_RIGHT_BR 261 +#define T_LEFT_BR 262 +#define T_SLASH 263 +#define T_STRING 264 +#define T_KEYWORD 265 +#define T_NUMBER 266 + + + + +/* Copy the first part of user declarations. */ +#line 1 "conf_parser.yxx" + +// +// conf_parser.yxx +// +// This syntax analyzer is used to parse ht://Dig config +// files. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: conf_parser.cxx,v 1.7 2004/06/10 14:48:38 angusgb Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +/* Bison version > 1.25 needed */ +/* TODO: +1. Better error handling +2. ? +*/ +#include <stdio.h> /* for debug */ +#include <stdlib.h> + +#ifdef HAVE_STD +#include <iostream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <iostream.h> +#endif /* HAVE_STD */ + +#include "HtConfiguration.h" +#include "htString.h" +/*#define YYDEBUG 1*/ +#define YYPARSE_PARAM aConf +int yyerror(char *s); +int yylex(void); +#undef DEBUG +#ifdef DEBUG +int sn_debug=3; +#endif + + +/* Enabling traces. */ +#ifndef YYDEBUG +# define YYDEBUG 0 +#endif + +/* Enabling verbose error messages. */ +#ifdef YYERROR_VERBOSE +# undef YYERROR_VERBOSE +# define YYERROR_VERBOSE 1 +#else +# define YYERROR_VERBOSE 0 +#endif + +#if ! defined (YYSTYPE) && ! defined (YYSTYPE_IS_DECLARED) +#line 50 "conf_parser.yxx" +typedef union YYSTYPE { + char *str; + ConfigDefaults *ConfLine; + HtConfiguration *ConfLines; +} YYSTYPE; +/* Line 191 of yacc.c. 
*/ +#line 153 "conf_parser.cxx" +# define yystype YYSTYPE /* obsolescent; will be withdrawn */ +# define YYSTYPE_IS_DECLARED 1 +# define YYSTYPE_IS_TRIVIAL 1 +#endif + + + +/* Copy the second part of user declarations. */ + + +/* Line 214 of yacc.c. */ +#line 165 "conf_parser.cxx" + +#if ! defined (yyoverflow) || YYERROR_VERBOSE + +# ifndef YYFREE +# define YYFREE free +# endif +# ifndef YYMALLOC +# define YYMALLOC malloc +# endif + +/* The parser invokes alloca or malloc; define the necessary symbols. */ + +# ifdef YYSTACK_USE_ALLOCA +# if YYSTACK_USE_ALLOCA +# define YYSTACK_ALLOC alloca +# endif +# else +# if defined (alloca) || defined (_ALLOCA_H) +# define YYSTACK_ALLOC alloca +# else +# ifdef __GNUC__ +# define YYSTACK_ALLOC __builtin_alloca +# endif +# endif +# endif + +# ifdef YYSTACK_ALLOC + /* Pacify GCC's `empty if-body' warning. */ +# define YYSTACK_FREE(Ptr) do { /* empty */; } while (0) +# else +# if defined (__STDC__) || defined (__cplusplus) +# include <stdlib.h> /* INFRINGES ON USER NAME SPACE */ +# define YYSIZE_T size_t +# endif +# define YYSTACK_ALLOC YYMALLOC +# define YYSTACK_FREE YYFREE +# endif +#endif /* ! defined (yyoverflow) || YYERROR_VERBOSE */ + + +#if (! defined (yyoverflow) \ + && (! defined (__cplusplus) \ + || (defined (YYSTYPE_IS_TRIVIAL) && YYSTYPE_IS_TRIVIAL))) + +/* A type that is properly aligned for any stack member. */ +union yyalloc +{ + short yyss; + YYSTYPE yyvs; + }; + +/* The size of the maximum gap between one aligned stack and the next. */ +# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1) + +/* The size of an array large to enough to hold all stacks, each with + N elements. */ +# define YYSTACK_BYTES(N) \ + ((N) * (sizeof (short) + sizeof (YYSTYPE)) \ + + YYSTACK_GAP_MAXIMUM) + +/* Copy COUNT objects from FROM to TO. The source and destination do + not overlap. 
*/ +# ifndef YYCOPY +# if defined (__GNUC__) && 1 < __GNUC__ +# define YYCOPY(To, From, Count) \ + __builtin_memcpy (To, From, (Count) * sizeof (*(From))) +# else +# define YYCOPY(To, From, Count) \ + do \ + { \ + register YYSIZE_T yyi; \ + for (yyi = 0; yyi < (Count); yyi++) \ + (To)[yyi] = (From)[yyi]; \ + } \ + while (0) +# endif +# endif + +/* Relocate STACK from its old location to the new one. The + local variables YYSIZE and YYSTACKSIZE give the old and new number of + elements in the stack, and YYPTR gives the new location of the + stack. Advance YYPTR to a properly aligned location for the next + stack. */ +# define YYSTACK_RELOCATE(Stack) \ + do \ + { \ + YYSIZE_T yynewbytes; \ + YYCOPY (&yyptr->Stack, Stack, yysize); \ + Stack = &yyptr->Stack; \ + yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \ + yyptr += yynewbytes / sizeof (*yyptr); \ + } \ + while (0) + +#endif + +#if defined (__STDC__) || defined (__cplusplus) + typedef signed char yysigned_char; +#else + typedef short yysigned_char; +#endif + +/* YYFINAL -- State number of the termination state. */ +#define YYFINAL 2 +/* YYLAST -- Last index in YYTABLE. */ +#define YYLAST 31 + +/* YYNTOKENS -- Number of terminals. */ +#define YYNTOKENS 12 +/* YYNNTS -- Number of nonterminals. */ +#define YYNNTS 7 +/* YYNRULES -- Number of rules. */ +#define YYNRULES 20 +/* YYNRULES -- Number of states. */ +#define YYNSTATES 37 + +/* YYTRANSLATE(YYLEX) -- Bison symbol number corresponding to YYLEX. */ +#define YYUNDEFTOK 2 +#define YYMAXUTOK 266 + +#define YYTRANSLATE(YYX) \ + ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK) + +/* YYTRANSLATE[YYLEX] -- Bison symbol number corresponding to YYLEX. 
*/ +static const unsigned char yytranslate[] = +{ + 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11 +}; + +#if YYDEBUG +/* YYPRHS[YYN] -- Index of the first RHS symbol of rule number YYN in + YYRHS. */ +static const unsigned char yyprhs[] = +{ + 0, 0, 3, 4, 7, 9, 11, 13, 18, 23, + 28, 32, 45, 47, 50, 52, 55, 58, 61, 64, + 67 +}; + +/* YYRHS -- A `-1'-separated list of the rules' RHS. */ +static const yysigned_char yyrhs[] = +{ + 13, 0, -1, -1, 13, 14, -1, 15, -1, 16, + -1, 5, -1, 10, 4, 9, 5, -1, 10, 4, + 11, 5, -1, 10, 4, 18, 5, -1, 10, 4, + 5, -1, 7, 10, 4, 9, 6, 5, 17, 7, + 8, 10, 6, 5, -1, 15, -1, 17, 15, -1, + 5, -1, 9, 9, -1, 11, 9, -1, 9, 11, + -1, 11, 11, -1, 18, 9, -1, 18, 11, -1 +}; + +/* YYRLINE[YYN] -- source line where rule number YYN was defined. */ +static const unsigned char yyrline[] = +{ + 0, 65, 65, 66, 69, 81, 87, 90, 100, 106, + 112, 122, 143, 161, 173, 176, 189, 202, 215, 228, + 240 +}; +#endif + +#if YYDEBUG || YYERROR_VERBOSE +/* YYTNME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM. + First, the terminals, then, starting at YYNTOKENS, nonterminals. 
*/ +static const char *const yytname[] = +{ + "$end", "error", "$undefined", "NUM", "T_DELIMITER", "T_NEWLINE", + "T_RIGHT_BR", "T_LEFT_BR", "T_SLASH", "T_STRING", "T_KEYWORD", + "T_NUMBER", "$accept", "input", "block", "simple_expression", + "complex_expression", "simple_expression_list", "list", 0 +}; +#endif + +# ifdef YYPRINT +/* YYTOKNUM[YYLEX-NUM] -- Internal token number corresponding to + token YYLEX-NUM. */ +static const unsigned short yytoknum[] = +{ + 0, 256, 257, 258, 259, 260, 261, 262, 263, 264, + 265, 266 +}; +# endif + +/* YYR1[YYN] -- Symbol number of symbol that rule YYN derives. */ +static const unsigned char yyr1[] = +{ + 0, 12, 13, 13, 14, 14, 14, 15, 15, 15, + 15, 16, 17, 17, 17, 18, 18, 18, 18, 18, + 18 +}; + +/* YYR2[YYN] -- Number of symbols composing right hand side of rule YYN. */ +static const unsigned char yyr2[] = +{ + 0, 2, 0, 2, 1, 1, 1, 4, 4, 4, + 3, 12, 1, 2, 1, 2, 2, 2, 2, 2, + 2 +}; + +/* YYDEFACT[STATE-NAME] -- Default rule to reduce with in state + STATE-NUM when YYTABLE doesn't specify something else to do. Zero + means the default is an error. */ +static const unsigned char yydefact[] = +{ + 2, 0, 1, 6, 0, 0, 3, 4, 5, 0, + 0, 0, 10, 0, 0, 0, 0, 7, 15, 17, + 8, 16, 18, 9, 19, 20, 0, 0, 14, 12, + 0, 0, 13, 0, 0, 0, 11 +}; + +/* YYDEFGOTO[NTERM-NUM]. */ +static const yysigned_char yydefgoto[] = +{ + -1, 1, 6, 7, 8, 30, 15 +}; + +/* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing + STATE-NUM. */ +#define YYPACT_NINF -8 +static const yysigned_char yypact[] = +{ + -8, 0, -8, -8, -7, 16, -8, -8, -8, 20, + -3, 2, -8, 4, 7, 12, 21, -8, -8, -8, + -8, -8, -8, -8, -8, -8, 23, 9, -8, -8, + -6, 18, -8, 19, 24, 26, -8 +}; + +/* YYPGOTO[NTERM-NUM]. */ +static const yysigned_char yypgoto[] = +{ + -8, -8, -8, -5, -8, -8, -8 +}; + +/* YYTABLE[YYPACT[STATE-NUM]]. What to do in state STATE-NUM. If + positive, shift that token. If negative, reduce the rule which + number is the opposite. If zero, do what YYDEFACT says. 
+ If YYTABLE_NINF, syntax error. */ +#define YYTABLE_NINF -1 +static const unsigned char yytable[] = +{ + 2, 31, 12, 9, 5, 3, 13, 4, 14, 17, + 5, 16, 20, 18, 28, 19, 21, 23, 22, 5, + 10, 24, 29, 25, 11, 32, 33, 26, 27, 34, + 35, 36 +}; + +static const unsigned char yycheck[] = +{ + 0, 7, 5, 10, 10, 5, 9, 7, 11, 5, + 10, 9, 5, 9, 5, 11, 9, 5, 11, 10, + 4, 9, 27, 11, 4, 30, 8, 6, 5, 10, + 6, 5 +}; + +/* YYSTOS[STATE-NUM] -- The (internal number of the) accessing + symbol of state STATE-NUM. */ +static const unsigned char yystos[] = +{ + 0, 13, 0, 5, 7, 10, 14, 15, 16, 10, + 4, 4, 5, 9, 11, 18, 9, 5, 9, 11, + 5, 9, 11, 5, 9, 11, 6, 5, 5, 15, + 17, 7, 15, 8, 10, 6, 5 +}; + +#if ! defined (YYSIZE_T) && defined (__SIZE_TYPE__) +# define YYSIZE_T __SIZE_TYPE__ +#endif +#if ! defined (YYSIZE_T) && defined (size_t) +# define YYSIZE_T size_t +#endif +#if ! defined (YYSIZE_T) +# if defined (__STDC__) || defined (__cplusplus) +# include <stddef.h> /* INFRINGES ON USER NAME SPACE */ +# define YYSIZE_T size_t +# endif +#endif +#if ! defined (YYSIZE_T) +# define YYSIZE_T unsigned int +#endif + +#define yyerrok (yyerrstatus = 0) +#define yyclearin (yychar = YYEMPTY) +#define YYEMPTY (-2) +#define YYEOF 0 + +#define YYACCEPT goto yyacceptlab +#define YYABORT goto yyabortlab +#define YYERROR goto yyerrorlab + + +/* Like YYERROR except do call yyerror. This remains here temporarily + to ease the transition to the new meaning of YYERROR, for GCC. + Once GCC version 2 has supplanted version 1, this can go. 
*/ + +#define YYFAIL goto yyerrlab + +#define YYRECOVERING() (!!yyerrstatus) + +#define YYBACKUP(Token, Value) \ +do \ + if (yychar == YYEMPTY && yylen == 1) \ + { \ + yychar = (Token); \ + yylval = (Value); \ + yytoken = YYTRANSLATE (yychar); \ + YYPOPSTACK; \ + goto yybackup; \ + } \ + else \ + { \ + yyerror ("syntax error: cannot back up");\ + YYERROR; \ + } \ +while (0) + +#define YYTERROR 1 +#define YYERRCODE 256 + +/* YYLLOC_DEFAULT -- Compute the default location (before the actions + are run). */ + +#ifndef YYLLOC_DEFAULT +# define YYLLOC_DEFAULT(Current, Rhs, N) \ + ((Current).first_line = (Rhs)[1].first_line, \ + (Current).first_column = (Rhs)[1].first_column, \ + (Current).last_line = (Rhs)[N].last_line, \ + (Current).last_column = (Rhs)[N].last_column) +#endif + +/* YYLEX -- calling `yylex' with the right arguments. */ + +#ifdef YYLEX_PARAM +# define YYLEX yylex (YYLEX_PARAM) +#else +# define YYLEX yylex () +#endif + +/* Enable debugging if requested. */ +#if YYDEBUG + +# ifndef YYFPRINTF +# include <stdio.h> /* INFRINGES ON USER NAME SPACE */ +# define YYFPRINTF fprintf +# endif + +# define YYDPRINTF(Args) \ +do { \ + if (yydebug) \ + YYFPRINTF Args; \ +} while (0) + +# define YYDSYMPRINT(Args) \ +do { \ + if (yydebug) \ + yysymprint Args; \ +} while (0) + +# define YYDSYMPRINTF(Title, Token, Value, Location) \ +do { \ + if (yydebug) \ + { \ + YYFPRINTF (stderr, "%s ", Title); \ + yysymprint (stderr, \ + Token, Value); \ + YYFPRINTF (stderr, "\n"); \ + } \ +} while (0) + +/*------------------------------------------------------------------. +| yy_stack_print -- Print the state stack from its BOTTOM up to its | +| TOP (included). | +`------------------------------------------------------------------*/ + +#if defined (__STDC__) || defined (__cplusplus) +static void +yy_stack_print (short *bottom, short *top) +#else +static void +yy_stack_print (bottom, top) + short *bottom; + short *top; +#endif +{ + YYFPRINTF (stderr, "Stack now"); + for (/* Nothing. 
*/; bottom <= top; ++bottom) + YYFPRINTF (stderr, " %d", *bottom); + YYFPRINTF (stderr, "\n"); +} + +# define YY_STACK_PRINT(Bottom, Top) \ +do { \ + if (yydebug) \ + yy_stack_print ((Bottom), (Top)); \ +} while (0) + + +/*------------------------------------------------. +| Report that the YYRULE is going to be reduced. | +`------------------------------------------------*/ + +#if defined (__STDC__) || defined (__cplusplus) +static void +yy_reduce_print (int yyrule) +#else +static void +yy_reduce_print (yyrule) + int yyrule; +#endif +{ + int yyi; + unsigned int yylno = yyrline[yyrule]; + YYFPRINTF (stderr, "Reducing stack by rule %d (line %u), ", + yyrule - 1, yylno); + /* Print the symbols being reduced, and their result. */ + for (yyi = yyprhs[yyrule]; 0 <= yyrhs[yyi]; yyi++) + YYFPRINTF (stderr, "%s ", yytname [yyrhs[yyi]]); + YYFPRINTF (stderr, "-> %s\n", yytname [yyr1[yyrule]]); +} + +# define YY_REDUCE_PRINT(Rule) \ +do { \ + if (yydebug) \ + yy_reduce_print (Rule); \ +} while (0) + +/* Nonzero means print parse trace. It is left uninitialized so that + multiple parsers can coexist. */ +int yydebug; +#else /* !YYDEBUG */ +# define YYDPRINTF(Args) +# define YYDSYMPRINT(Args) +# define YYDSYMPRINTF(Title, Token, Value, Location) +# define YY_STACK_PRINT(Bottom, Top) +# define YY_REDUCE_PRINT(Rule) +#endif /* !YYDEBUG */ + + +/* YYINITDEPTH -- initial size of the parser's stacks. */ +#ifndef YYINITDEPTH +# define YYINITDEPTH 200 +#endif + +/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only + if the built-in stack extension method is used). + + Do not make this value too large; the results are undefined if + SIZE_MAX < YYSTACK_BYTES (YYMAXDEPTH) + evaluated with infinite-precision integer arithmetic. 
*/ + +#if defined (YYMAXDEPTH) && YYMAXDEPTH == 0 +# undef YYMAXDEPTH +#endif + +#ifndef YYMAXDEPTH +# define YYMAXDEPTH 10000 +#endif + + + +#if YYERROR_VERBOSE + +# ifndef yystrlen +# if defined (__GLIBC__) && defined (_STRING_H) +# define yystrlen strlen +# else +/* Return the length of YYSTR. */ +static YYSIZE_T +# if defined (__STDC__) || defined (__cplusplus) +yystrlen (const char *yystr) +# else +yystrlen (yystr) + const char *yystr; +# endif +{ + register const char *yys = yystr; + + while (*yys++ != '\0') + continue; + + return yys - yystr - 1; +} +# endif +# endif + +# ifndef yystpcpy +# if defined (__GLIBC__) && defined (_STRING_H) && defined (_GNU_SOURCE) +# define yystpcpy stpcpy +# else +/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in + YYDEST. */ +static char * +# if defined (__STDC__) || defined (__cplusplus) +yystpcpy (char *yydest, const char *yysrc) +# else +yystpcpy (yydest, yysrc) + char *yydest; + const char *yysrc; +# endif +{ + register char *yyd = yydest; + register const char *yys = yysrc; + + while ((*yyd++ = *yys++) != '\0') + continue; + + return yyd - 1; +} +# endif +# endif + +#endif /* !YYERROR_VERBOSE */ + + + +#if YYDEBUG +/*--------------------------------. +| Print this symbol on YYOUTPUT. | +`--------------------------------*/ + +#if defined (__STDC__) || defined (__cplusplus) +static void +yysymprint (FILE *yyoutput, int yytype, YYSTYPE *yyvaluep) +#else +static void +yysymprint (yyoutput, yytype, yyvaluep) + FILE *yyoutput; + int yytype; + YYSTYPE *yyvaluep; +#endif +{ + /* Pacify ``unused variable'' warnings. */ + (void) yyvaluep; + + if (yytype < YYNTOKENS) + { + YYFPRINTF (yyoutput, "token %s (", yytname[yytype]); +# ifdef YYPRINT + YYPRINT (yyoutput, yytoknum[yytype], *yyvaluep); +# endif + } + else + YYFPRINTF (yyoutput, "nterm %s (", yytname[yytype]); + + switch (yytype) + { + default: + break; + } + YYFPRINTF (yyoutput, ")"); +} + +#endif /* ! 
YYDEBUG */ +/*-----------------------------------------------. +| Release the memory associated to this symbol. | +`-----------------------------------------------*/ + +#if defined (__STDC__) || defined (__cplusplus) +static void +yydestruct (int yytype, YYSTYPE *yyvaluep) +#else +static void +yydestruct (yytype, yyvaluep) + int yytype; + YYSTYPE *yyvaluep; +#endif +{ + /* Pacify ``unused variable'' warnings. */ + (void) yyvaluep; + + switch (yytype) + { + + default: + break; + } +} + + +/* Prevent warnings from -Wmissing-prototypes. */ + +#ifdef YYPARSE_PARAM +# if defined (__STDC__) || defined (__cplusplus) +int yyparse (void *YYPARSE_PARAM); +# else +int yyparse (); +# endif +#else /* ! YYPARSE_PARAM */ +#if defined (__STDC__) || defined (__cplusplus) +int yyparse (void); +#else +int yyparse (); +#endif +#endif /* ! YYPARSE_PARAM */ + + + +/* The lookahead symbol. */ +int yychar; + +/* The semantic value of the lookahead symbol. */ +YYSTYPE yylval; + +/* Number of syntax errors so far. */ +int yynerrs; + + + +/*----------. +| yyparse. | +`----------*/ + +#ifdef YYPARSE_PARAM +# if defined (__STDC__) || defined (__cplusplus) +int yyparse (void *YYPARSE_PARAM) +# else +int yyparse (YYPARSE_PARAM) + void *YYPARSE_PARAM; +# endif +#else /* ! YYPARSE_PARAM */ +#if defined (__STDC__) || defined (__cplusplus) +int +yyparse (void) +#else +int +yyparse () + +#endif +#endif +{ + + register int yystate; + register int yyn; + int yyresult; + /* Number of tokens to shift before error messages enabled. */ + int yyerrstatus; + /* Lookahead token as an internal (translated) token number. */ + int yytoken = 0; + + /* Three stacks and their tools: + `yyss': related to states, + `yyvs': related to semantic values, + `yyls': related to locations. + + Refer to the stacks thru separate pointers, to allow yyoverflow + to reallocate them elsewhere. */ + + /* The state stack. */ + short yyssa[YYINITDEPTH]; + short *yyss = yyssa; + register short *yyssp; + + /* The semantic value stack. 
*/ + YYSTYPE yyvsa[YYINITDEPTH]; + YYSTYPE *yyvs = yyvsa; + register YYSTYPE *yyvsp; + + + +#define YYPOPSTACK (yyvsp--, yyssp--) + + YYSIZE_T yystacksize = YYINITDEPTH; + + /* The variables used to return semantic value and location from the + action routines. */ + YYSTYPE yyval; + + + /* When reducing, the number of symbols on the RHS of the reduced + rule. */ + int yylen; + + YYDPRINTF ((stderr, "Starting parse\n")); + + yystate = 0; + yyerrstatus = 0; + yynerrs = 0; + yychar = YYEMPTY; /* Cause a token to be read. */ + + /* Initialize stack pointers. + Waste one element of value and location stack + so that they stay on the same level as the state stack. + The wasted elements are never initialized. */ + + yyssp = yyss; + yyvsp = yyvs; + + goto yysetstate; + +/*------------------------------------------------------------. +| yynewstate -- Push a new state, which is found in yystate. | +`------------------------------------------------------------*/ + yynewstate: + /* In all cases, when you get here, the value and location stacks + have just been pushed. so pushing a state here evens the stacks. + */ + yyssp++; + + yysetstate: + *yyssp = yystate; + + if (yyss + yystacksize - 1 <= yyssp) + { + /* Get the current used size of the three stacks, in elements. */ + YYSIZE_T yysize = yyssp - yyss + 1; + +#ifdef yyoverflow + { + /* Give user a chance to reallocate the stack. Use copies of + these so that the &'s don't force the real ones into + memory. */ + YYSTYPE *yyvs1 = yyvs; + short *yyss1 = yyss; + + + /* Each stack pointer address is followed by the size of the + data in use in that stack, in bytes. This used to be a + conditional around just the two extra args, but that might + be undefined if yyoverflow is a macro. 
*/ + yyoverflow ("parser stack overflow", + &yyss1, yysize * sizeof (*yyssp), + &yyvs1, yysize * sizeof (*yyvsp), + + &yystacksize); + + yyss = yyss1; + yyvs = yyvs1; + } +#else /* no yyoverflow */ +# ifndef YYSTACK_RELOCATE + goto yyoverflowlab; +# else + /* Extend the stack our own way. */ + if (YYMAXDEPTH <= yystacksize) + goto yyoverflowlab; + yystacksize *= 2; + if (YYMAXDEPTH < yystacksize) + yystacksize = YYMAXDEPTH; + + { + short *yyss1 = yyss; + union yyalloc *yyptr = + (union yyalloc *) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize)); + if (! yyptr) + goto yyoverflowlab; + YYSTACK_RELOCATE (yyss); + YYSTACK_RELOCATE (yyvs); + +# undef YYSTACK_RELOCATE + if (yyss1 != yyssa) + YYSTACK_FREE (yyss1); + } +# endif +#endif /* no yyoverflow */ + + yyssp = yyss + yysize - 1; + yyvsp = yyvs + yysize - 1; + + + YYDPRINTF ((stderr, "Stack size increased to %lu\n", + (unsigned long int) yystacksize)); + + if (yyss + yystacksize - 1 <= yyssp) + YYABORT; + } + + YYDPRINTF ((stderr, "Entering state %d\n", yystate)); + + goto yybackup; + +/*-----------. +| yybackup. | +`-----------*/ +yybackup: + +/* Do appropriate processing given the current state. */ +/* Read a lookahead token if we need one and don't already have one. */ +/* yyresume: */ + + /* First try to decide what to do without reference to lookahead token. */ + + yyn = yypact[yystate]; + if (yyn == YYPACT_NINF) + goto yydefault; + + /* Not known => get a lookahead token if don't already have one. */ + + /* YYCHAR is either YYEMPTY or YYEOF or a valid lookahead symbol. */ + if (yychar == YYEMPTY) + { + YYDPRINTF ((stderr, "Reading a token: ")); + yychar = YYLEX; + } + + if (yychar <= YYEOF) + { + yychar = yytoken = YYEOF; + YYDPRINTF ((stderr, "Now at end of input.\n")); + } + else + { + yytoken = YYTRANSLATE (yychar); + YYDSYMPRINTF ("Next token is", yytoken, &yylval, &yylloc); + } + + /* If the proper action on seeing token YYTOKEN is to reduce or to + detect an error, take that action. 
*/ + yyn += yytoken; + if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken) + goto yydefault; + yyn = yytable[yyn]; + if (yyn <= 0) + { + if (yyn == 0 || yyn == YYTABLE_NINF) + goto yyerrlab; + yyn = -yyn; + goto yyreduce; + } + + if (yyn == YYFINAL) + YYACCEPT; + + /* Shift the lookahead token. */ + YYDPRINTF ((stderr, "Shifting token %s, ", yytname[yytoken])); + + /* Discard the token being shifted unless it is eof. */ + if (yychar != YYEOF) + yychar = YYEMPTY; + + *++yyvsp = yylval; + + + /* Count tokens shifted since error; after three, turn off error + status. */ + if (yyerrstatus) + yyerrstatus--; + + yystate = yyn; + goto yynewstate; + + +/*-----------------------------------------------------------. +| yydefault -- do the default action for the current state. | +`-----------------------------------------------------------*/ +yydefault: + yyn = yydefact[yystate]; + if (yyn == 0) + goto yyerrlab; + goto yyreduce; + + +/*-----------------------------. +| yyreduce -- Do a reduction. | +`-----------------------------*/ +yyreduce: + /* yyn is the number of a rule to reduce with. */ + yylen = yyr2[yyn]; + + /* If YYLEN is nonzero, implement the default value of the action: + `$$ = $1'. + + Otherwise, the following line sets YYVAL to garbage. + This behavior is undocumented and Bison + users should not rely upon it. Assigning to YYVAL + unconditionally makes the parser a bit smaller, and it avoids a + GCC warning that YYVAL may be used uninitialized. 
*/ + yyval = yyvsp[1-yylen]; + + + YY_REDUCE_PRINT (yyn); + switch (yyn) + { + case 3: +#line 66 "conf_parser.yxx" + { /* Whole config file */ ;} + break; + + case 4: +#line 69 "conf_parser.yxx" + { + // name: value + ((HtConfiguration *)aConf)->AddParsed(yyvsp[0].ConfLine->name,yyvsp[0].ConfLine->value); + #ifdef DEBUG + if (sn_debug>=2) { + cout<<"Added to conf: "<<yyvsp[0].ConfLine->name<<":"<<yyvsp[0].ConfLine->value<<endl; + } + #endif + delete [] yyvsp[0].ConfLine->name; + delete [] yyvsp[0].ConfLine->value; + delete yyvsp[0].ConfLine; + ;} + break; + + case 5: +#line 81 "conf_parser.yxx" + { + // <server www.gc.lviv.ua> + // server_max_docs: 456 + // ... : ... + // </server> + ;} + break; + + case 6: +#line 87 "conf_parser.yxx" + { /* Ignore empty lines */ ;} + break; + + case 7: +#line 90 "conf_parser.yxx" + { + // locale: uk_UA.KOI8-U + // + // We can't do inserting into config + // here because we don't know if it's + // in complex expression or not. + yyval.ConfLine=new ConfigDefaults; + yyval.ConfLine->name = yyvsp[-3].str; yyval.ConfLine->value=yyvsp[-1].str; + ;} + break; + + case 8: +#line 100 "conf_parser.yxx" + { + // max_head_length: 300000 + // + yyval.ConfLine=new ConfigDefaults; + yyval.ConfLine->name = yyvsp[-3].str; yyval.ConfLine->value=yyvsp[-1].str; + ;} + break; + + case 9: +#line 106 "conf_parser.yxx" + { + // bad_extensions: .XLS .xls .pdf .PDF .doc .DOC + // + yyval.ConfLine=new ConfigDefaults; + yyval.ConfLine->name = yyvsp[-3].str; yyval.ConfLine->value=yyvsp[-1].str; + ;} + break; + + case 10: +#line 112 "conf_parser.yxx" + { + // excude_urls: + // + yyval.ConfLine=new ConfigDefaults; + yyval.ConfLine->name = yyvsp[-2].str; + yyval.ConfLine->value=new char[1]; + *yyval.ConfLine->value='\0'; + ;} + break; + + case 11: +#line 122 "conf_parser.yxx" + { + // check if "<param> ... </param>" are equal + if (strcmp(yyvsp[-10].str,yyvsp[-2].str)!=0) { + // todo: setup error string, return with error. 
+ // Inform about line number + cerr<<"Brackets mismatch: Opened: "<<yyvsp[-10].str<<" Closed: "<<yyvsp[-2].str<<endl; + // exit(1); + } + // Oll right. Append set of parameters to object($2) + ((HtConfiguration *)aConf)->Add(yyvsp[-10].str,yyvsp[-8].str,yyvsp[-5].ConfLines); + #ifdef DEBUG + if (sn_debug >= 2) { + cout<<"Added to conf: "<<yyvsp[-10].str<<":"<<yyvsp[-8].str<<":"<<yyvsp[-5].ConfLines<<endl; + } + #endif + delete yyvsp[-10].str; + delete yyvsp[-8].str; + delete [] yyvsp[-2].str; + ;} + break; + + case 12: +#line 143 "conf_parser.yxx" + { + //aaa: nnn + //bbb: ccc + // ... + // + // First entry. We need to create conf to store it. + HtConfiguration *expressionList=new HtConfiguration(); + expressionList->AddParsed(yyvsp[0].ConfLine->name,yyvsp[0].ConfLine->value); + yyval.ConfLines=expressionList; + #ifdef DEBUG + if (sn_debug>=2) { + cout<<"Create list of properties: "<<expressionList<<endl; + } + #endif + delete yyvsp[0].ConfLine->name; + delete yyvsp[0].ConfLine->value; + delete yyvsp[0].ConfLine; +;} + break; + + case 13: +#line 161 "conf_parser.yxx" + { + yyvsp[-1].ConfLines->AddParsed(yyvsp[0].ConfLine->name,yyvsp[0].ConfLine->value); + #ifdef DEBUG + if (sn_debug>=2) { + cout<<yyvsp[0].ConfLine->name<<":"<<yyvsp[0].ConfLine->value<<" added to "<<yyvsp[-1].ConfLines<<endl; + } + #endif + delete yyvsp[0].ConfLine->name; + delete yyvsp[0].ConfLine->value; + delete yyvsp[0].ConfLine; + //$$=$1; //I think $$==$1 + ;} + break; + + case 14: +#line 173 "conf_parser.yxx" + { /* Ignore empty lines */ ;} + break; + + case 15: +#line 176 "conf_parser.yxx" + { + // Paste 2 components. Reallocate memory for 2 components. 
+ if ((yyval.str=new char[strlen(yyvsp[-1].str)+strlen(yyvsp[0].str)+1+1])==NULL) { + fprintf(stderr,"Can't allocate memory\n"); + exit(1); + } + strcpy(yyval.str,yyvsp[-1].str); + strcat(yyval.str," "); // Delimiter in list + strcat(yyval.str,yyvsp[0].str); + delete [] yyvsp[-1].str; + delete [] yyvsp[0].str; + ;} + break; + + case 16: +#line 189 "conf_parser.yxx" + { + // Paste 2 components. Reallocate memory for 2 components. + if ((yyval.str=new char[strlen(yyvsp[-1].str)+strlen(yyvsp[0].str)+1+1])==NULL) { + fprintf(stderr,"Can't allocate memory\n"); + exit(1); + } + strcpy(yyval.str,yyvsp[-1].str); + strcat(yyval.str," "); // Delimiter in list + strcat(yyval.str,yyvsp[0].str); + delete [] yyvsp[-1].str; + delete [] yyvsp[0].str; + ;} + break; + + case 17: +#line 202 "conf_parser.yxx" + { + // Paste 2 components. Reallocate memory for 2 components. + if ((yyval.str=new char[strlen(yyvsp[-1].str)+strlen(yyvsp[0].str)+1+1])==NULL) { + fprintf(stderr,"Can't allocate memory\n"); + exit(1); + } + strcpy(yyval.str,yyvsp[-1].str); + strcat(yyval.str," "); // Delimiter in list + strcat(yyval.str,yyvsp[0].str); + delete [] yyvsp[-1].str; + delete [] yyvsp[0].str; + ;} + break; + + case 18: +#line 215 "conf_parser.yxx" + { + // Paste 2 components. Reallocate memory for 2 components. 
+ if ((yyval.str=new char[strlen(yyvsp[-1].str)+strlen(yyvsp[0].str)+1+1])==NULL) { + fprintf(stderr,"Can't allocate memory\n"); + exit(1); + } + strcpy(yyval.str,yyvsp[-1].str); + strcat(yyval.str," "); // Delimiter in list + strcat(yyval.str,yyvsp[0].str); + delete [] yyvsp[-1].str; + delete [] yyvsp[0].str; + ;} + break; + + case 19: +#line 228 "conf_parser.yxx" + { + char *old=yyval.str; + if ((yyval.str=new char [strlen(yyval.str)+strlen(yyvsp[0].str)+1+1])==NULL) { + fprintf(stderr,"Can't reallocate memory\n"); + exit(1); + } + strcpy(yyval.str,old); + delete [] old; + strcat(yyval.str," "); + strcat(yyval.str,yyvsp[0].str); + delete [] yyvsp[0].str; + ;} + break; + + case 20: +#line 240 "conf_parser.yxx" + { + char *old=yyval.str; + if ((yyval.str=new char [strlen(yyval.str)+strlen(yyvsp[0].str)+1+1])==NULL) { + fprintf(stderr,"Can't reallocate memory\n"); + exit(1); + } + strcpy(yyval.str,old); + delete [] old; + strcat(yyval.str," "); + strcat(yyval.str,yyvsp[0].str); + delete [] yyvsp[0].str; + ;} + break; + + + } + +/* Line 1000 of yacc.c. */ +#line 1309 "conf_parser.cxx" + + yyvsp -= yylen; + yyssp -= yylen; + + + YY_STACK_PRINT (yyss, yyssp); + + *++yyvsp = yyval; + + + /* Now `shift' the result of the reduction. Determine what state + that goes to, based on the state we popped back to and the rule + number reduced by. */ + + yyn = yyr1[yyn]; + + yystate = yypgoto[yyn - YYNTOKENS] + *yyssp; + if (0 <= yystate && yystate <= YYLAST && yycheck[yystate] == *yyssp) + yystate = yytable[yystate]; + else + yystate = yydefgoto[yyn - YYNTOKENS]; + + goto yynewstate; + + +/*------------------------------------. +| yyerrlab -- here on detecting error | +`------------------------------------*/ +yyerrlab: + /* If not already recovering from an error, report this error. 
*/ + if (!yyerrstatus) + { + ++yynerrs; +#if YYERROR_VERBOSE + yyn = yypact[yystate]; + + if (YYPACT_NINF < yyn && yyn < YYLAST) + { + YYSIZE_T yysize = 0; + int yytype = YYTRANSLATE (yychar); + const char* yyprefix; + char *yymsg; + int yyx; + + /* Start YYX at -YYN if negative to avoid negative indexes in + YYCHECK. */ + int yyxbegin = yyn < 0 ? -yyn : 0; + + /* Stay within bounds of both yycheck and yytname. */ + int yychecklim = YYLAST - yyn; + int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS; + int yycount = 0; + + yyprefix = ", expecting "; + for (yyx = yyxbegin; yyx < yyxend; ++yyx) + if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR) + { + yysize += yystrlen (yyprefix) + yystrlen (yytname [yyx]); + yycount += 1; + if (yycount == 5) + { + yysize = 0; + break; + } + } + yysize += (sizeof ("syntax error, unexpected ") + + yystrlen (yytname[yytype])); + yymsg = (char *) YYSTACK_ALLOC (yysize); + if (yymsg != 0) + { + char *yyp = yystpcpy (yymsg, "syntax error, unexpected "); + yyp = yystpcpy (yyp, yytname[yytype]); + + if (yycount < 5) + { + yyprefix = ", expecting "; + for (yyx = yyxbegin; yyx < yyxend; ++yyx) + if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR) + { + yyp = yystpcpy (yyp, yyprefix); + yyp = yystpcpy (yyp, yytname[yyx]); + yyprefix = " or "; + } + } + yyerror (yymsg); + YYSTACK_FREE (yymsg); + } + else + yyerror ("syntax error; also virtual memory exhausted"); + } + else +#endif /* YYERROR_VERBOSE */ + yyerror ("syntax error"); + } + + + + if (yyerrstatus == 3) + { + /* If just tried and failed to reuse lookahead token after an + error, discard it. */ + + if (yychar <= YYEOF) + { + /* If at end of input, pop the error token, + then the rest of the stack, then return failure. 
*/ + if (yychar == YYEOF) + for (;;) + { + YYPOPSTACK; + if (yyssp == yyss) + YYABORT; + YYDSYMPRINTF ("Error: popping", yystos[*yyssp], yyvsp, yylsp); + yydestruct (yystos[*yyssp], yyvsp); + } + } + else + { + YYDSYMPRINTF ("Error: discarding", yytoken, &yylval, &yylloc); + yydestruct (yytoken, &yylval); + yychar = YYEMPTY; + + } + } + + /* Else will try to reuse lookahead token after shifting the error + token. */ + goto yyerrlab1; + + +/*---------------------------------------------------. +| yyerrorlab -- error raised explicitly by YYERROR. | +`---------------------------------------------------*/ +yyerrorlab: + +#ifdef __GNUC__ + /* Pacify GCC when the user code never invokes YYERROR and the label + yyerrorlab therefore never appears in user code. */ + if (0) + goto yyerrorlab; +#endif + + yyvsp -= yylen; + yyssp -= yylen; + yystate = *yyssp; + goto yyerrlab1; + + +/*-------------------------------------------------------------. +| yyerrlab1 -- common code for both syntax error and YYERROR. | +`-------------------------------------------------------------*/ +yyerrlab1: + yyerrstatus = 3; /* Each real token shifted decrements this. */ + + for (;;) + { + yyn = yypact[yystate]; + if (yyn != YYPACT_NINF) + { + yyn += YYTERROR; + if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR) + { + yyn = yytable[yyn]; + if (0 < yyn) + break; + } + } + + /* Pop the current state because it cannot handle the error token. */ + if (yyssp == yyss) + YYABORT; + + YYDSYMPRINTF ("Error: popping", yystos[*yyssp], yyvsp, yylsp); + yydestruct (yystos[yystate], yyvsp); + YYPOPSTACK; + yystate = *yyssp; + YY_STACK_PRINT (yyss, yyssp); + } + + if (yyn == YYFINAL) + YYACCEPT; + + YYDPRINTF ((stderr, "Shifting error token, ")); + + *++yyvsp = yylval; + + + yystate = yyn; + goto yynewstate; + + +/*-------------------------------------. +| yyacceptlab -- YYACCEPT comes here. 
| +`-------------------------------------*/ +yyacceptlab: + yyresult = 0; + goto yyreturn; + +/*-----------------------------------. +| yyabortlab -- YYABORT comes here. | +`-----------------------------------*/ +yyabortlab: + yyresult = 1; + goto yyreturn; + +#ifndef yyoverflow +/*----------------------------------------------. +| yyoverflowlab -- parser overflow comes here. | +`----------------------------------------------*/ +yyoverflowlab: + yyerror ("parser stack overflow"); + yyresult = 2; + /* Fall through. */ +#endif + +yyreturn: +#ifndef yyoverflow + if (yyss != yyssa) + YYSTACK_FREE (yyss); +#endif + return yyresult; +} + + +#line 253 "conf_parser.yxx" + +int +yyerror (char *s) /* Called by yyparse on error */ +{ + extern int yylineno; + extern int include_stack_ptr; + extern String *name_stack[]; + HtConfiguration* config= HtConfiguration::config(); + String str; + if (include_stack_ptr > 0) + str = *name_stack[include_stack_ptr-1]; + else // still at top level config + str = config->getFileName(); + //fprintf (stderr, "%s\nIn line %d\n",s,yylineno); + fprintf(stderr,"Error in file %s line %d: %s\n",str.get(),yylineno,s); + // exit(1); + return -1; +} + + diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/conf_parser.h b/debian/htdig/htdig-3.2.0b6/htcommon/conf_parser.h new file mode 100644 index 00000000..7b0c521d --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/conf_parser.h @@ -0,0 +1,73 @@ +/* A Bison parser, made by GNU Bison 1.875a. */ + +/* Skeleton parser for Yacc-like parsing with Bison, + Copyright (C) 1984, 1989, 1990, 2000, 2001, 2002, 2003 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +/* As a special exception, when this file is copied by Bison into a + Bison output file, you may use that output file without restriction. + This special exception was added by the Free Software Foundation + in version 1.24 of Bison. */ + +/* Tokens. */ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + /* Put the tokens into the symbol table, so that GDB and other debuggers + know about them. */ + enum yytokentype { + NUM = 258, + T_DELIMITER = 259, + T_NEWLINE = 260, + T_RIGHT_BR = 261, + T_LEFT_BR = 262, + T_SLASH = 263, + T_STRING = 264, + T_KEYWORD = 265, + T_NUMBER = 266 + }; +#endif +#define NUM 258 +#define T_DELIMITER 259 +#define T_NEWLINE 260 +#define T_RIGHT_BR 261 +#define T_LEFT_BR 262 +#define T_SLASH 263 +#define T_STRING 264 +#define T_KEYWORD 265 +#define T_NUMBER 266 + + + + +#if ! defined (YYSTYPE) && ! defined (YYSTYPE_IS_DECLARED) + +typedef union YYSTYPE { + char *str; + ConfigDefaults *ConfLine; + HtConfiguration *ConfLines; +} YYSTYPE; +/* Line 1240 of yacc.c. */ + +# define yystype YYSTYPE /* obsolescent; will be withdrawn */ +# define YYSTYPE_IS_DECLARED 1 +# define YYSTYPE_IS_TRIVIAL 1 +#endif + +extern YYSTYPE yylval; + + + diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/conf_parser.yxx b/debian/htdig/htdig-3.2.0b6/htcommon/conf_parser.yxx new file mode 100644 index 00000000..85d0213d --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/conf_parser.yxx @@ -0,0 +1,270 @@ +%{ +// +// conf_parser.yxx +// +// This syntax analyzer is used to parse ht://Dig config +// files. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: conf_parser.yxx,v 1.8 2004/06/10 14:48:39 angusgb Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +/* Bison version > 1.25 needed */ +/* TODO: +1. Better error handling +2. ? +*/ +#include <stdio.h> /* for debug */ +#include <stdlib.h> + +#ifdef HAVE_STD +#include <iostream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <iostream.h> +#endif /* HAVE_STD */ + +#include "HtConfiguration.h" +#include "htString.h" +/*#define YYDEBUG 1*/ +#define YYPARSE_PARAM aConf +int yyerror(char *s); +int yylex(void); +#undef DEBUG +#ifdef DEBUG +int sn_debug=3; +#endif +%} + +%union { + char *str; + ConfigDefaults *ConfLine; + HtConfiguration *ConfLines; +} + +%token NUM T_DELIMITER T_NEWLINE T_RIGHT_BR T_LEFT_BR T_SLASH +%token <str> T_STRING T_KEYWORD T_NUMBER +%type <str> list +%type <ConfLine> simple_expression +%type <ConfLines> simple_expression_list + +/* Grammar follows */ +%% + +input: + | input block { /* Whole config file */ } +; + +block: simple_expression { + // name: value + ((HtConfiguration *)aConf)->AddParsed($1->name,$1->value); + #ifdef DEBUG + if (sn_debug>=2) { + cout<<"Added to conf: "<<$1->name<<":"<<$1->value<<endl; + } + #endif + delete [] $1->name; + delete [] $1->value; + delete $1; + } + | complex_expression { + // <server www.gc.lviv.ua> + // server_max_docs: 456 + // ... : ... + // </server> + } + | T_NEWLINE { /* Ignore empty lines */ } +; + +simple_expression: T_KEYWORD T_DELIMITER T_STRING T_NEWLINE { + // locale: uk_UA.KOI8-U + // + // We can't do inserting into config + // here because we don't know if it's + // in complex expression or not. 
+ $$=new ConfigDefaults; + $$->name = $1; $$->value=$3; + } + + | T_KEYWORD T_DELIMITER T_NUMBER T_NEWLINE { + // max_head_length: 300000 + // + $$=new ConfigDefaults; + $$->name = $1; $$->value=$3; + } + | T_KEYWORD T_DELIMITER list T_NEWLINE { + // bad_extensions: .XLS .xls .pdf .PDF .doc .DOC + // + $$=new ConfigDefaults; + $$->name = $1; $$->value=$3; + } + | T_KEYWORD T_DELIMITER T_NEWLINE { + // excude_urls: + // + $$=new ConfigDefaults; + $$->name = $1; + $$->value=new char[1]; + *$$->value='\0'; + } +; + +complex_expression: T_LEFT_BR T_KEYWORD T_DELIMITER T_STRING T_RIGHT_BR T_NEWLINE simple_expression_list T_LEFT_BR T_SLASH T_KEYWORD T_RIGHT_BR T_NEWLINE { + // check if "<param> ... </param>" are equal + if (strcmp($2,$10)!=0) { + // todo: setup error string, return with error. + // Inform about line number + cerr<<"Brackets mismatch: Opened: "<<$2<<" Closed: "<<$10<<endl; + // exit(1); + } + // Oll right. Append set of parameters to object($2) + ((HtConfiguration *)aConf)->Add($2,$4,$7); + #ifdef DEBUG + if (sn_debug >= 2) { + cout<<"Added to conf: "<<$2<<":"<<$4<<":"<<$7<<endl; + } + #endif + delete $2; + delete $4; + delete [] $10; + } + ; + +simple_expression_list: simple_expression { + //aaa: nnn + //bbb: ccc + // ... + // + // First entry. We need to create conf to store it. + HtConfiguration *expressionList=new HtConfiguration(); + expressionList->AddParsed($1->name,$1->value); + $$=expressionList; + #ifdef DEBUG + if (sn_debug>=2) { + cout<<"Create list of properties: "<<expressionList<<endl; + } + #endif + delete $1->name; + delete $1->value; + delete $1; +} + | simple_expression_list simple_expression { + $1->AddParsed($2->name,$2->value); + #ifdef DEBUG + if (sn_debug>=2) { + cout<<$2->name<<":"<<$2->value<<" added to "<<$1<<endl; + } + #endif + delete $2->name; + delete $2->value; + delete $2; + //$$=$1; //I think $$==$1 + } + | T_NEWLINE { /* Ignore empty lines */ } + ; + +list: T_STRING T_STRING { + // Paste 2 components. 
Reallocate memory for 2 components. + if (($$=new char[strlen($1)+strlen($2)+1+1])==NULL) { + fprintf(stderr,"Can't allocate memory\n"); + exit(1); + } + strcpy($$,$1); + strcat($$," "); // Delimiter in list + strcat($$,$2); + delete [] $1; + delete [] $2; + } + + | T_NUMBER T_STRING { + // Paste 2 components. Reallocate memory for 2 components. + if (($$=new char[strlen($1)+strlen($2)+1+1])==NULL) { + fprintf(stderr,"Can't allocate memory\n"); + exit(1); + } + strcpy($$,$1); + strcat($$," "); // Delimiter in list + strcat($$,$2); + delete [] $1; + delete [] $2; + } + + | T_STRING T_NUMBER { + // Paste 2 components. Reallocate memory for 2 components. + if (($$=new char[strlen($1)+strlen($2)+1+1])==NULL) { + fprintf(stderr,"Can't allocate memory\n"); + exit(1); + } + strcpy($$,$1); + strcat($$," "); // Delimiter in list + strcat($$,$2); + delete [] $1; + delete [] $2; + } + + | T_NUMBER T_NUMBER { + // Paste 2 components. Reallocate memory for 2 components. + if (($$=new char[strlen($1)+strlen($2)+1+1])==NULL) { + fprintf(stderr,"Can't allocate memory\n"); + exit(1); + } + strcpy($$,$1); + strcat($$," "); // Delimiter in list + strcat($$,$2); + delete [] $1; + delete [] $2; + } + + | list T_STRING { + char *old=$$; + if (($$=new char [strlen($$)+strlen($2)+1+1])==NULL) { + fprintf(stderr,"Can't reallocate memory\n"); + exit(1); + } + strcpy($$,old); + delete [] old; + strcat($$," "); + strcat($$,$2); + delete [] $2; + } + | list T_NUMBER { + char *old=$$; + if (($$=new char [strlen($$)+strlen($2)+1+1])==NULL) { + fprintf(stderr,"Can't reallocate memory\n"); + exit(1); + } + strcpy($$,old); + delete [] old; + strcat($$," "); + strcat($$,$2); + delete [] $2; + } +; +%% +int +yyerror (char *s) /* Called by yyparse on error */ +{ + extern int yylineno; + extern int include_stack_ptr; + extern String *name_stack[]; + HtConfiguration* config= HtConfiguration::config(); + String str; + if (include_stack_ptr > 0) + str = *name_stack[include_stack_ptr-1]; + else // still at 
top level config + str = config->getFileName(); + //fprintf (stderr, "%s\nIn line %d\n",s,yylineno); + fprintf(stderr,"Error in file %s line %d: %s\n",str.get(),yylineno,s); + // exit(1); + return -1; +} diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/defaults.cc b/debian/htdig/htdig-3.2.0b6/htcommon/defaults.cc new file mode 100644 index 00000000..0148165c --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/defaults.cc @@ -0,0 +1,2832 @@ +// +// defaults.cc +// +// defaults: default values for the ht programs through the +// HtConfiguration class +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: defaults.cc,v 1.112 2004/06/12 13:39:12 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "HtConfiguration.h" + +// Fields and their values: +// Attribute name +// Default value ("" becomes "no default" in .html docs) +// Type (boolean, number, integer, string, string list, quoted string list, +// pattern list) +// Commands using attribute (all, htdig, htsearch, htfuzzy, +// htdump, htload, htnotify, htpurge) +// Block (Global, Server, URL) +// Versions for which attribute is present +// Class (Extra Output, External:Parsers, External:Protocols, +// File Layout, +// Indexing:Connection, Indexing:Out, Indexing:What,Indexing:Where, +// Presentation:Files, Presentation:How, Presentation:Text, +// Searching:Method, Searching:Ranking, Searching:UI, +// URLs) +// Example +// Description + +ConfigDefaults defaults[] = +{ + +{ "accents_db", "${database_base}.accents.db", \ + "string", "htfuzzy htsearch", "", "all", "File Layout", "accents_db: ${database_base}.uml.db", " \ + The database file used for the fuzzy \"accents\" search \ + algorithm. 
This database is created by \
+ <a href=\"htfuzzy.html\">htfuzzy</a> and used by \
+ <a href=\"htsearch.html\" target=\"_top\">htsearch</a>. \
+" }, \
+{ "accept_language", "", \
+ "string list", "htdig", "Server", "3.2.0b4", "Indexing:Out", "accept_language: en-us en it", " \
+ This attribute allows you to restrict the set of natural languages \
+ that are preferred as a response to an HTTP request performed by the \
+ digger. This can be done by putting one or more language tags \
+ (as defined by RFC 1766) in the preferred order, separated by spaces. \
+ By doing this, when the server performs a content negotiation based \
+ on the 'accept-language' given by the HTTP user agent, a different \
+ content can be shown depending on the value of this attribute. If \
+ set to an empty list, no language will be sent and the server default \
+ will be returned. \
+" }, \
+{ "add_anchors_to_excerpt", "true", \
+ "boolean", "htsearch", "", "3.1.0", "Presentation:How", "add_anchors_to_excerpt: no", " \
+ If set to true, the first occurrence of each matched \
+ word in the excerpt will be linked to the closest \
+ anchor in the document. This only has effect if the \
+ <strong>EXCERPT</strong> variable is used in the output \
+ template and the excerpt is actually going to be displayed. \
+" }, \
+{ "allow_double_slash", "false", \
+ "boolean", "htdig", "", "3.2.0b4", "Indexing:Out", "allow_double_slash: true", " \
+ If set to true, strings of multiple slashes ('/') in URL paths \
+ will be left intact, rather than being collapsed. This is necessary \
+ for some search engine URLs which use slashes to separate fields rather \
+ than to separate directory components. However, it can lead to multiple database \
+ entries referring to the same file, and it causes '/foo//../' to \
+ be equivalent to '/foo/', rather than to '/'. 
\
+" }, \
+{ "allow_in_form", "", \
+ "string list", "htsearch", "", "3.1.0", "Searching:UI", "allow_in_form: search_algorithm search_results_header", " \
+ Allows the specified config file attributes to be specified \
+ in search forms as separate fields. This could be used to \
+ allow form writers to design their own headers and footers \
+ and specify them in the search form. Another example would \
+ be to offer a menu of search_algorithms in the form. \
+ <table> \
+ <tr> \
+ <td nowrap> \
+ <code> \
+ <SELECT NAME=\"search_algorithm\"><br> \
+ <OPTION VALUE=\"exact:1 prefix:0.6 synonyms:0.5 endings:0.1\" SELECTED>fuzzy<br> \
+ <OPTION VALUE=\"exact:1\">exact<br> \
+ </SELECT> \
+ </code></td> \
+ </tr> \
+ </table> \
+ The general idea behind this is to make an input parameter out \
+ of any configuration attribute that's not already automatically \
+ handled by an input parameter. You can even make up your own \
+ configuration attribute names, for purposes of passing data from \
+ the search form to the results output. You're not restricted to \
+ the existing attribute names. The attributes listed in the \
+ allow_in_form list will be settable in the search form using \
+ input parameters of the same name, and will be propagated to \
+ the follow-up search form in the results template using template \
+ variables of the same name in upper-case. \
+ You can also make select lists out of any of these input \
+ parameters, in the follow-up search form, using the \
+ <a href=\"#build_select_lists\">build_select_lists</a> \
+ configuration attribute. \
+ <br>WARNING: Extreme care should be taken with this option, as \
+ allowing CGI scripts to set file names can open security holes.\
+" }, \
+{ "allow_numbers", "false", \
+ "boolean", "htdig htsearch", "", "all", "Indexing:What", "allow_numbers: true", " \
+ If set to true, numbers are considered words. This \
+ means that searches can be done on strings of digits as well as \
+ regular words. 
All the same rules apply to numbers as \
+ to words. This does not cause numbers containing a decimal point or \
+ commas to be treated as a single entity. \
+ When allow_numbers is false, words are still \
+ allowed to contain digits, but they must also contain at \
+ least one alphabetic character or \
+ <a href=\"#extra_word_characters\">extra word</a> character. \
+ To disallow digits in words, add the digits to \
+ <a href=\"#valid_punctuation\">valid_punctuation</a>. \
+" }, \
+{ "allow_space_in_url", "false", \
+ "boolean", "htdig", "", "3.2.0b6", "Indexing:Where", "allow_space_in_url: true", " \
+ If set to true, htdig will handle URLs that contain \
+ embedded spaces. Technically, this is a violation of \
+ RFC 2396, which says spaces should be stripped out \
+ (as htdig does by default). However, many web browsers \
+ and HTML code generators violate this standard already, \
+ so enabling this attribute allows htdig to handle these \
+ non-compliant URLs. Even with this attribute set, htdig \
+ still strips out all white space (leading, trailing and \
+ embedded), except that space characters embedded within \
+ the URL will be encoded as %20. \
+" }, \
+{ "allow_virtual_hosts", "true", \
+ "boolean", "htdig", "", "3.0.8b2", "Indexing:Where", "allow_virtual_hosts: false", " \
+ If set to true, htdig will index virtual web sites as \
+ expected. If false, all URL host names will be \
+ normalized into whatever the DNS server claims the IP \
+ address to map to. If this option is set to false, \
+ there is no way to index either \"soft\" or \"hard\" \
+ virtual web sites. \
+" }, \
+{ "anchor_target", "", \
+ "string", "htsearch", "", "3.1.6", "Presentation:How", "anchor_target: body", " \
+ When the first matched word in the excerpt is linked \
+ to the closest anchor in the document, this string \
+ can be set to specify a target in the link so the \
+ resulting page is displayed in the desired frame. 
\ + This value will only be used if the \ + <a href=\"#add_anchors_to_excerpt\">add_anchors_to_excerpt</a> \ + attribute is set to true, the <strong>EXCERPT</strong> \ + variable is used in the output template and the \ + excerpt is actually displayed with a link. \ +" }, \ +{ "any_keywords", "false", \ + "boolean", "htsearch", "", "3.2.0b2", "Searching:Method", "any_keywords: yes", " \ + If set to true, the words in the <strong>keywords</strong> \ + input parameter in the search form will be joined with logical \ + ORs rather than ANDs, so that any of the words provided will do. \ + Note that this has nothing to do with limiting the search to \ + words in META keywords tags. See the <a href=\"hts_form.html\"> \ + search form</a> documentation for details on this. \ +" }, \ +{ "author_factor", "1", \ + "number", "htsearch", "", "3.2.0b4", "Searching:Ranking", "author_factor: 1", " \ + Weighting applied to words in a <meta name=\"author\" ... > \ + tag.<br> \ + See also <a href=\"#heading_factor\">heading_factor</a>. \ +" }, \ +{ "authorization", "", \ + "string", "htdig", "URL", "3.1.4", "Indexing:Out", "authorization: myusername:mypassword", " \ + This tells htdig to send the supplied \ + <em>username</em><strong>:</strong><em>password</em> with each HTTP request. \ + The credentials will be encoded using the \"Basic\" authentication \ + scheme. There <em>must</em> be a colon (:) between the username and \ + password.<br> \ + This attribute can also be specified on htdig's command line using \ + the -u option, and will be blotted out so it won't show up in a \ + process listing. If you use it directly in a configuration file, \ + be sure to protect it so it is readable only by you, and do not \ + use that same configuration file for htsearch. 
\ +" }, \ +{ "backlink_factor", "0.1", \ + "number", "htsearch", "", "3.1.0", "Searching:Ranking", "backlink_factor: 501.1", " \ + This is a weight of \"how important\" a page is, based on \ + the number of URLs pointing to it. It's actually \ + multiplied by the ratio of the incoming URLs (backlinks) \ + and outgoing URLs (links on the page), to balance out pages \ + with lots of links to pages that link back to them. The ratio \ + gives lower weight to \"link farms\", which often have many \ + links to them. This factor can \ + be changed without changing the database in any way. \ + However, setting this value to something other than 0 \ + incurs a slowdown on search results. \ +" }, \ +{ "bad_extensions", ".wav .gz .z .sit .au .zip .tar .hqx .exe .com .gif .jpg .jpeg .aiff .class .map .ram .tgz .bin .rpm .mpg .mov .avi .css", \ + "string list", "htdig", "URL", "all", "Indexing:Where", "bad_extensions: .foo .bar .bad", " \ + This is a list of extensions on URLs which are \ + considered non-parsable. This list is used mainly to \ + supplement the MIME-types that the HTTP server provides \ + with documents. Some HTTP servers do not have a correct \ + list of MIME-types and so can advertise certain \ + documents as text while they are some binary format. \ + If the list is empty, then all extensions are acceptable, \ + provided they pass other criteria for acceptance or rejection. \ + See also <a href=\"#valid_extensions\">valid_extensions</a>. \ +" }, \ +{ "bad_local_extensions", ".php .shtml .cgi", \ + "string list", "htdig", "URL", "all", "Indexing:Where", "bad_extensions: .foo .bar .bad", " \ + This is a list of extensions on URLs which must be retrieved \ + using the URL's true transport mechanism (such as HTTP). \ + If <a href=\"#local_urls\">local_urls</a> is specified, URLs not \ + ending with these extensions may instead be retrieved through \ + the local filesystem for efficiency. 
\ +" }, +{ "bad_querystr", "", \ + "pattern list", "htdig", "URL", "3.1.0", "Indexing:Where", "bad_querystr: forum=private section=topsecret&passwd=required", " \ + This is a list of CGI query strings to be excluded from \ + indexing. This can be used in conjunction with CGI-generated \ + portions of a website to control which pages are \ + indexed. \ +" }, \ +{ "bad_word_list", "${common_dir}/bad_words", \ + "string", "htdig htsearch", "", "all", "Indexing:What,Searching:Method", "bad_word_list: ${common_dir}/badwords.txt", " \ + This specifies a file which contains words which should \ + be excluded when digging or searching. This list should \ + include the most common words or other words that you \ + don't want to be able to search on (things like <em> \ + sex</em> or <em>smut</em> are examples of these.)<br> \ + The file should contain one word per line. A sample \ + bad words file is located in the <code>contrib/examples</code> \ + directory. \ +" }, \ +{ "bin_dir", BIN_DIR, \ + "string", "all", "", "all", "File Layout", "bin_dir: /usr/local/bin", " \ + This is the directory in which the executables \ + related to ht://Dig are installed. It is never used \ + directly by any of the programs, but other attributes \ + can be defined in terms of this one. \ + <p> \ + The default value of this attribute is determined at \ + compile time. \ + </p> \ +" }, \ +{ "boolean_keywords", "and or not", \ + "string list", "htsearch", "", "3.1.6", "Presentation:How", "boolean_keywords: et ou non", " \ + These three strings are used as the keywords used in \ + constructing the \ + <a href=\"hts_templates.html#LOGICAL_WORDS\">LOGICAL_WORDS</a> \ + template variable, \ + and in parsing the <a href=\"hts_form.html#words\">words</a> input \ + parameter when the <a href=\"hts_form.html#method\">method</a> \ + parameter or <a href=\"#match_method\">match_method</a> attribute \ + is set to <code>boolean</code>. 
\
+ See also the \
+ <a href=\"#boolean_syntax_errors\">boolean_syntax_errors</a> attribute. \
+" }, 
+{ "boolean_syntax_errors", "Expected \
+ 'a search word, a quoted phrase or a boolean expression between ()' \
+ 'at the end' 'instead of' 'end of expression' quotes", \
+ "quoted string list", "htsearch", "", "3.1.6", "Presentation:How", 
+ "boolean_syntax_errors: Attendait \"un mot\" \"à la fin\" \
+ \"au lieu de\" \"fin d'expression\" \"guillemet\"", " \
+ These six strings are used as the keywords used to \
+ construct various syntax error messages for errors encountered in \
+ parsing the <a href=\"hts_form.html#words\">words</a> input \
+ parameter when the <a href=\"hts_form.html#method\">method</a> parameter \
+ or <a href=\"#match_method\">match_method</a> attribute \
+ is set to <code>boolean</code>. \
+ They are used in conjunction with the \
+ <a href=\"#boolean_keywords\">boolean_keywords</a> attribute, and \
+ comprise all \
+ English-specific parts of these error messages. The order in which \
+ the strings are put together may not be ideal, or even grammatically \
+ correct, for all languages, but they can be used to make fairly \
+ intelligible messages in many languages. \
+" }, 
+{ "build_select_lists", "", \
+ "quoted string list", "htsearch", "", "3.2.0b1", "Searching:UI", "build_select_lists: \
+ MATCH_LIST matchesperpage matches_per_page_list \\<br> \
+ 1 1 1 matches_per_page \"Previous Amount\" \\<br> \
+ RESTRICT_LIST,multiple restrict restrict_names 2 1 2 restrict \"\" \\<br> \
+ FORMAT_LIST,radio format template_map 3 2 1 template_name \"\"", " \
+ This list allows you to define any htsearch input parameter as \
+ a select list for use in templates, provided you also define \
+ the corresponding name list attribute which enumerates all the \
+ choices to put in the list. It can be used for existing input \
+ parameters, as well as any you define using the \
+ <a href=\"#allow_in_form\">allow_in_form</a> \
+ attribute. 
The entries in this list each consist of an octuple, \ + a set of eight strings defining the variables and how they are to \ + be used to build a select list. The attribute can contain many \ + of these octuples. The strings in the string list are merely \ + taken eight at a time. For each octuple of strings specified in \ + build_select_lists, the elements have the following meaning: \ + <ol> \ + <li>the name of the template variable to be defined as a list, \ + optionally followed by a comma and the type of list, and \ + optional formatting codes \ + <li>the input parameter name that the select list will set \ + <li>the name of the user-defined attribute containing the \ + name list \ + <li>the tuple size used in the name list above \ + <li>the index into a name list tuple for the value \ + <li>the index for the corresponding label on the selector \ + <li>the configuration attribute where the default value for \ + this input parameter is defined \ + <li>the default label, if not an empty string, which will be \ + used as the label for an additional list item for the current \ + input parameter value if it doesn't match any value in the \ + given list \ + </ol> \ + See the <a href=\"hts_selectors.html\">select list documentation</a> \ + for more information on this attribute. \ +" }, \ +{ "caps_factor", "1", \ + "number", "htsearch", "", "??", "Searching:Ranking", "caps_factor: 1", " \ + TO BE COMPLETED<br> \ + See also <a href=\"#heading_factor\">heading_factor</a>. \ +" }, \ +{ "case_sensitive", "true", \ + "boolean", "htdig", "", "3.1.0b2", "Indexing:Where", "case_sensitive: false", " \ + This specifies whether ht://Dig should consider URLs \ + case-sensitive or not. If your server is case-insensitive, \ + you should probably set this to false. 
<br> \ + Even if this is false, \ + <a href=\"#common_url_parts\">common_url_parts</a>, \ + <a href=\"#url_part_aliases\">url_part_aliases</a> and \ + <a href=\"#url_rewrite_rules\">url_rewrite_rules</a> \ + are all still case sensitive, and \ + <a href=\"#server_aliases\">server_aliases</a> \ + is still case insensitive. \ +" }, \ +{ "check_unique_date", "false", \ + "boolean", "htdig", "Global", "3.2.0b3", "", "check_unique_date: false", " \ + Include the modification date of the page in the MD5 hash, to reduce the \ + problem with identical but physically separate pages in different parts of the tree pointing to \ + different pages. \ +" }, \ +{ "check_unique_md5", "false", \ + "boolean", "htdig", "Global", "3.2.0b3", "", "check_unique_md5: false", " \ + Uses the MD5 hash of pages to reject aliases, prevents multiple entries \ + in the index caused by such things as symbolic links \ + Note: May not do the right thing for incremental update \ +" }, \ +{ "collection_names", "", \ + "string list", "htsearch", "", "3.2.0b2", "", "collection_names: htdig_docs htdig_bugs", " \ + This is a list of config file names that are used for searching multiple databases. \ + Simply put, htsearch will loop through the databases specified by each of these config \ + files and present the result of the search on all of the databases. \ + The corresponding config files are looked up in the <a href=\"#config_dir\">config_dir</a> directory. \ + Each listed config file <strong>must</strong> exist, as well as the corresponding databases. \ +" }, \ +{ "common_dir", COMMON_DIR, \ + "string", "all", "", "all", "File Layout", "common_dir: /tmp", " \ + Specifies the directory for files that will or can be \ + shared among different search databases. The default \ + value for this attribute is defined at compile time. \ +" }, \ +{ "common_url_parts", "http:// http://www. ftp:// ftp://ftp. 
/pub/ .html .htm .shtml /index.html /index.htm .com/ .com mailto:", \ + "string list", "all", "", "3.1.0", "URLs", "common_url_parts: http://www.htdig.org/ml/ \\<br> \ +.html \\<br> \ +http://dev.htdig.org/ \\<br> \ +http://www.htdig.org/", " \ + Sub-strings often found in URLs stored in the \ + database. These are replaced in the database by an \ + internal space-saving encoding. If a string \ + specified in <a href=\"#url_part_aliases\">url_part_aliases</a>, \ + overlaps any string in common_url_parts, the \ + common_url_parts string is ignored.<br> \ + Note that when this attribute is changed, the \ + database should be rebuilt, unless the effect of \ + \"changing\" the affected URLs in the database is \ + wanted.<br> \ +" }, \ +{ "compression_level", "6", \ + "integer", "htdig", "", "3.1.0", "Indexing:How", "compression_level: 0", " \ + If non-zero and the \ + <a href=\"http://www.cdrom.com/pub/infozip/zlib/\">zlib</a> \ + compression library was available when compiled, \ + this attribute controls the amount of compression used in the \ + <a href=\"#doc_excerpt\">doc_excerpt</a> file. \ + <br/>This must be in the range 0-9, and must be non-zero when \ + <a href=\"#wordlist_compress_zlib\">wordlist_compress_zlib</a> \ + is used. \ +" }, \ +{ "config", "", \ + "string", "all", "", "??", "File Layout", "", " \ + Name of configuration file to load. \ + For security reasons, restrictions are placed on the values which \ + can be specified on the command line to \ + <a href=\"htsearch.html\" target=\"_top\">htsearch</a>. \ + The default value of this attribute is determined at \ + compile time. \ +" }, \ +{ "config_dir", CONFIG_DIR, \ + "string", "all", "", "all", "File Layout", "config_dir: /var/htdig/conf", " \ + This is the directory which contains all configuration \ + files related to ht://Dig. It is never used \ + directly by any of the programs, but other attributes \ + or the <a href=\"#include\">include</a> directive \ + can be defined in terms of this one. 
\ + <p> \ + The default value of this attribute is determined at \ + compile time. \ + </p> \ +" }, +{ "content_classifier", "${bin_dir}/HtFileType", \ + "string", "htdig", "", "3.2.0b4", "Indexing:What", "content_classifier: file -i -b", " \ + When ht://Dig can't determine the type of a <code>file://</code> \ + URL from its extension, this program is used to determine the type. \ + The program is called with one argument, the name of (possibly a \ + temporary copy of) the file. \ + <p> \ + See also <a href=\"#mime_types\">mime_types</a>.\ + </p> \ +" }, \ +{ "cookies_input_file", "", \ + "string", "htdig", "", "3.2.0b4", "Indexing:Connection", "cookies_input_file: ${common_dir}/cookies.txt", " \ + Specifies the location of the file used for importing cookies \ + for the crawl. These cookies will be preloaded into htdig's \ + in-memory cookie jar, but aren't written back to the file. \ + Cookies are specified according to Netscape's format \ + (tab-separated fields). If this attribute is left blank, \ + no cookie file will be read. \ + For more information, see the sample cookies.txt file in the \ + ht://Dig source distribution. \ +" }, \ +{ "create_image_list", "false", \ + "boolean", "htdig", "", "all", "Extra Output", "create_image_list: yes", " \ + If set to true, a file with all the image URLs that \ + were seen will be created, one URL per line. This list \ + will not be in any order and there will be lots of \ + duplicates, so after htdig has completed, it should be \ + piped through <code>sort -u</code> to get a unique list. \ +" }, \ +{ "create_url_list", "false", \ + "boolean", "htdig", "", "all", "Extra Output", "create_url_list: yes", " \ + If set to true, a file with all the URLs that were seen \ + will be created, one URL per line. This list will not \ + be in any order and there will be lots of duplicates, \ + so after htdig has completed, it should be piped \ + through <code>sort -u</code> to get a unique list. 
\ +" }, \ +{ "database_base", "${database_dir}/db", \ + "string", "all", "", "all", "File Layout", "database_base: ${database_dir}/sales", " \ + This is the common prefix for files that are specific \ + to a search database. Many different attributes use \ + this prefix to specify filenames. Several search \ + databases can share the same directory by just changing \ + this value for each of the databases. \ +" }, \ +{ "database_dir", DATABASE_DIR, \ + "string", "all", "", "all", "File Layout", "database_dir: /var/htdig", " \ + This is the directory which contains all database and \ + other files related to ht://Dig. It is never used \ + directly by any of the programs, but other attributes \ + are defined in terms of this one. \ + <p> \ + The default value of this attribute is determined at \ + compile time. \ + </p> \ +" }, \ +{ "date_factor", "0", \ + "number", "htsearch", "", "3.1.0", "Searching:Ranking", "date_factor: 0.35", " \ + This factor, gives higher \ + rankings to newer documents and lower rankings to older \ + documents. Before setting this factor, it's advised to \ + make sure your servers are returning accurate dates \ + (check the dates returned in the long format). \ + Additionally, setting this to a nonzero value incurs a \ + small performance hit on searching. \ +" }, \ +{ "date_format", "", \ + "string", "htsearch", "", "3.1.2", "Presentation:How", "date_format: %Y-%m-%d", " \ + This format string determines the output format for \ + modification dates of documents in the search results. \ + It is interpreted by your system's <em>strftime</em> \ + function. Please refer to your system's manual page \ + for this function, for a description of available \ + format codes. If this format string is empty, as it \ + is by default, \ + <a href=\"htsearch.html\" target=\"_top\">htsearch</a> \ + will pick a format itself. In this case, the <a \ + href=\"#iso_8601\">iso_8601</a> attribute can be used \ + to modify the appearance of the date. 
\ +" }, \ +{ "description_factor", "150", \ + "number", "htsearch", "", "3.1.0b3", "Searching:Ranking", "description_factor: 350", " \ + Plain old \"descriptions\" are the text of a link pointing \ + to a document. This factor gives weight to the words of \ + these descriptions of the document. Not surprisingly, \ + these can be pretty accurate summaries of a document's \ + content. See also <a href=\"#heading_factor\">heading_factor</a> \ + and <a href=\"#meta_description_factor\">meta_description_factor</a>. \ +" }, \ +{ "description_meta_tag_names", "description", \ + "string list", "htdig", "", "3.1.6", "Searching:Ranking", "description_meta_tag_names: \"description htdig-description\"", " \ + The words in this list are used to search for descriptions in HTML \ + <em>META</em> tags. This list can contain any number of strings \ + that each will be seen as the name for whatever description \ + convention is used. While words in any of the specified \ + description contents will be indexed, only the last meta tag \ + containing a description will be kept for the \ + <a href=\"hts_templates.html#METADESCRIPTION\"METADESCRIPTION</a> \ + variable in search results. The order in \ + which the names are specified in this configuration attribute \ + is irrelevant, as it is the order in which the tags appear in \ + the documents that matters.<br> The <em>META</em> tags have the \ + following format:<br> \ + <tt> <META name=\"<em>somename</em>\" \ + content=\"<em>somevalue</em>\"> </tt><br> \ + See also <a href=\"#meta_description_factor\">meta_description_factor</a>. \ +" }, \ +{ "disable_cookies", "true", \ + "boolean", "htdig", "Server", "3.2.0b4", "Indexing:Connection", "disable_cookies: true", " \ + This option, if set to true, will disable HTTP cookies. 
\
+" }, \
+{ "doc_db", "${database_base}.docdb", \
+ "string", "all", "", "all", "File Layout", "doc_db: ${database_base}documents.db", " \
+ This file will contain a Berkeley database of documents \
+ indexed by document number. It contains all the information \
+ gathered for each document, except the document excerpts \
+ which are stored in the <a href=\"#doc_excerpt\"><em> \
+ doc_excerpt</em></a> file. \
+" }, \
+{ "doc_excerpt", "${database_base}.excerpts", \
+ "string", "all", "", "3.2.0b1", "File Layout", "doc_excerpt: ${database_base}excerpts.db", " \
+ This file will contain a Berkeley database of document excerpts \
+ indexed by document number. It contains all the text \
+ gathered for each document, so this file can become \
+ rather large if <a href=\"#max_head_length\"><em> \
+ max_head_length</em></a> is set to a large value. \
+ The size can be reduced by setting the \
+ <a href=\"#compression_level\"><em>compression_level</em></a>, \
+ if supported on your system. \
+" }, \
+{ "doc_index", "${database_base}.docs.index", \
+ "string", "htdig", "", "all", "File Layout", "doc_index: documents.index.db", " \
+ This file contains a mapping of document numbers to URLs and is \
+ used by htdig during indexing. It is used on updates if it exists. \
+" }, \
+{ "doc_list", "${database_base}.docs", \
+ "string", "htdig htdump htload", "", "all", "File Layout", "doc_list: /tmp/documents.text", " \
+ This file is basically a text version of the file \
+ specified in <em><a href=\"#doc_db\">doc_db</a></em>. Its \
+ only use is to have a human readable database of all \
+ documents. The file is easy to parse with tools like \
+ perl or tcl. \
+" }, \
+{ "endday", "", \
+ "integer", "htsearch", "", "3.1.6", "Searching:Method", "endday: 31", " \
+ Day component of last date allowed as last-modified date \
+ of returned documents. \
+ This is most usefully specified as a \
+ <a href=\"hts_form.html#startyear\">CGI argument</a>. 
\ + See also <a href=\"#startyear\">startyear</a>. \ +" }, \ +{ "end_ellipses", "<strong><code> ...</code></strong>", \ + "string", "htsearch", "", "all", "Presentation:Text", "end_ellipses: ...", " \ + When excerpts are displayed in the search output, this \ + string will be appended to the excerpt if there is text \ + following the text displayed. This is just a visual \ + reminder to the user that the excerpt is only part of \ + the complete document. \ +" }, \ +{ "end_highlight", "</strong>", \ + "string", "htsearch", "", "3.1.4", "Presentation:Text", "end_highlight: </font>", " \ + When excerpts are displayed in the search output, matched \ + words will be highlighted using <a href=\"#start_highlight\"> \ + start_highlight</a> and this string. \ + You should ensure that highlighting tags are balanced, \ + that is, this string should close any formatting \ + tag opened by start_highlight. \ +" }, \ +{ "endings_affix_file", "${common_dir}/english.aff", \ + "string", "htfuzzy", "", "all", "File Layout", "endings_affix_file: /var/htdig/affix_rules", " \ + Specifies the location of the file which contains the \ + affix rules used to create the endings search algorithm \ + databases. Consult the documentation on \ + <a href=\"htfuzzy.html\">htfuzzy</a> for more information on the \ + format of this file. \ +" }, \ +{ "endings_dictionary", "${common_dir}/english.0", \ + "string", "htfuzzy", "", "all", "File Layout", "endings_dictionary: /var/htdig/dictionary", " \ + Specifies the location of the file which contains the \ + dictionary used to create the endings search algorithm \ + databases. Consult the documentation on \ + <a href=\"htfuzzy.html\">htfuzzy</a> for more information on the \ + format of this file. 
\ +" }, \ +{ "endings_root2word_db", "${common_dir}/root2word.db", \ + "string", "htfuzzy htsearch", "", "all", "File Layout", "endings_root2word_db: /var/htdig/r2w.db", " \ + This attributes specifies the database filename to be \ + used in the 'endings' fuzzy search algorithm. The \ + database maps word roots to all legal words with that \ + root. For more information about this and other fuzzy \ + search algorithms, consult the \ + <a href=\"htfuzzy.html\">htfuzzy</a> documentation.<br> \ + Note that the default value uses the \ + <a href=\"#common_dir\">common_dir</a> attribute instead of the \ + <a href=\"#database_dir\">database_dir</a> attribute. \ + This is because this database can be shared with \ + different search databases. \ +" }, \ +{ "endings_word2root_db", "${common_dir}/word2root.db", \ + "string", "htfuzzy htsearch", "", "all", "File Layout", "endings_word2root_db: /var/htdig/w2r.bm", " \ + This attributes specifies the database filename to be \ + used in the 'endings' fuzzy search algorithm. The \ + database maps words to their root. For more information \ + about this and other fuzzy search algorithms, consult \ + the <a href=\"htfuzzy.html\">htfuzzy</a> \ + documentation.<br> \ + Note that the default value uses the \ + <a href=\"#common_dir\">common_dir</a> attribute instead of the \ + <a href=\"#database_dir\">database_dir</a> attribute. \ + This is because this database can be shared with \ + different search databases. \ +" }, \ +{ "endmonth", "", \ + "integer", "htsearch", "", "3.1.6", "Searching:Method", "endmonth: 12", " \ + Month component of last date allowed as last-modified date \ + of returned docutments. \ + This is most usefully specified as a \ + <a href=\"hts_form.html#startyear\">GCI argument</a>. \ + See also <a href=\"#startyear\">startyear</a>. 
\ +" }, \ +{ "endyear", "", \ + "integer", "htsearch", "", "3.1.6", "Searching:Method", "endyear: 2002", " \ + Year component of last date allowed as last-modified date \ + of returned docutments. \ + This is most usefully specified as a \ + <a href=\"hts_form.html#startyear\">GCI argument</a>. \ + See also <a href=\"#startyear\">startyear</a>. \ +" }, \ +{ "excerpt_length", "300", \ + "integer", "htsearch", "", "all", "Presentation:How", "excerpt_length: 500", " \ + This is the maximum number of characters the displayed \ + excerpt will be limited to. The first matched word will \ + be highlighted in the middle of the excerpt so that there is \ + some surrounding context.<br> \ + The <em><a href=\"#start_ellipses\"> \ + start_ellipses</a></em> and \ + <em><a href=\"#end_ellipses\">end_ellipses</a></em> are used to \ + indicate that the document contains text before and \ + after the displayed excerpt respectively. \ + The <em><a href=\"#start_highlight\">start_highlight</a></em> and \ + <em><a href=\"#end_highlight\">end_highlight</a></em> are used to \ + specify what formatting tags are used to highlight matched words. \ +" }, \ +{ "excerpt_show_top", "false", \ + "boolean", "htsearch", "", "all", "Presentation:How", "excerpt_show_top: yes", " \ + If set to true, the excerpt of a match will always show \ + the top of the matching document. If it is false (the \ + default), the excerpt will attempt to show the part of \ + the document that actually contains one of the words. \ +" }, \ +{ "exclude", "", \ + "pattern list", "htsearch", "", "3.2.0b4", "Searching:Method", "exclude: myhost.com/mailarchive/", " \ + If a URL contains any of the space separated patterns, it will be \ + discarded in the searching phase. This is used to exclude certain \ + URLs from search results. The list can be specified from within \ + the configuration file, and can be overridden with the \"exclude\" \ + input parameter in the search form. 
\ +" }, \ +{ "exclude_urls", "/cgi-bin/ .cgi", \ + "pattern list", "htdig", "URL", "all", "Indexing:Where", "exclude_urls: students.html cgi-bin", " \ + If a URL contains any of the space separated patterns, \ + it will be rejected. This is used to exclude such \ + common things such as an infinite virtual web-tree \ + which start with cgi-bin. \ +" }, \ +{ "external_parsers", "", \ + "quoted string list", "htdig", "", "3.0.7", "External:Parsers", "external_parsers: text/html /usr/local/bin/htmlparser \\<br> \ + application/pdf /usr/local/bin/parse_doc.pl \\<br> \ + application/msword->text/plain \"/usr/local/bin/mswordtotxt -w\" \\<br> \ + application/x-gunzip->user-defined /usr/local/bin/ungzipper", " \ + This attribute is used to specify a list of \ + content-type/parsers that are to be used to parse \ + documents that cannot by parsed by any of the internal \ + parsers. The list of external parsers is examined \ + before the builtin parsers are checked, so this can be \ + used to override the internal behavior without \ + recompiling htdig.<br> \ + The external parsers are specified as pairs of \ + strings. The first string of each pair is the \ + content-type that the parser can handle while the \ + second string of each pair is the path to the external \ + parsing program. If quoted, it may contain parameters, \ + separated by spaces.<br> \ + External parsing can also be done with external \ + converters, which convert one content-type to \ + another. To do this, instead of just specifying \ + a single content-type as the first string \ + of a pair, you specify two types, in the form \ + <em>type1</em><strong>-></strong><em>type2</em>, \ + as a single string with no spaces. The second \ + string will define an external converter \ + rather than an external parser, to convert \ + the first type to the second. 
If the second \ + type is <strong>user-defined</strong>, then \ + it's up to the converter script to put out a \ + \"Content-Type: <em>type</em>\" header followed \ + by a blank line, to indicate to htdig what type it \ + should expect for the output, much like what a CGI \ + script would do. The resulting content-type must \ + be one that htdig can parse, either internally, \ + or with another external parser or converter.<br> \ + Only one external parser or converter can be \ + specified for any given content-type. However, \ + an external converter for one content-type can be \ + chained to the internal parser for the same type, \ + by appending <strong>-internal</strong> to the \ + second type string (e.g. text/html->text/html-internal) \ + to perform external preprocessing on documents of \ + this type before internal parsing. \ + There are two internal parsers, for text/html and \ + text/plain.<p> \ + The parser program takes four command-line \ + parameters, not counting any parameters already \ + given in the command string:<br> \ + <em>infile content-type URL configuration-file</em><br> \ + <table border=\"1\"> \ + <tr> \ + <th> Parameter </th> \ + <th> Description </th> \ + <th> Example </th> \ + </tr> \ + <tr> \ + <td valign=\"top\"> infile </td> \ + <td> A temporary file with the contents to be parsed. </td> \ + <td> /var/tmp/htdext.14242 </td> \ + </tr> \ + <tr> \ + <td valign=\"top\"> content-type </td> \ + <td> The MIME-type of the contents. </td> \ + <td> text/html </td> \ + </tr> \ + <tr> \ + <td valign=\"top\"> URL </td> \ + <td> The URL of the contents. </td> \ + <td> http://www.htdig.org/attrs.html </td> \ + </tr> \ + <tr> \ + <td valign=\"top\"> configuration-file </td> \ + <td> The configuration-file in effect. </td> \ + <td> /etc/htdig/htdig.conf </td> \ + </tr> \ + </table><p> \ + The external parser is to write information for \ + htdig on its standard output. 
Unless it is an \ + external converter, which will output a document \ + of a different content-type, then its output must \ + follow the format described here.<br> \ + The output consists of records, each record terminated \ + with a newline. Each record is a series of (unless \ + expressively allowed to be empty) non-empty tab-separated \ + fields. The first field is a single character \ + that specifies the record type. The rest of the fields \ + are determined by the record type. \ + <table border=\"1\"> \ + <tr> \ + <th> Record type </th> \ + <th> Fields </th> \ + <th> Description </th> \ + </tr> \ + <tr> \ + <th rowspan=\"3\" valign=\"top\"> w </th> \ + <td valign=\"top\"> word </td> \ + <td> A word that was found in the document. </td> \ + </tr> \ + <tr> \ + <td valign=\"top\"> location </td> \ + <td> \ + A number indicating the normalized location of \ + the word within the document. The number has to \ + fall in the range 0-1000 where 0 means the top of \ + the document. \ + </td> \ + </tr> \ + <tr> \ + <td valign=\"top\"> heading level </td> \ + <td> \ + A heading level that is used to compute the \ + weight of the word depending on its context in \ + the document itself. 
The level is in the range of \ + 0-11 and are defined as follows: \ + <dl compact> \ + <dt> 0 </dt> <dd> Normal text </dd> \ + <dt> 1 </dt> <dd> Title text </dd> \ + <dt> 2 </dt> <dd> Heading 1 text </dd> \ + <dt> 3 </dt> <dd> Heading 2 text </dd> \ + <dt> 4 </dt> <dd> Heading 3 text </dd> \ + <dt> 5 </dt> <dd> Heading 4 text </dd> \ + <dt> 6 </dt> <dd> Heading 5 text </dd> \ + <dt> 7 </dt> <dd> Heading 6 text </dd> \ + <dt> 8 </dt> <dd> text alternative to images </dd> \ + <dt> 9 </dt> <dd> Keywords </dd> \ + <dt> 10 </dt> <dd> Meta-description </dd> \ + <dt> 11 </dt> <dd> Author </dd> \ + </dl> \ + </td> \ + </tr> \ + <tr> \ + <th rowspan=\"2\" valign=\"top\"> u </th> \ + <td valign=\"top\"> document URL </td> \ + <td> \ + A hyperlink to another document that is \ + referenced by the current document. It must be \ + complete and non-relative, using the URL parameter to \ + resolve any relative references found in the document. \ + </td> \ + </tr> \ + <tr> \ + <td valign=\"top\"> hyperlink description </td> \ + <td> \ + For HTML documents, this would be the text \ + between the <a href...> and </a> \ + tags. \ + </td> \ + </tr> \ + <tr> \ + <th valign=\"top\"> t </th> \ + <td valign=\"top\"> title </td> \ + <td> The title of the document </td> \ + </tr> \ + <tr> \ + <th valign=\"top\"> h </th> \ + <td valign=\"top\"> head </td> \ + <td> \ + The top of the document itself. This is used to \ + build the excerpt. This should only contain \ + normal ASCII text \ + </td> \ + </tr> \ + <tr> \ + <th valign=\"top\"> a </th> \ + <td valign=\"top\"> anchor </td> \ + <td> \ + The label that identifies an anchor that can be \ + used as a target in an URL. This really only \ + makes sense for HTML documents. \ + </td> \ + </tr> \ + <tr> \ + <th valign=\"top\"> i </th> \ + <td valign=\"top\"> image URL </td> \ + <td> \ + An URL that points at an image that is part of \ + the document. 
\ + </td> \ + </tr> \ + <tr> \ + <th rowspan=\"3\" valign=\"top\"> m </th> \ + <td valign=\"top\"> http-equiv </td> \ + <td> \ + The HTTP-EQUIV attribute of a \ + <a href=\"meta.html\"><em>META</em> tag</a>. \ + May be empty. \ + </td> \ + </tr> \ + <tr> \ + <td valign=\"top\"> name </td> \ + <td> \ + The NAME attribute of this \ + <a href=\"meta.html\"><em>META</em> tag</a>. \ + May be empty. \ + </td> \ + </tr> \ + <tr> \ + <td valign=\"top\"> contents </td> \ + <td> \ + The CONTENTS attribute of this \ + <a href=\"meta.html\"><em>META</em> tag</a>. \ + May be empty. \ + </td> \ + </tr> \ + </table> \ + <p><em>See also FAQ questions <a href=\"FAQ.html#q4.8\">4.8</a> and \ + <a href=\"FAQ.html#q4.9\">4.9</a> for more examples.</em></p> \ +" }, \ +{ "external_protocols", "", \ + "quoted string list", "htdig", "", "3.2.0b1", "External:Protocols", "external_protocols: https /usr/local/bin/handler.pl \\<br> \ + ftp /usr/local/bin/ftp-handler.pl", " \ + This attribute is a bit like \ + <a href=\"#external_parsers\">external_parsers</a> since it specifies \ + a list of protocols/handlers that are used to download documents \ + that cannot be retrieved using the internal methods. This enables \ + htdig to index documents with URL schemes it does not understand, \ + or to use more advanced authentication for the documents it is \ + retrieving. This list is checked before HTTP or other methods, \ + so this can override the internal behavior without writing additional \ + code for htdig.<br> \ + The external protocols are specified as pairs of strings, the first \ + being the URL scheme that the script can handle while the second \ + is the path to the script itself. If the second is \ + quoted, then additional command-line arguments may be given.<br> \ + If the external protocol does not contain a colon (:), it is assumed \ + to have the standard format \ + \"protocol://[usr[:password]@]address[:port]/path\". 
\ + If it ends with a colon, then it is assumed to have the simpler format \ + \"protocol:path\". If it ends with \"://\" then the standard form is \ + again assumed. <br> \ + The program takes three command-line parameters, not counting any \ + parameters already given in the command string:<br> \ + <em>protocol URL configuration-file</em><br> \ + <table border=\"1\"> \ + <tr> \ + <th> Parameter </th> \ + <th> Description </th> \ + <th> Example </th> \ + </tr> \ + <tr> \ + <td valign=\"top\"> protocol </td> \ + <td> The URL scheme to be used. </td> \ + <td> https </td> \ + </tr> \ + <tr> \ + <td valign=\"top\"> URL </td> \ + <td> The URL to be retrieved. </td> \ + <td> https://www.htdig.org:8008/attrs.html </td> \ + </tr> \ + <tr> \ + <td valign=\"top\"> configuration-file </td> \ + <td> The configuration-file in effect. </td> \ + <td> /etc/htdig/htdig.conf </td> \ + </tr> \ + </table><p> \ + The external protocol script is to write information for htdig on the \ + standard output. The output must follow the form described here. The \ + output consists of a header followed by a blank line, followed by \ + the contents of the document. Each record in the header is terminated \ + with a newline. Each record is a series of (unless expressively \ + allowed to be empty) non-empty tab-separated fields. The first field \ + is a single character that specifies the record type. The rest of \ + the fields are determined by the record type. \ + <table border=\"1\"> \ + <tr> \ + <th> Record type </th> \ + <th> Fields </th> \ + <th> Description </th> \ + </tr> \ + <tr> \ + <th valign=\"top\"> s </th> \ + <td valign=\"top\"> status code </td> \ + <td> \ + An HTTP-style status code, e.g. 200, 404. 
Typical codes include: \ + <dl compact> \ + <dt> 200 </dt> \ + <dd> Successful retrieval </dd> \ + <dt> 304 </dt> \ + <dd> \ + Not modified (for example, if the document hasn\'t \ + changed since the last dig) \ + </dd> \ + <dt> 301 </dt> \ + <dd> Redirect (to another URL) </dd> \ + <dt> 401 </dt> \ + <dd> Not authorized </dd> \ + <dt> 404 </dt> \ + <dd> Not found </dd> \ + </dl> \ + </td> \ + </tr> \ + <tr> \ + <th valign=\"top\"> r </th> \ + <td valign=\"top\"> reason </td> \ + <td> \ + A text string describing the status code, \ + e.g \"Redirect\" or \"Not Found.\" \ + </td> \ + </tr> \ + <tr> \ + <th valign=\"top\"> m </th> \ + <td valign=\"top\"> status code </td> \ + <td> \ + The modification time of this document. While the code is \ + fairly flexible about the time/date formats it accepts, it \ + is recommended to use something standard, like \ + RFC1123: Sun, 06 Nov 1994 08:49:37 GMT, or \ + ISO-8601: 1994-11-06 08:49:37 GMT. \ + </td> \ + </tr> \ + <tr> \ + <th valign=\"top\"> t </th> \ + <td valign=\"top\"> content-type </td> \ + <td> \ + A valid MIME type for the document, like text/html or text/plain. \ + </td> \ + </tr> \ + <tr> \ + <th valign=\"top\"> l </th> \ + <td valign=\"top\"> content-length </td> \ + <td> \ + The length of the document on the server, which may not \ + necessarily be the length of the buffer returned. \ + </td> \ + </tr> \ + <tr> \ + <th valign=\"top\"> u </th> \ + <td valign=\"top\"> url </td> \ + <td> \ + The URL of the document, or in the case of a redirect, the \ + URL that should be indexed as a result of the redirect. \ + </td> \ + </tr> \ + </table> \ +" }, \ +{ "extra_word_characters", "", \ + "string", "htdig htsearch", "", "3.1.2", "Indexing:What", "extra_word_characters: _", " \ + These characters are considered part of a word. \ + In contrast to the characters in the \ + <a href=\"#valid_punctuation\">valid_punctuation</a> \ + attribute, they are treated just like letter \ + characters. 
See also the <a href=\"#allow_numbers\">allow_numbers</a>\ + attribute.<br> \ + Note that the <a href=\"#locale\">locale</a> attribute \ + is normally used to configure which characters \ + constitute letter characters.<br> \ + Note also that it is an error to have characters in both \ + extra_word_characters and \ + <a href=\"#valid_punctuation\">valid_punctuation</a>. \ + To add one of the characters in the default valid_punctuation to \ + extra_word_characters, an explicit valid_punctuation entry must be \ + added to the configuration file.<br> \ + See also the comments about special characters at \ + <a href=\"#valid_punctuation\">valid_punctuation</a>. \ +" }, \ +{ "head_before_get", "true", \ + "boolean", "htdig", "Server", "3.2.0b1", "Indexing:Connection", "head_before_get: false", " \ + If set to true, an HTTP/1.1 <em>HEAD</em> \ + call is made in order to retrieve header information about a document. \ + If the status code and the content-type returned show that the \ + document is parsable, then a subsequent 'GET' call is made. In \ + general, it is recommended that this attribute be set to 'true', \ + as it can really improve performance (especially when used with \ + persistent connections). This is particularly so during an \ + incremental dig, since in this case 'htdig' can ask the server if the \ + document has been modified since last dig. 
However there are a few \ + cases when it is better to switch it off: \ + <ul> \ + <li>the majority of documents are parsable (HTML or a type for which \ + an external parser has been provided) and must be retrieved anyway \ + (initial dig);</li> \ + <li>the server does not support the HEAD method or it is \ + disabled;</li> \ + <li>in some cases <a href=\"#persistent_connections\">persistent_connections</a> may \ + not work properly and either the 'head_before_get' attribute or the \ + 'persistent_connections' attribute must be turned off.</li> \ + </ul> \ +" }, \ +{ "heading_factor", "5", \ + "number", "htsearch", "", "3.2.0b1", "Searching:Ranking", "heading_factor: 20", " \ + This is a factor which will be used to multiply the \ + weight of words between <h1> and </h1> \ + tags, as well as headings of levels <h2> through \ + <h6>. It is used to assign the level of importance \ + to headings. Setting a factor to 0 will cause words \ + in these headings to be ignored. The number may be a \ + floating point number. See also \ + <a href=\"#author_factor\">author_factor</a> \ + <a href=\"#backlink_factor\">backlink_factor</a> \ + <a href=\"#caps_factor\">caps_factor</a> \ + <a href=\"#date_factor\">date_factor</a> \ + <a href=\"#description_factor\">description_factor</a> \ + <a href=\"#keywords_factor\">keywords_factor</a> \ + <a href=\"#meta_description_factor\">meta_description_factor</a> \ + <a href=\"#text_factor\">text_factor</a> \ + <a href=\"#title_factor\">title_factor</a> \ + <a href=\"#url_text_factor\">url_text_factor</a> \ +" }, \ +{ "htnotify_prefix_file", "", \ + "string", "htnotify", "", "3.2.0b3", "Extra Output", "htnotify_prefix_file: ${common_dir}/notify_prefix.txt", " \ + Specifies the file containing text to be inserted in each mail \ + message sent by htnotify before the list of expired webpages. If omitted, \ + nothing is inserted. 
\ +" }, \ +{ "htnotify_replyto", "", \ + "string", "htnotify", "", "3.2.0b3", "Extra Output", "htnotify_replyto: [email protected]", " \ + This specifies the email address that htnotify email messages \ + include in the Reply-to: field. \ +" }, \ +{ "htnotify_sender", "webmaster@www", \ + "string", "htnotify", "", "all", "Extra Output", "htnotify_sender: [email protected]", " \ + This specifies the email address that htnotify email \ + messages get sent out from. The address is forged using \ + /usr/lib/sendmail. Check htnotify/htnotify.cc for \ + detail on how this is done. \ +" }, \ +{ "htnotify_suffix_file", "", \ + "string", "htnotify", "", "3.2.0b3", "Extra Output", "htnotify_suffix_file: ${common_dir}/notify_suffix.txt", " \ + Specifies the file containing text to be inserted in each mail message \ + sent by htnotify after the list of expired webpages. If omitted, htnotify \ + will insert a standard message. \ +" }, \ +{ "htnotify_webmaster", "ht://Dig Notification Service", \ + "string", "htnotify", "", "3.2.0b3", "Extra Output", "htnotify_webmaster: Notification Service", " \ + This provides a name for the From field, in addition to the email \ + address for the email messages sent out by htnotify. \ +" }, \ +{ "http_proxy", "", \ + "string", "htdig", "URL", "3.0", "Indexing:Connection", "http_proxy: http://proxy.bigbucks.com:3128", " \ + When this attribute is set, all HTTP document \ + retrievals will be done using the HTTP-PROXY protocol. \ + The URL specified in this attribute points to the host \ + and port where the proxy server resides.<br> \ + Later, this should be able to be overridden by the \ + <code>http_proxy</code> environement variable, but it currently cannot.\ + The use of a proxy server greatly improves performance \ + of the indexing process.<br> \ + See also \ + <a href=\"#http_proxy_authorization\">http_proxy_authorization</a> and \ + <a href=\"#http_proxy_exclude\">#http_proxy_exclude</a>. 
\ +" }, \ +{ "http_proxy_authorization", "", \ + "string", "htdig", "URL", "3.2.0b4", "Indexing:Connection", "http_proxy_authorization: myusername:mypassword", " \ + This tells htdig to send the supplied \ + <em>username</em><strong>:</strong><em>password</em> with each HTTP request, \ + when using a proxy with authorization requested. \ + The credentials will be encoded using the \"Basic\" authentication \ + scheme. There <em>must</em> be a colon (:) between the username and \ + password.<br> \ + If you use this option, be sure to protect the configuration file \ + so it is readable only by you, and do not \ + use that same configuration file for htsearch. \ +" }, \ +{ "http_proxy_exclude", "", \ + "pattern list", "htdig", "", "3.1.0b3", "Indexing:Connection", "http_proxy_exclude: http://intranet.foo.com/", " \ + When this is set, URLs matching this will not use the \ + proxy. This is useful when you have a mixture of sites \ + near to the digging server and far away. \ +" }, \ +{ "ignore_alt_text", "false", \ + "boolean", "htdig", "", "3.1.6", "Indexing:What", "ignore_alt_text: true", " \ + If set, this causes the text of the ALT field in an <IMG...> tag \ + not to be indexed as part of the text of the document, nor included in \ + excerpts. \ +" }, \ +{ "ignore_dead_servers", "true", \ + "boolean", "htdig", "", "3.1.6", "Indexing:Connection", "ignore_dead_servers: false", " \ + Determines whether htdig will continue to index URLs from a \ + server after an attempted connection to the server fails as \ + "no host found" or "host not found (port)." If \ + set to false, htdig will try <em>every</em> URL from that server. \ +" }, \ +{ "image_list", "${database_base}.images", \ + "string", "htdig", "", "all", "Extra Output", "image_list: allimages", " \ + This is the file that a list of image URLs gets written \ + to by <a href=\"htdig.html\">htdig</a> when the \ + <a href=\"#create_image_list\">create_image_list</a> is set to \ + true. 
As image URLs are seen, they are just appended to \ + this file, so after htdig finishes it is probably a \ + good idea to run <code>sort -u</code> on the file to \ + eliminate duplicates from the file. \ +" }, \ +{ "image_url_prefix", IMAGE_URL_PREFIX, \ + "string", "htsearch", "", "all", "Presentation:Text", "image_url_prefix: /images/htdig", " \ + This specifies the directory portion of the URL used \ + to display star images. This attribute isn't directly \ + used by htsearch, but is used in the default URL for \ + the <a href=\"#star_image\">star_image</a> and \ + <a href=\"#star_blank\">star_blank</a> attributes, and \ + other attributes may be defined in terms of this one. \ + <p> \ + The default value of this attribute is determined at \ + compile time. \ + </p> \ +" }, \ +{ "include", "", \ + "string", "all", "", "3.1.0", "", "include: ${config_dir}/htdig.conf", " \ + This is not quite a configuration attribute, but \ + rather a directive. It can be used within one \ + configuration file to include the definitions of \ + another file. The last definition of an attribute \ + is the one that applies, so after including a file, \ + any of its definitions can be overridden with \ + subsequent definitions. This can be useful when \ + setting up many configurations that are mostly the \ + same, so all the common attributes can be maintained \ + in a single configuration file. The include directives \ + can be nested, but watch out for nesting loops. \ +" }, \ +{ "iso_8601", "false", \ + "boolean", "htsearch htnotify", "", "3.1.0b2", "Presentation:How,Extra Output", "iso_8601: true", " \ + This sets whether dates should be output in ISO 8601 \ + format. For example, this was written on: 1998-10-31 11:28:13 EST. 
\ + See also the <a \ + href=\"#date_format\">date_format</a> attribute, which \ + can override any date format that \ + <a href=\"htsearch.html\" target=\"_top\">htsearch</a> \ + picks by default.<br> \ + This attribute also affects the format of the date \ + <a href=\"htnotify.html\">htnotify</a> expects to find \ + in a <strong>htdig-notification-date</strong> field. \ +" }, \ +{ "keywords", "", \ + "string list", "htsearch", "", "??", "Searching:Method", "keywords: documentation", " \ + Keywords which <strong>must</strong> be found on all pages returned, \ + even if the \"or\" (\"Any\") <a href=\"#method\">method</a> is \ + selected. \ +" }, \ +{ "keywords_factor", "100", \ + "number", "htsearch", "", "all", "Searching:Ranking", "keywords_factor: 12", " \ + This is a factor which will be used to multiply the \ + weight of words in the list of \ + <a href=\"#keywords_meta_tag_names\">meta keywords</a> of a document. \ + The number may be a floating point number. See also the \ + <a href=\"#heading_factor\">heading_factor</a> attribute. \ +" }, \ +{ "keywords_meta_tag_names", "keywords htdig-keywords", \ + "string list", "htdig", "", "3.0.6", "Indexing:What", "keywords_meta_tag_names: keywords description", " \ + The words in this list are used to search for keywords \ + in HTML <em>META</em> tags. This list can contain any \ + number of strings that each will be seen as the name \ + for whatever keyword convention is used.<br> \ + The <em>META</em> tags have the following format:<br> \ +<code> \ + <META name=\"<em>somename</em>\" content=\"<em>somevalue</em>\"> \ +</code> \ +" }, \ +{ "limit_normalized", "", \ + "pattern list", "htdig", "", "3.1.0b2", "Indexing:Where", "limit_normalized: http://www.mydomain.com", " \ + This specifies a set of patterns that all URLs have to \ + match against in order for them to be included in the \ + search. 
Unlike the limit_urls_to attribute, this is done \ + <strong>after</strong> the URL is normalized and the \ + <a href=\"#server_aliases\">server_aliases</a> \ + attribute is applied. This allows filtering after any \ + hostnames and DNS aliases are resolved. Otherwise, this \ + attribute is the same as the <a \ + href=\"#limit_urls_to\">limit_urls_to</a> attribute. \ +" }, \ +{ "limit_urls_to", "${start_url}", \ + "pattern list", "htdig", "", "all", "Indexing:Where", "limit_urls_to: .sdsu.edu kpbs [.*\\.html]", " \ + This specifies a set of patterns that all URLs have to \ + match against in order for them to be included in the \ + search. Any number of strings can be specified, \ + separated by spaces. If multiple patterns are given, at \ + least one of the patterns has to match the URL.<br> \ + Matching, by default, is a case-sensitive string match on the URL \ + to be used, unless the <a href=\"#case_sensitive\">case_sensitive</a> \ + attribute is false. The match will be performed <em>after</em> \ + the relative references have been converted to a valid \ + URL. This means that the URL will <em>always</em> start \ + with a transport specifier (<code>http://</code> if none is \ + specified).<br> \ + Granted, this is not the perfect way of doing this, \ + but it is simple enough and it covers most cases.<br> \ + To limit URLs in htsearch, use \ + <a href=\"#restrict\">restrict</a>. \ +" }, \ +{ "local_default_doc", "index.html", \ + "string list", "htdig", "Server", "3.0.8b2", "Indexing:Where", "local_default_doc: default.html default.htm index.html index.htm", " \ + Set this to the default documents in a directory used by the \ + server. This is used for local filesystem access, \ + using <a href=\"#local_urls\">local_urls</a>, to \ + translate URLs like http://foo.com/ into something like \ + /home/foo.com/index.html \ + (see also <a href=\"#remove_default_doc\">remove_default_doc</a>). 
\ + <br>The list should only contain names that the local server \ + recognizes as default documents for directory URLs, as defined \ + by the DirectoryIndex setting in Apache's srm.conf, for example. \ + As of version 3.1.5, this can be a string list rather than a single \ + name, and htdig will use the first name that works. Since this \ + requires a loop, setting the most common name first will improve \ + performance. Special characters can be embedded in these names \ + using %xx hex encoding. \ +" }, \ +{ "local_urls", "", \ + "string list", "htdig", "", "3.0.8b2", "Indexing:Where", "local_urls: http://www.foo.com/=/usr/www/htdocs/", " \ + Set this to tell ht://Dig to access certain URLs through \ + local filesystems. At first ht://Dig will try to access \ + pages with URLs matching the patterns through the \ + filesystems specified. If it cannot find the file, or \ + if it doesn't recognize the file name extension, it will \ + try the URL through HTTP instead. Note the example--the \ + equal sign and the final slashes in both the URL and the \ + directory path are critical. \ + <br>The fallback to HTTP can be disabled by setting the \ + <a href=\"#local_urls_only\">local_urls_only</a> attribute to true. \ + To access user directory URLs through the local filesystem, \ + set <a href=\"#local_user_urls\">local_user_urls</a>. \ + File types which need processing by the HTTP server may be \ + specified by the \ + <a href=\"#bad_local_extensions\">bad_local_extensions</a> \ + attribute. \ + As of version 3.1.5, you can provide multiple mappings of a given \ + URL to different directories, and htdig will use the first \ + mapping that works. \ + Special characters can be embedded in these names using %xx hex encoding. \ + For example, you can use %3D to embed an \"=\" sign in an URL pattern. \ + <br> \ + See also <a href=\"#local_default_doc\">local_default_doc</a>. 
\ +" }, \ +{ "local_urls_only", "false", \ + "boolean", "htdig", "", "3.1.4", "Indexing:Where", "local_urls_only: true", " \ + Set this to tell ht://Dig to access files only through the \ + local filesystem, for URLs matching the patterns in the \ + <a href=\"#local_urls\">local_urls</a> or \ + <a href=\"#local_user_urls\">local_user_urls</a> attribute. If it \ + cannot find the file, it will give up rather than trying HTTP or \ + another protocol. With this option, even <code>file://</code> urls \ + are not retrieved, except throught the local_urls mechanism.\ +" }, \ +{ "local_user_urls", "", \ + "string list", "htdig", "", "3.0.8b2", "Indexing:Where", "local_user_urls: http://www.my.org/=/home/,/www/", " \ + Set this to access user directory URLs through the local \ + filesystem. If you leave the \"path\" portion out, it will \ + look up the user's home directory in /etc/password (or NIS \ + or whatever). As with <a href=\"#local_urls\">local_urls</a>, \ + if the files are not found, ht://Dig will try with HTTP or the \ + appropriate protocol. Again, note the \ + example's format. To map http://www.my.org/~joe/foo/bar.html \ + to /home/joe/www/foo/bar.html, try the example below. \ + <br>The fallback to HTTP can be disabled by setting the \ + <a href=\"#local_urls_only\">local_urls_only</a> attribute to true. \ + As of version 3.1.5, you can provide multiple mappings of a given \ + URL to different directories, and htdig will use the first \ + mapping that works. \ + Special characters can be embedded in these names using %xx hex encoding. \ + For example, you can use %3D to embed an \"=\" sign in an URL pattern. \ +" }, \ +{ "locale", "C", \ + "string", "htdig", "", "3.0", "Indexing:What,Presentation:How", "locale: en_US", " \ + Set this to whatever locale you want your search \ + database cover. It affects the way international \ + characters are dealt with. On most systems a list of \ + legal locales can be found in /usr/lib/locale. 
Also \
+    check the <strong>setlocale(3C)</strong> man page. \
+    Note that depending on the locale you choose, and whether \
+    your system's locale implementation affects floating \
+    point input, you may need to specify the decimal point \
+    as a comma rather than a period. This will affect \
+    settings of <a href=\"#search_algorithm\">search_algorithm</a> \
+    and any of the scoring factors. \
+" }, \
+{ "logging", "false", \
+    "boolean", "htsearch", "", "3.1.0b2", "Extra Output", "logging: true", " \
+    This sets whether htsearch should use the syslog() to log \
+    search requests. If set, this will log requests with a \
+    default level of LOG_INFO and a facility of LOG_LOCAL5. For \
+    details on redirecting the log into a separate file or other \
+    actions, see the <strong>syslog.conf(5)</strong> man \
+    page. To set the level and facility used in logging, change \
+    LOG_LEVEL and LOG_FACILITY in the include/htconfig.h file \
+    before compiling. \
+    <dl> \
+    <dt> \
+    Each line logged by htsearch contains the following: \
+    </dt> \
+    <dd> \
+    REMOTE_ADDR [config] (match_method) [words] \
+    [logicalWords] (matches/matches_per_page) - \
+    page, HTTP_REFERER \
+    </dd> \
+    </dl> \
+    where any of the above are null or empty, it \
+    either puts in '-' or 'default' (for config). \
+" }, \
+{ "maintainer", "[email protected]", \
+    "string", "htdig", "Server", "all", "Indexing:Out", "maintainer: [email protected]", " \
+    This should be the email address of the person in \
+    charge of the digging operation. This string is added \
+    to the user-agent: field when the digger sends a \
+    request to a server. \
+" }, \
+{ "match_method", "and", \
+    "string", "htsearch", "", "3.0", "Searching:Method", "match_method: boolean", " \
+    This is the default method for matching that htsearch \
+    uses. 
The valid choices are: \ + <ul> \ + <li> or </li> \ + <li> and </li> \ + <li> boolean </li> \ + </ul> \ + This attribute will only be used if the HTML form that \ + calls htsearch didn't have the \ + <a href=\"hts_form.html#method\">method</a> value set. \ +" }, \ +{ "matches_per_page", "10", \ + "integer", "htsearch", "", "3.0", "Searching:Method", "matches_per_page: 999", " \ + If this is set to a relatively small number, the \ + matches will be shown in pages instead of all at once. \ + This attribute will only be used if the HTML form that \ + calls htsearch didn't have the \ + <a href=\"hts_form.html#matchesperpage\">matchesperpage</a> value set. \ +" }, \ +{ "max_connection_requests", "-1", \ + "integer", "htdig", "", "3.2.0b1", "Indexing:Connection", "max_connection_requests: 100", " \ + This attribute tells htdig to limit the number of requests it will \ + send to a server using a single, persistent HTTP connection. This \ + only applies when the \ + <a href=\"#persistent_connections\">persistent_connections</a> \ + attribute is set. You may set the limit as high as you want, \ + but it must be at least 1. A value of -1 specifies no limit. \ + Requests in the queue for a server will be combined until either \ + the limit is reached, or the queue is empty. \ +" }, \ +{ "max_description_length", "60", \ + "integer", "htdig", "", "all", "Indexing:What", "max_description_length: 40", " \ + While gathering descriptions of URLs, \ + <a href=\"htdig.html\">htdig</a> will only record \ + up to this many bytes of hyperlink descriptions for use in the \ + <a href=\"hts_templates.html#DESCRIPTION\">DESCRIPTION</a> template \ + variable. This is used mostly to deal with broken HTML. (If a \ + hyperlink is not terminated with a </a> the \ + description will go on until the end of the document.) 
\
+" }, \
+{ "max_descriptions", "5", \
+    "integer", "htdig", "", "all", "Indexing:What", "max_descriptions: 1", " \
+    While gathering <a href=\"#description_factor\">descriptions</a> of \
+    URLs for the \
+    <a href=\"hts_templates.html#DESCRIPTIONS\">DESCRIPTIONS</a> template \
+    variable, <a href=\"htdig.html\">htdig</a> will only record up to this \
+    number of descriptions, in the order in which it encounters \
+    them. This is used to prevent the database entry for a document \
+    from growing out of control if the document has a huge number \
+    of links to it. <br> \
+    Note that all descriptions are used for indexing. \
+" }, \
+{ "max_doc_size", "100000", \
+    "integer", "htdig", "URL", "3.0", "Indexing:What", "max_doc_size: 5000000", " \
+    This is the upper limit to the amount of data retrieved \
+    for documents (in bytes). This is mainly used to prevent \
+    unreasonable memory consumption since each document \
+    will be read into memory by <a href=\"htdig.html\"> \
+    htdig</a>. \
+" }, \
+{ "max_excerpts", "1", \
+    "integer", "htsearch", "URL", "3.1.6", "Presentation:How", "max_excerpts: 10", " \
+    This value determines the maximum number of excerpts \
+    that can be displayed for one matching document in the \
+    search results. \
+" }, \
+{ "max_head_length", "512", \
+    "integer", "htdig", "", "all", "Indexing:How", "max_head_length: 50000", " \
+    For each document retrieved, the top of the document is \
+    stored. This attribute determines the size of this \
+    block (in bytes). The text that will be stored is only the text; \
+    no markup is stored.<br> \
+    We found that storing 50,000 bytes will store about \
+    95% of all the documents completely. This really \
+    depends on how much storage is available and how much \
+    you want to show. Currently, this must not be 0. 
\ +" }, \ +{ "max_hop_count", "999999", \ + "integer", "htdig", "", "all", "Indexing:Where", "max_hop_count: 4", " \ + Instead of limiting the indexing process by URL \ + pattern, it can also be limited by the number of hops \ + or clicks a document is removed from the starting URL. \ + <br> \ + The starting page or pages will have hop count 0. \ +" }, \ +{ "max_keywords", "-1", \ + "integer", "htdig", "", "3.2.0b1", "Indexing:What", "max_keywords: 10", " \ + This attribute can be used to limit the number of keywords \ + per document that htdig will accept from meta keywords tags. \ + A value of -1 or less means no limit. This can help combat meta \ + keyword spamming, by limiting the amount of keywords that will be \ + indexed, but it will not completely prevent irrelevant matches \ + in a search if the first few keywords in an offending document \ + are not relevant to its contents. \ +" }, \ +{ "max_meta_description_length", "512", \ + "integer", "htdig", "", "3.1.0b1", "Indexing:How", "max_meta_description_length: 1000", " \ + While gathering descriptions from meta description tags, \ + <a href=\"htdig.html\">htdig</a> will only store up to \ + this much of the text (in bytes) for each document to fill the \ + <a href=\"hts_templates.html#METADESCRIPTION\">METADESCRIPTION</a> \ + template variable. All words in the meta description are still \ + used for indexing. \ +" }, \ +{ "max_prefix_matches", "1000", \ + "integer", "htsearch", "", "3.1.0b1", "Searching:Method", "max_prefix_matches: 100", " \ + The Prefix <a href=\"#search_algorithm\">fuzzy algorithm</a> \ + could potentially match a \ + very large number of words. This value limits the \ + number of words each prefix can match. Note \ + that this does not limit the number of documents that \ + are matched in any way. 
\
+" }, \
+{ "max_retries", "3", \
+    "integer", "htdig", "", "3.2.0b1", "Indexing:Connection", "max_retries: 6", " \
+    This option sets the maximum number of retries when retrieving a document \
+    fails (mainly for reasons of connection). \
+" }, \
+{ "max_stars", "4", \
+    "integer", "htsearch", "", "all", "Presentation:How", "max_stars: 6", " \
+    When stars are used to display the score of a match, \
+    this value determines the maximum number of stars that \
+    can be displayed. \
+" }, \
+{ "maximum_page_buttons", "${maximum_pages}", \
+    "integer", "htsearch", "", "3.2.0b3", "Presentation:How", "maximum_page_buttons: 20", " \
+    This value limits the number of page links that will be \
+    included in the page list at the bottom of the search \
+    results page. By default, it takes on the value of the \
+    <a href=\"#maximum_pages\">maximum_pages</a> \
+    attribute, but you can set it to something lower to allow \
+    more pages than buttons. In this case, pages above this \
+    number will have no corresponding button. \
+" }, \
+{ "maximum_pages", "10", \
+    "integer", "htsearch", "", "all", "Presentation:How", "maximum_pages: 20", " \
+    This value limits the number of page links that will be \
+    included in the page list at the bottom of the search \
+    results page. As of version 3.1.4, this will limit the \
+    total number of matching documents that are shown. \
+    You can make the number of page buttons smaller than the \
+    number of allowed pages by setting the \
+    <a href=\"#maximum_page_buttons\">maximum_page_buttons</a> \
+    attribute. \
+" }, \
+{ "maximum_word_length", "32", \
+    "integer", "htdig htsearch htfuzzy", "", "3.1.3", "Indexing:What", "maximum_word_length: 15", " \
+    This sets the maximum length of words that will be \
+    indexed. Words longer than this value will be silently \
+    truncated when put into the index, or searched in the \
+    index. 
\ +" }, \ +{ "md5_db", "${database_base}.md5hash.db", \ + "string", "htdig", "", "3.2.0b3", "File Layout", "md5_db: ${database_base}.md5.db", " \ + This file holds a database of md5 and date hashes of pages to \ + catch and eliminate duplicates of pages. See also the \ + <a href=\"#check_unique_md5\">check_unique_md5</a> and \ + <a href=\"#check_unique_date\">check_unique_date</a> attributes. \ +" }, \ +{ "meta_description_factor", "50", \ + "number", "htsearch", "", "3.1.0b1", "Searching:Ranking", "meta_description_factor: 20", " \ + This is a factor which will be used to multiply the \ + weight of words in any META description tags in a document. \ + The number may be a floating point number. See also the \ + <a href=\"#heading_factor\">heading_factor</a> attribute and the \ + <a href=\"#description_factor\">description_factor</a> attribute. \ +" }, \ +{ "metaphone_db", "${database_base}.metaphone.db", \ + "string", "htfuzzy htsearch", "", "all", "File Layout", "metaphone_db: ${database_base}.mp.db", " \ + The database file used for the fuzzy \"metaphone\" search \ + algorithm. This database is created by \ + <a href=\"htfuzzy.html\">htfuzzy</a> and used by \ + <a href=\"htsearch.html\" target=\"_top\">htsearch</a>. \ +" }, \ +{ "method_names", "and All or Any boolean Boolean", \ + "quoted string list", "htsearch", "", "all", "Searching:UI", "method_names: or Or and And", " \ + These values are used to create the <strong> \ + method</strong> menu. It consists of pairs. The first \ + element of each pair is one of the known methods, the \ + second element is the text that will be shown in the \ + menu for that method. This text needs to be quoted if \ + it contains spaces. \ + See the <a href=\"hts_selectors.html\">select list documentation</a> \ + for more information on how this attribute is used. 
\ +" }, \ +{ "mime_types", "${config_dir}/mime.types", \ + "string", "htdig", "", "3.2.0b1", "Indexing:Where", "mime_types: /etc/mime.types", " \ + This file is used by htdig for local file access and resolving \ + file:// URLs to ensure the files are parsable. If you are running \ + a webserver with its own MIME file, you should set this attribute \ + to point to that file. \ + <p> \ + See also <a href=\"#content_classifier\">content_classifier</a>.\ +"}, \ +{ "minimum_prefix_length", "1", \ + "integer", "htsearch", "", "3.1.0b1", "Searching:Method", "minimum_prefix_length: 2", " \ + This sets the minimum length of prefix matches used by the \ + \"prefix\" fuzzy matching algorithm. Words shorter than this \ + will not be used in prefix matching. \ +" }, \ +{ "minimum_speling_length", "5", \ + "integer", "htsearch", "", "3.2.0b1", "Searching:Method", "minimum_speling_length: 3", " \ + This sets the minimum length of words used by the \ + \"speling\" fuzzy matching algorithm. Words shorter than this \ + will not be used in this fuzzy matching. \ +" }, \ +{ "minimum_word_length", "3", \ + "integer", "htdig htsearch", "", "all", "Indexing:What", "minimum_word_length: 2", " \ + This sets the minimum length of words that will be \ + indexed. Words shorter than this value will be silently \ + ignored but still put into the excerpt.<br> \ + Note that by making this value less than 3, a lot more \ + words that are very frequent will be indexed. It might \ + be advisable to add some of these to the \ + <a href=\"#bad_word_list\">bad_words list</a>. \ +" }, \ +{ "multimatch_factor", "1", \ + "number", "htsearch", "", "3.1.6", "Searching:Ranking", "multimatch_factor: 1000", " \ + This factor gives higher rankings to documents that have more than \ + one matching search word when the <strong>or</strong> \ + <a href=\"#match_method\">match_method</a> is used. 
\ + In version 3.1.6, the matching words' combined scores were multiplied \ + by this factor for each additional matching word. Currently, this \ + multiplier is applied at most once. \ +" }, +{ "next_page_text", "[next]", \ + "string", "htsearch", "", "3.1.0", "Presentation:Text", "next_page_text: <img src=\"/htdig/buttonr.gif\">", " \ + The text displayed in the hyperlink to go to the next \ + page of matches. \ +" }, \ +{ "no_excerpt_show_top", "false", \ + "boolean", "htsearch", "", "3.1.0b3", "Presentation:How", "no_excerpt_show_top: yes", " \ + If no excerpt is available, this option will act the \ + same as <a \ + href=\"#excerpt_show_top\">excerpt_show_top</a>, that is, \ + it will show the top of the document. \ +" }, \ +{ "no_excerpt_text", "<em>(None of the search words were found in the top of this document.)</em>", \ + "string", "htsearch", "", "3.0", "Presentation:Text", "no_excerpt_text:", " \ + This text will be displayed in place of the excerpt if \ + there is no excerpt available. If this attribute is set \ + to nothing (blank), the excerpt label will not be \ + displayed in this case. \ +" }, \ +{ "no_next_page_text", "${next_page_text}", \ + "string", "htsearch", "", "3.0", "Presentation:Text", "no_next_page_text:", " \ + The text displayed where there would normally be a \ + hyperlink to go to the next page of matches. \ +" }, \ +{ "no_page_list_header", "", \ + "string", "htsearch", "", "3.0", "Presentation:Text", "no_page_list_header: <hr noshade size=2>All results on this page.<br>", " \ + This text will be used as the value of the PAGEHEADER \ + variable, for use in templates or the \ + <a href=\"#search_results_footer\">search_results_footer</a> \ + file, when all search results fit on a single page. 
\ +" }, \ +{ "no_page_number_text", "", \ + "quoted string list", "htsearch", "", "3.0", "Presentation:Text", "no_page_number_text: \ + <strong>1</strong> <strong>2</strong> \\<br> \ + <strong>3</strong> <strong>4</strong> \\<br> \ + <strong>5</strong> <strong>6</strong> \\<br> \ + <strong>7</strong> <strong>8</strong> \\<br> \ + <strong>9</strong> <strong>10</strong> \ +", " \ + The text strings in this list will be used when putting \ + together the PAGELIST variable, for use in templates or \ + the <a href=\"#search_results_footer\">search_results_footer</a> \ + file, when search results fit on more than page. The PAGELIST \ + is the list of links at the bottom of the search results page. \ + There should be as many strings in the list as there are \ + pages allowed by the <a href=\"#maximum_page_buttons\">maximum_page_buttons</a> \ + attribute. If there are not enough, or the list is empty, \ + the page numbers alone will be used as the text for the links. \ + An entry from this list is used for the current page, as the \ + current page is shown in the page list without a hypertext link, \ + while entries from the <a href=\"#page_number_text\"> \ + page_number_text</a> list are used for the links to other pages. \ + The text strings can contain HTML tags to highlight page numbers \ + or embed images. The strings need to be quoted if they contain \ + spaces. \ +" }, \ +{ "no_prev_page_text", "${prev_page_text}", \ + "string", "htsearch", "", "3.0", "Presentation:Text", "no_prev_page_text:", " \ + The text displayed where there would normally be a \ + hyperlink to go to the previous page of matches. \ +" }, \ +{ "no_title_text", "filename", \ + "string", "htsearch", "", "3.1.0", "Presentation:Text", "no_title_text: \"No Title Found\"", " \ + This specifies the text to use in search results when no \ + title is found in the document itself. If it is set to \ + filename, htsearch will use the name of the file itself, \ + enclosed in brackets (e.g. [index.html]). 
\ +" }, \ +{ "noindex_end", "<!--/htdig_noindex--> </SCRIPT>", \ + "quoted string list", "htdig", "", "3.1.0", "Indexing:What", "noindex_end: </SCRIPT>", " \ + This string marks the end of a section of an HTML file that should be \ + completely ignored when indexing. Note that text between noindex_start\ + and noindex_end isn't even counted as white space; the text \ + \"<code>foo<!--htdig_noindex-->something<!--/htdig_noindex-->bar</code>\" \ + matches the word \"foobar\", not the phrase \"foo bar\". White space \ + following noindex_end <em>is</em> counted as white space. See also \ + <a href=\"#noindex_start\">noindex_start</a>. \ +" }, \ +{ "noindex_start", "<!--htdig_noindex--> <SCRIPT", \ + "quoted string list", "htdig", "", "3.1.0", "Indexing:What", "noindex_start: <SCRIPT", " \ + These strings mark the start of a section of an HTML file that should \ + be completely ignored when indexing. They work together with \ + <a href=\"#noindex_end\">noindex_end</a>. Once a string in \ + noindex_start is found, text is ignored until the string at the \ + <em>same position</em> within <a href=\"#noindex_end\">noindex_end</a> \ + is encountered. The sections marked off this way cannot overlap. \ + As in the first default pattern, this can be SGML comment \ + declarations that can be inserted anywhere in the documents to exclude \ + different sections from being indexed. However, existing tags can also \ + be used; this is especially useful to exclude some sections from being \ + indexed where the files to be indexed can not be edited. The second \ + default pattern shows how SCRIPT sections in 'uneditable' documents \ + can be skipped; note how noindex_start does not contain an ending \ + >: this allows for all SCRIPT tags to be matched regardless of \ + attributes defined (different types or languages). \ + Note that the match for this string is case insensitive. 
\
+" }, \
+{ "nothing_found_file", "${common_dir}/nomatch.html", \
+    "string", "htsearch", "", "all", "Presentation:Files", "nothing_found_file: /www/searching/nothing.html", " \
+    This specifies the file which contains the <code> \
+    HTML</code> text to display when no matches were found. \
+    The file should contain a complete <code>HTML</code> \
+    document.<br> \
+    Note that this attribute could also be defined in \
+    terms of <a href=\"#database_base\">database_base</a> to \
+    make it specific to the current search database. \
+" }, \
+{ "nph", "false", \
+    "boolean", "htsearch", "", "3.2.0b2", "Presentation:How", "nph: true", " \
+    This attribute determines whether htsearch sends out full HTTP \
+    headers as required for an NPH (non-parsed header) CGI. Some \
+    servers assume CGIs will act in this fashion, for example MS \
+    IIS. If your server does not send out full HTTP headers, you \
+    should set this to true. \
+" }, \
+{ "page_list_header", "<hr noshade size=2>Pages:<br>", \
+    "string", "htsearch", "", "3.0", "Presentation:Text", "page_list_header:", " \
+    This text will be used as the value of the PAGEHEADER \
+    variable, for use in templates or the \
+    <a href=\"#search_results_footer\">search_results_footer</a> \
+    file, when all search results fit on more than one page. \
+" }, \
+{ "page_number_separator", "\" \"", \
+    "quoted string list", "htsearch", "", "3.1.4", "Presentation:Text", "page_number_separator: \"</td> <td>\"", " \
+    The text strings in this list will be used when putting \
+    together the PAGELIST variable, for use in templates or \
+    the <a href=\"#search_results_footer\">search_results_footer</a> \
+    file, when search results fit on more than one page. The PAGELIST \
+    is the list of links at the bottom of the search results page. 
\ + The strings in the list will be used in rotation, and will \ + separate individual entries taken from \ + <a href=\"#page_number_text\">page_number_text</a> and \ + <a href=\"#no_page_number_text\">no_page_number_text</a>. \ + There can be as many or as few strings in the list as you like. \ + If there are not enough for the number of pages listed, it goes \ + back to the start of the list. If the list is empty, a space is \ + used. The text strings can contain HTML tags. The strings need \ + to be quoted if they contain spaces, or to specify an empty string. \ +" }, \ +{ "page_number_text", "", \ + "quoted string list", "htsearch", "", "3.0", "Presentation:Text", "page_number_text: \ + <em>1</em> <em>2</em> \\<br> \ + <em>3</em> <em>4</em> \\<br> \ + <em>5</em> <em>6</em> \\<br> \ + <em>7</em> <em>8</em> \\<br> \ + <em>9</em> <em>10</em> \ +", " \ + The text strings in this list will be used when putting \ + together the PAGELIST variable, for use in templates or \ + the <a href=\"#search_results_footer\">search_results_footer</a> \ + file, when search results fit on more than page. The PAGELIST \ + is the list of links at the bottom of the search results page. \ + There should be as many strings in the list as there are \ + pages allowed by the <a href=\"#maximum_page_buttons\">maximum_page_buttons</a> \ + attribute. If there are not enough, or the list is empty, \ + the page numbers alone will be used as the text for the links. \ + Entries from this list are used for the links to other pages, \ + while an entry from the <a href=\"#no_page_number_text\"> \ + no_page_number_text</a> list is used for the current page, as the \ + current page is shown in the page list without a hypertext link. \ + The text strings can contain HTML tags to highlight page numbers \ + or embed images. The strings need to be quoted if they contain \ + spaces. 
\ +" }, \ +{ "persistent_connections", "true", \ + "boolean", "htdig", "Server", "3.2.0b1", "Indexing:Connection", "persistent_connections: false", " \ + If set to true, when servers make it possible, htdig can take advantage \ + of persistent connections, as defined by HTTP/1.1 (<em>RFC2616</em>). This permits \ + to reduce the number of open/close operations of connections, when retrieving \ + a document with HTTP. \ +" }, \ +{ "plural_suffix", "s", \ + "string", "htsearch", "", "3.2.0b2", "Presentation: Text", "plural_suffix: en", " \ + Specifies the value of the PLURAL_MATCHES template \ + variable used in the header, footer and template files. \ + This can be used for localization for non-English languages \ + where 's' is not the appropriate suffix. \ +" }, \ +{ "prefix_match_character", "*", \ + "string", "htsearch", "", "3.1.0b1", "Searching:Method", "prefix_match_character: ing", " \ + A null prefix character means that prefix matching should be \ + applied to every search word. Otherwise prefix matching is \ + done on any search word ending with the characters specified \ + in this string, with the string being stripped off before \ + looking for matches. The \"prefix\" algorithm must be enabled \ + in <a href=\"#search_algorithm\">search_algorithm</a> \ + for this to work. You may also want to set the <a \ + href=\"#max_prefix_matches\">max_prefix_matches</a> and <a \ + href=\"#minimum_prefix_length\">minimum_prefix_length</a> attributes \ + to get it working as you want.<br> As a special case, in version \ + 3.1.6 and later, if this string is non-null and is entered alone \ + as a search word, it is taken as a wildcard that matches all \ + documents in the database. If this string is null, the wildcard \ + for this special case will be <strong>*</strong>. This wildcard \ + doesn't require the prefix algorithm to be enabled. 
\ +" }, \ +{ "prev_page_text", "[prev]", \ + "string", "htsearch", "", "3.0", "Presentation:Text", "prev_page_text: <img src=\"/htdig/buttonl.gif\">", " \ + The text displayed in the hyperlink to go to the \ + previous page of matches. \ +" }, \ +{ "regex_max_words", "25", \ + "integer", "htsearch", "", "3.2.0b1", "Searching:Method", "regex_max_words: 10", " \ + The \"regex\" <a href=\"#search_algorithm\">fuzzy algorithm</a> \ + could potentially match a \ + very large number of words. This value limits the \ + number of words each regular expression can match. Note \ + that this does not limit the number of documents that \ + are matched in any way. \ +" }, \ +{ "remove_bad_urls", "true", \ + "boolean", "htpurge", "Server", "all", "Indexing:How", "remove_bad_urls: true", " \ + If TRUE, htpurge will remove any URLs which were marked \ + as unreachable by htdig from the database. If FALSE, it \ + will not do this. When htdig is run in initial mode, \ + documents which were referred to but could not be \ + accessed should probably be removed, and hence this \ + option should then be set to TRUE, however, if htdig is \ + run to update the database, this may cause documents on \ + a server which is temporarily unavailable to be \ + removed. This is probably NOT what was intended, so \ + hence this option should be set to FALSE in that case. \ +" }, \ +{ "remove_default_doc", "index.html", \ + "string list", "htdig", "", "3.1.0", "Indexing:How", "remove_default_doc: default.html default.htm index.html index.htm", " \ + Set this to the default documents in a directory used by the \ + servers you are indexing. These document names will be stripped \ + off of URLs when they are normalized, if one of these names appears \ + after the final slash, to translate URLs like \ + http://foo.com/index.html into http://foo.com/<br> \ + Note that you can disable stripping of these names during \ + normalization by setting the list to an empty string. 
\ + The list should only contain names that all servers you index \ + recognize as default documents for directory URLs, as defined \ + by the DirectoryIndex setting in Apache's srm.conf, for example. \ + This does not apply to file:/// or ftp:// URLS. \ + <br>See also <a href=\"#local_default_doc\">local_default_doc</a>. \ +" }, \ +{ "remove_unretrieved_urls", "false", \ + "boolean", "htpurge", "Server", "3.2.0b1", "Indexing:How", "remove_unretrieved_urls: true", " \ + If TRUE, htpurge will remove any URLs which were discovered \ + and included as stubs in the database but not yet retrieved. If FALSE, it \ + will not do this. When htdig is run in initial mode with no restrictions \ + on hopcount or maximum documents, these should probably be removed and set \ + to true. However, if you are hoping to index a small set of documents and \ + eventually get to the rest, you should probably leave this as false. \ +" }, \ +{ "restrict", "", \ + "pattern list", "htsearch", "", "3.2.0b4", "Searching:Method", "restrict: http://www.acme.com/widgets/", " \ + This specifies a set of patterns that all URLs have to \ + match against in order for them to be included in the search \ + results. Any number of strings can be specified, separated by \ + spaces. If multiple patterns are given, at least one of the \ + patterns has to match the URL. The list can be specified \ + from within the configuration file, and can be overridden \ + with the \"restrict\" input parameter in the search form. Note \ + that the restrict list does not take precedence over the \ + <a href=\"#exclude\">exclude</a> list - if a URL matches patterns \ + in both lists it is still excluded from the search results. \ + <br>To restrict URLs in htdig, use \ + <a href=\"#limit_urls_to\">limit_urls_to</a>. \ +" }, \ +{ "robotstxt_name", "htdig", \ + "string", "htdig", "Server", "3.0.7", "Indexing:Out", "robotstxt_name: myhtdig", " \ + Sets the name that htdig will look for when parsing \ + robots.txt files. 
This can be used to make htdig appear \ + as a different spider than ht://Dig. Useful to \ + distinguish between a private and a global index. \ +" }, \ +{ "script_name", "", \ + "string", "htsearch", "", "3.1.4", "Presentation:Text", "script_name: /search/results.shtml", " \ + Overrides the value of the SCRIPT_NAME \ + environment attribute. This is useful if \ + htsearch is not being called directly as a CGI \ + program, but indirectly from within a dynamic \ + .shtml page using SSI directives. Previously, \ + you needed a wrapper script to do this, but \ + this configuration attribute makes wrapper \ + scripts obsolete for SSI and possibly for \ + other server scripting languages, as \ + well. (You still need a wrapper script when \ + using PHP, though.)<br> \ + Check out the <code>contrib/scriptname</code> \ + directory for a small example. Note that this \ + attribute also affects the value of the <a \ + href=\"hts_templates.html#CGI\">CGI</a> variable \ + used in htsearch templates. \ +" }, \ +{ "search_algorithm", "exact:1", \ + "string list", "htsearch", "", "all", "Searching:Method", "search_algorithm: exact:1 soundex:0.3", " \ + Specifies the search algorithms and their weight to use \ + when searching. Each entry in the list consists of the \ + algorithm name, followed by a colon (:) followed by a \ + weight multiplier. The multiplier is a floating point \ + number between 0 and 1. Note that depending on your \ + <a href=\"#locale\">locale</a> setting, and whether your \ + system's locale implementation affects floating point \ + input, you may need to specify the decimal point as a \ + comma rather than a period.<br> \ + <strong>Note:</strong>If the exact \ + method is not listed, the search may not work since the \ + original terms will not be used.<br> \ + Current algorithms supported are: \ + <dl> \ + <dt> \ + exact \ + </dt> \ + <dd> \ + The default exact word matching algorithm. This \ + will find only exactly matched words. 
\ + </dd> \ + <dt> \ + soundex \ + </dt> \ + <dd> \ + Uses a slightly modified <a href=\"http://www.sog.org.uk/cig/vol6/605tdrake.pdf\">soundex</a> algorithm to match \ + words. This requires that the soundex database be \ + present. It is generated with the \ + <a href=\"htfuzzy.html\">htfuzzy</a> program. \ + </dd> \ + <dt> \ + metaphone \ + </dt> \ + <dd> \ + Uses the metaphone algorithm for matching words. \ + This algorithm is more specific to the english \ + language than soundex. It requires the metaphone \ + database, which is generated with the <a \ + href=\"htfuzzy.html\">htfuzzy</a> program. \ + </dd> \ + <dt> \ + accents \ + </dt> \ + <dd> \ + Uses the accents algorithm for matching words. \ + This algorithm will treat all accented letters \ + as equivalent to their unaccented counterparts. \ + It requires the accents database, which is \ + generated with the <a \ + href=\"htfuzzy.html\">htfuzzy</a> program. \ + </dd> \ + <dt> \ + endings \ + </dt> \ + <dd> \ + This algorithm uses language specific word endings \ + to find matches. Each word is first reduced to its \ + word root and then all known legal endings are used \ + for the matching. This algorithm uses two databases \ + which are generated with <a href=\"htfuzzy.html\"> \ + htfuzzy</a>. \ + </dd> \ + <dt> \ + synonyms \ + </dt> \ + <dd> \ + Performs a dictionary lookup on all the words. This \ + algorithm uses a database generated with the <a \ + href=\"htfuzzy.html\">htfuzzy</a> program. \ + </dd> \ + <dt> \ + substring \ + </dt> \ + <dd> \ + Matches all words containing the queries as \ + substrings. Since this requires checking every word in \ + the database, this can really slow down searches \ + considerably. \ + <dd> \ + <dt> \ + prefix \ + </dt> \ + <dd> \ + Matches all words beginning with the query \ + strings. Uses the option <a \ + href=\"#prefix_match_character\">prefix_match_character</a> \ + to decide whether a query requires prefix \ + matching. 
For example \"abc*\" would perform prefix \ + matching on \"abc\" since * is the default \ + prefix_match_character. \ + </dd> \ + <dt> \ + regex \ + </dt> \ + <dd> \ + Matches all words that match the patterns given as regular \ + expressions. Since this requires checking every word in \ + the database, this can really slow down searches \ + considerably. The config file used for searching \ + must include the regex meta-characters (^$\\[-]|.*) \ + included in <a href=\"#extra_word_characters\">extra_word_characters</a>, \ + while the config file used for digging should not.\ + <dd> \ + <dt> \ + speling \ + </dt> \ + <dd> \ + A simple fuzzy algorithm that tries to find one-off spelling \ + mistakes, such as transposition of two letters or an extra character. \ + Since this usually generates just a few possibilities, it is \ + relatively quick. \ + <dd> \ + </dl> \ +" }, \ +{ "search_results_contenttype", "text/html", \ + "string", "htsearch", "", "all", "Presentation:Files", "search_results_contenttype: text/xml", " \ + This specifies a Content-type to be output as an HTTP header \ + at the start of search results. If set to an empty string, \ + the Content-type header will be omitted altogether. \ +" }, +{ "search_results_footer", "${common_dir}/footer.html", \ + "string", "htsearch", "", "all", "Presentation:Files", "search_results_footer: /usr/local/etc/ht/end-stuff.html", " \ + This specifies a filename to be output at the end of \ + search results. While outputting the footer, some \ + variables will be expanded. Variables use the same \ + syntax as the Bourne shell. If there is a variable VAR, \ + the following will all be recognized: \ + <ul> \ + <li> \ + $VAR \ + </li> \ + <li> \ + $(VAR) \ + </li> \ + <li> \ + ${VAR} \ + </li> \ + </ul> \ + The following variables are available. See \ + <a href=\"hts_template.html\">hts_template.html</a> for a complete \ + list. 
\ + <dl> \ + <dt> \ + MATCHES \ + </dt> \ + <dd> \ + The number of documents that were matched. \ + </dd> \ + <dt> \ + PLURAL_MATCHES \ + </dt> \ + <dd> \ + If MATCHES is not 1, this will be the string \"s\", \ + else it is an empty string. This can be used to say \ + something like \"$(MATCHES) \ + document$(PLURAL_MATCHES) were found\" \ + </dd> \ + <dt> \ + MAX_STARS \ + </dt> \ + <dd> \ + The value of the <a href=\"#max_stars\">max_stars</a> \ + attribute. \ + </dd> \ + <dt> \ + LOGICAL_WORDS \ + </dt> \ + <dd> \ + A string of the search words with either \"and\" or \ + \"or\" between the words, depending on the type of \ + search. \ + </dd> \ + <dt> \ + WORDS \ + </dt> \ + <dd> \ + A string of the search words with spaces in \ + between. \ + </dd> \ + <dt> \ + PAGEHEADER \ + </dt> \ + <dd> \ + This expands to either the value of the \ + <a href=\"#page_list_header\">page_list_header</a> or \ + <a href=\"#no_page_list_header\">no_page_list_header</a> \ + attribute depending on how many pages there are. \ + </dd> \ + </dl> \ + Note that this file will <strong>NOT</strong> be output \ + if no matches were found. In this case the \ + <a href=\"#nothing_found_file\">nothing_found_file</a> \ + attribute is used instead. \ + Also, this file will not be output if it is \ + overridden by defining the \ + <a href=\"#search_results_wrapper\">search_results_wrapper</a> \ + attribute. \ +" }, \ +{ "search_results_header", "${common_dir}/header.html", \ + "string", "htsearch", "", "all", "Presentation:Files", "search_results_header: /usr/local/etc/ht/start-stuff.html", " \ + This specifies a filename to be output at the start of \ + search results. While outputting the header, some \ + variables will be expanded. Variables use the same \ + syntax as the Bourne shell. 
If there is a variable VAR, \ + the following will all be recognized: \ + <ul> \ + <li> \ + $VAR \ + </li> \ + <li> \ + $(VAR) \ + </li> \ + <li> \ + ${VAR} \ + </li> \ + </ul> \ + The following variables are available. See \ + <a href=\"hts_template.html\">hts_template.html</a> for a complete \ + list. \ + <!-- Do these need to be listed for both _footer and _header? --> \ + <dl> \ + <dt> \ + MATCHES \ + </dt> \ + <dd> \ + The number of documents that were matched. \ + </dd> \ + <dt> \ + PLURAL_MATCHES \ + </dt> \ + <dd> \ + If MATCHES is not 1, this will be the string \"s\", \ + else it is an empty string. This can be used to say \ + something like \"$(MATCHES) \ + document$(PLURAL_MATCHES) were found\" \ + </dd> \ + <dt> \ + MAX_STARS \ + </dt> \ + <dd> \ + The value of the <a href=\"#max_stars\">max_stars</a> \ + attribute. \ + </dd> \ + <dt> \ + LOGICAL_WORDS \ + </dt> \ + <dd> \ + A string of the search words with either \"and\" or \ + \"or\" between the words, depending on the type of \ + search. \ + </dd> \ + <dt> \ + WORDS \ + </dt> \ + <dd> \ + A string of the search words with spaces in \ + between. \ + </dd> \ + </dl> \ + Note that this file will <strong>NOT</strong> be output \ + if no matches were found. In this case the \ + <a href=\"#nothing_found_file\">nothing_found_file</a> \ + attribute is used instead. \ + Also, this file will not be output if it is \ + overridden by defining the \ + <a href=\"#search_results_wrapper\">search_results_wrapper</a> \ + attribute. \ +" }, \ +{ "search_results_order", "", \ + "string list", "htsearch", "", "3.2.0b2", "Searching:Ranking", "search_results_order: \ + /docs/|faq.html * /maillist/ /testresults/", " \ + This specifies a list of patterns for URLs in \ + search results. Results will be displayed in the \ + specified order, with the search algorithm result \ + as the second order. Remaining areas, that do not \ + match any of the specified patterns, can be placed \ + by using * as the pattern. 
If no * is specified, \ + one will be implicitly placed at the end of the \ + list.<br> \ + See also <a href=\"#url_seed_score\">url_seed_score</a>. \ +" }, \ +{ "search_results_wrapper", "", \ + "string", "htsearch", "", "3.1.0", "Presentation:Files", "search_results_wrapper: ${common_dir}/wrapper.html", " \ + This specifies a filename to be output at the start and \ + end of search results. This file replaces the \ + <a href=\"#search_results_header\">search_results_header</a> and \ + <a href=\"#search_results_footer\">search_results_footer</a> \ + files, with the contents of both in one file, and uses the \ + pseudo-variable <strong>$(HTSEARCH_RESULTS)</strong> as a \ + separator for the header and footer sections. \ + If the filename is not specified, the file is unreadable, \ + or the pseudo-variable above is not found, htsearch reverts \ + to the separate header and footer files instead. \ + While outputting the wrapper, \ + some variables will be expanded, just as for the \ + <a href=\"#search_results_header\">search_results_header</a> and \ + <a href=\"#search_results_footer\">search_results_footer</a> \ + files.<br> \ + Note that this file will <strong>NOT</strong> be output \ + if no matches were found. In this case the \ + <a href=\"#nothing_found_file\">nothing_found_file</a> \ + attribute is used instead. \ +" }, \ +{ "search_rewrite_rules", "", + "string list", "htsearch", "", "3.1.6", "URLs", "search_rewrite_rules: http://(.*)\\\\.mydomain\\\\.org/([^/]*) http://\\\\2.\\\\1.com \\<br> \ + http://www\\\\.myschool\\\\.edu/myorgs/([^/]*) http://\\\\1.org", " \ + This is a list of pairs, <em>regex</em> <em>replacement</em>, used \ + to rewrite URLs in the search results. The left hand string is a \ + regular expression; the right hand string is a literal string with \ + embedded placeholders for fragments that matched inside brackets in \ + the regular expression. \\0 is the whole matched string, \\1 to \\9 \ + are bracketted substrings. 
The backslash must be doubled-up in the \ + attribute setting to get past the variable expansion parsing. Rewrite \ + rules are applied sequentially to each URL before it is displayed \ + or checked against the <a href=\"#restrict\">restrict</a> or \ + <a href=\"#exclude\">exclude</a> lists. Rewriting does not stop once a \ + match has been made, so multiple rules may affect a given URL. See \ + also <a href=\"#url_part_aliases\">url_part_aliases</a> which allows \ + URLs to be of one form during indexing and translated for results, \ + and <a href=\"#url_rewrite_rules\">url_rewrite_rules</a> which allows \ + URLs to be rewritten while indexing. \ +" }, +{ "server_aliases", "", \ + "string list", "htdig", "", "3.1.0b2", "Indexing:Where", "server_aliases: \ + foo.mydomain.com:80=www.mydomain.com:80 \\<br> \ + bar.mydomain.com:80=www.mydomain.com:80 \ +", " \ + This attribute tells the indexer that servers have several \ + DNS aliases, which all point to the same machine and are NOT \ + virtual hosts. This allows you to ensure pages are indexed \ + only once on a given machine, despite the alias used in a URL. \ + As shown in the example, the mapping goes from left to right, \ + so the server name on the right hand side is the one that is \ + used. As of version 3.1.3, the port number is optional, and is \ + assumed to be 80 if omitted. There is no easy way to map all \ + ports from one alias to another without listing them all. \ +" }, \ +{ "server_max_docs", "-1", \ + "integer", "htdig", "Server", "3.1.0b3", "Indexing:Where", "server_max_docs: 50", " \ + This attribute tells htdig to limit the dig to retrieve a maximum \ + number of documents from each server. This can cause \ + unusual behavior on update digs since the old URLs are \ + stored alphabetically. Therefore, update digs will add \ + additional URLs in pseudo-alphabetical order, up to the \ + limit of the attribute. 
However, it is most useful to \ + partially index a server as the URLs of additional \ + documents are entered into the database, marked as never \ + retrieved.<br> \ + A value of -1 specifies no limit. \ +" }, \ +{ "server_wait_time", "0", \ + "integer", "htdig", "Server", "3.1.0b3", "Indexing:Connection", "server_wait_time: 20", " \ + This attribute tells htdig to ensure a server has had a \ + delay (in seconds) from the beginning of the last \ + connection. This can be used to prevent \"server abuse\" \ + by digging without delay. It's recommended to set this \ + to 10-30 (seconds) when indexing servers that you don't \ + monitor yourself. Additionally, this attribute can slow \ + down local indexing if set, which may or may not be what \ + you intended. \ +" }, \ +{ "sort", "score", \ + "string", "htsearch", "", "3.1.0", "Presentation:How", "sort: revtime", " \ + This is the default sorting method that htsearch \ + uses to determine the order in which matches are displayed. \ + The valid choices are: \ + <table border=\"0\"> \ + <tr> \ + <td> \ + <ul> \ + <li> score </li> \ + <li> time </li> \ + <li> title </li> \ + </ul> \ + </td> \ + <td> \ + <ul> \ + <li> revscore </li> \ + <li> revtime </li> \ + <li> revtitle </li> \ + </ul> \ + </td> \ + </tr> \ + </table> \ + This attribute will only be used if the HTML form that \ + calls htsearch didn't have the <strong>sort</strong> \ + value set. The words date and revdate can be used instead \ + of time and revtime, as both will sort by the time that \ + the document was last modified, if this information is \ + given by the server. The default is to sort by the score, \ + which ranks documents by best match. The sort methods that \ + begin with \"rev\" simply reverse the order of the \ + sort. Note that setting this to something other than \ + \"score\" will incur a slowdown in searches. 
\ +" }, \ +{ "sort_names", "score Score time Time title Title revscore 'Reverse Score' revtime 'Reverse Time' revtitle 'Reverse Title'", \ + "quoted string list", "htsearch", "", "3.1.0", "Searching:UI", "sort_names: \ + score 'Best Match' time Newest title A-Z \\<br> \ + revscore 'Worst Match' revtime Oldest revtitle Z-A \ +", " \ + These values are used to create the <strong> \ + sort</strong> menu. It consists of pairs. The first \ + element of each pair is one of the known sort methods, the \ + second element is the text that will be shown in the \ + menu for that sort method. This text needs to be quoted if \ + it contains spaces. \ + See the <a href=\"hts_selectors.html\">select list documentation</a> \ + for more information on how this attribute is used. \ +" }, \ +{ "soundex_db", "${database_base}.soundex.db", \ + "string", "htfuzzy htsearch", "", "all", "File Layout", "soundex_db: ${database_base}.snd.db", " \ + The database file used for the fuzzy \"soundex\" search \ + algorithm. This database is created by \ + <a href=\"htfuzzy.html\">htfuzzy</a> and used by \ + <a href=\"htsearch.html\" target=\"_top\">htsearch</a>. \ +" }, \ +{ "star_blank", "${image_url_prefix}/star_blank.gif", \ + "string", "htsearch", "", "all", "Presentation:Text", "star_blank: http://www.somewhere.org/icons/noelephant.gif", " \ + This specifies the URL to use to display a blank of the \ + same size as the star defined in the \ + <a href=\"#star_image\">star_image</a> attribute or in the \ + <a href=\"#star_patterns\">star_patterns</a> attribute. \ +" }, \ +{ "star_image", "${image_url_prefix}/star.gif", \ + "string", "htsearch", "", "all", "Presentation:Text", "star_image: http://www.somewhere.org/icons/elephant.gif", " \ + This specifies the URL to use to display a star. This \ + allows you to use some other icon instead of a star. 
\
	(We like the star...)<br> \
	The display of stars can be turned on or off with the \
	<em><a href=\"#use_star_image\">use_star_image</a></em> \
	attribute and the maximum number of stars that can be \
	displayed is determined by the \
	<em><a href=\"#max_stars\">max_stars</a></em> attribute.<br> \
	Even though the image can be changed, the ALT value \
	for the image will always be a '*'. \
" }, \
{ "star_patterns", "", \
	"string list", "htsearch", "", "3.0", "Presentation:How", "star_patterns: \
	http://www.sdsu.edu /sdsu.gif \\<br> \
	http://www.ucsd.edu /ucsd.gif \
", " \
	This attribute allows the star image to be changed \
	depending on the URL or the match it is used for. This \
	is mainly to make a visual distinction between matches \
	on different web sites. The star image could be \
	replaced with the logo of the company the match refers \
	to.<br> \
	It is advisable to keep all the images the same size \
	in order to line things up properly in a short result \
	listing.<br> \
	The format is simple. It is a list of pairs. The first \
	element of each pair is a pattern, the second element \
	is a URL to the image for that pattern. \
" }, \
{ "startday", "", \
	"integer", "htsearch", "", "3.1.6", "Searching:Method", "startday: 1", " \
	Day component of first date allowed as last-modified date \
	of returned documents. \
	This is most usefully specified as a \
	<a href=\"hts_form.html#startyear\">CGI argument</a>. \
	See also <a href=\"#startyear\">startyear</a>. \
" }, \
{ "start_ellipses", "<strong><code>... </code></strong>", \
	"string", "htsearch", "", "all", "Presentation:Text", "start_ellipses: ...", " \
	When excerpts are displayed in the search output, this \
	string will be prepended to the excerpt if there is \
	text before the text displayed. This is just a visual \
	reminder to the user that the excerpt is only part of \
	the complete document. 
\
" }, \
{ "start_highlight", "<strong>", \
	"string", "htsearch", "", "3.1.4", "Presentation:Text", "start_highlight: <font color=\"#FF0000\">", " \
	When excerpts are displayed in the search output, matched \
	words will be highlighted using this string and \
	<a href=\"#end_highlight\"> end_highlight</a>. \
	You should ensure that highlighting tags are balanced, \
	that is, any formatting tags that this string \
	opens should be closed by end_highlight. \
" }, \
{ "startmonth", "", \
	"integer", "htsearch", "", "3.1.6", "Searching:Method", "startmonth: 1", " \
	Month component of first date allowed as last-modified date \
	of returned documents. \
	This is most usefully specified as a \
	<a href=\"hts_form.html#startyear\">CGI argument</a>. \
	See also <a href=\"#startyear\">startyear</a>. \
" }, \
{ "start_url", "http://www.htdig.org/", \
	"string list", "htdig", "", "all", "Indexing:Where", "start_url: http://www.somewhere.org/alldata/index.html", " \
	This is the list of URLs that will be used to start a \
	dig when there was no existing database. Note that \
	multiple URLs can be given here. \
	<br>Note also that the value of <em>start_url</em> \
	will be the default value for \
	<a href=\"#limit_urls_to\">limit_urls_to</a>, so if \
	you set start_url to the URLs for specific files, \
	rather than a site or subdirectory URL, you may need \
	to set limit_urls_to to something less restrictive \
	so htdig doesn't reject links in the documents. \
" }, \
{ "startyear", "", \
	"integer", "htsearch", "", "3.1.6", "Searching:Method", "startyear: 2001", " \
	This specifies the year of the cutoff start date for \
	search results. If the start or end date are specified, \
	only results with a last modified date within this \
	range are shown. If a start or end date is specified, but startyear \
	is not, then it defaults to 1970. 
\
	See also <a href=\"#startday\">startday</a>, \
	<a href=\"#startmonth\">startmonth</a>, \
	<a href=\"#endday\">endday</a>, \
	<a href=\"#endmonth\">endmonth</a>, \
	<a href=\"#endyear\">endyear</a>. \
	These are most usefully specified as a \
	<a href=\"hts_form.html#startyear\">CGI argument</a>.<br> \
	For each component, if a negative number is given, \
	it is taken as relative to the current date. \
	Relative days can span several months or even years if desired, \
	and relative months can span several years. A startday of \
	-90 will select matching documents modified within \
	the last 90 days. \
" }, \
{ "store_phrases", "true", \
	"boolean", "htdig", "", "3.2.0b5", "Indexing:How", "store_phrases: false", " \
	Causes htdig to record all occurrences of each word in a document, \
	to allow accurate phrase searches. If this is false, only the first \
	occurrence of each word will be stored, causing many phrases to be \
	missed. Setting this false increases indexing speed by about 20%, \
	and reduces disk requirements by about 60%.\
" }, \
{ "substring_max_words", "25", \
	"integer", "htsearch", "", "3.0.8b1", "Searching:Method", "substring_max_words: 100", " \
	The Substring <a href=\"#search_algorithm\">fuzzy algorithm</a> \
	could potentially match a \
	very large number of words. This value limits the \
	number of words each substring pattern can match. Note \
	that this does not limit the number of documents that \
	are matched in any way. \
" }, \
{ "synonym_db", "${common_dir}/synonyms.db", \
	"string", "htsearch htfuzzy", "", "3.0", "File Layout", "synonym_db: ${database_base}.syn.db", " \
	Points to the database that <a href=\"htfuzzy.html\"> \
	htfuzzy</a> creates when the <strong>synonyms</strong> \
	algorithm is used.<br> \
	<a href=\"htsearch.html\" target=\"_top\">htsearch</a> \
	uses this to perform synonym dictionary lookups. 
\
" }, \
{ "synonym_dictionary", "${common_dir}/synonyms", \
	"string", "htfuzzy", "", "3.0", "File Layout", "synonym_dictionary: /usr/dict/synonyms", " \
	This points to a text file containing the synonym \
	dictionary used for the synonyms search algorithm.<br> \
	Each line of this file has at least two words. The \
	first word is the word to replace, the rest of the \
	words are synonyms for that word. \
" }, \
{ "syntax_error_file", "${common_dir}/syntax.html", \
	"string", "htsearch", "", "all", "Presentation:Files", "syntax_error_file: ${common_dir}/synerror.html", " \
	This points to the file which will be displayed if a \
	boolean expression syntax error was found. \
" }, \
{ "tcp_max_retries", "1", \
	"integer", "htdig", "Server", "3.2.0b1", "Indexing:Connection", "tcp_max_retries: 6", " \
	This option sets the maximum number of attempts when a connection \
	<A href=\"#timeout\">timeout</A>s. \
	After all these retries, the connection attempt is considered timed out. \
" }, \
{ "tcp_wait_time", "5", \
	"integer", "htdig", "Server", "3.2.0b1", "Indexing:Connection", "tcp_wait_time: 10", " \
	This attribute sets the wait time (in seconds) after a connection \
	fails and the <A href=\"#timeout\">timeout</A> is raised. \
" }, \
{ "template_map", "Long builtin-long builtin-long Short builtin-short builtin-short", \
	"quoted string list", "htsearch", "", "3.0", "Presentation:Files,Searching:UI", "template_map: \
	Short short ${common_dir}/short.html \\<br> \
	Normal normal builtin-long \\<br> \
	Detailed detail ${common_dir}/detail.html \
", " \
	This maps match template names to internal names and \
	template file names. It is a list of triplets. The \
	first element in each triplet is the name that will be \
	displayed in the FORMAT menu. 
The second element is the \
	name used internally and the third element is a \
	filename of the template to use.<br> \
	There are two predefined templates, namely <strong> \
	builtin-long</strong> and <strong> \
	builtin-short</strong>. If the filename is one of \
	those, they will be used instead.<br> \
	More information about templates can be found in the \
	<a href=\"htsearch.html\" target=\"_top\">htsearch</a> \
	documentation. The particular template is selected by the \
	<a href=\"hts_form.html#format\">format</a> cgi argument, and the \
	default is given by <a href=\"#template_name\">template_name</a> in \
	the config file. \
" }, \
{ "template_name", "builtin-long", \
	"string", "htsearch", "", "3.0", "Searching:UI,Presentation:How", "template_name: long", " \
	Specifies the default template if no \
	<a href=\"hts_form.html#format\">format</a> field is given by the \
	search form. This needs to map to the \
	<a href=\"#template_map\">template_map</a>. \
" }, \
{ "template_patterns", "", \
	"string list", "htsearch", "", "3.1.4", "Presentation:How", "template_patterns: \
	http://www.sdsu.edu ${common_dir}/sdsu.html \\<br> \
	http://www.ucsd.edu ${common_dir}/ucsd.html \
", " \
	This attribute allows the results template to be changed \
	depending on the URL or the match it is used for. This \
	is mainly to make a visual distinction between matches \
	on different web sites. The results for each site could \
	thus be shown in a style matching that site.<br> \
	The format is simply a list of pairs. 
The first \ + element of each pair is a pattern, the second element \ + is the name of the template file for that pattern.<br> \ + More information about templates can be found in the \ + <a href=\"htsearch.html\" target=\"_top\">htsearch</a> \ + documentation.<br> \ + Normally, when using this template selection method, you \ + would disable user selection of templates via the <strong>format</strong> \ + input parameter in search forms, as the two methods were not \ + really designed to interact. Templates selected by URL patterns \ + would override any user selection made in the form. If you want \ + to use the two methods together, see the notes on \ + <a href=\"hts_selectors.html#template_patterns\">combining</a> \ + them for an example of how to do this. \ +" }, \ +{ "text_factor", "1", \ + "number", "htsearch", "", "3.0", "Searching:Ranking", "text_factor: 0", " \ + This is a factor which will be used to multiply the \ + weight of words that are not in any special part of a \ + document. Setting a factor to 0 will cause normal words \ + to be ignored. The number may be a floating point \ + number. See also the <a href=\"#heading_factor\"> heading_factor</a> \ + attribute. \ +" }, \ +{ "timeout", "30", \ + "integer", "htdig", "Server", "all", "Indexing:Connection", "timeout: 42", " \ + Specifies the time the digger will wait to complete a \ + network read. This is just a safeguard against \ + unforeseen things like the all too common \ + transformation from a network to a notwork.<br> \ + The timeout is specified in seconds. \ +" }, \ +{ "title_factor", "100", \ + "number", "htsearch", "", "all", "Searching:Ranking", "title_factor: 12", " \ + This is a factor which will be used to multiply the \ + weight of words in the title of a document. Setting a \ + factor to 0 will cause words in the title to be \ + ignored. The number may be a floating point number. See \ + also the <a href=\"#heading_factor\"> \ + heading_factor</a> attribute. 
\ +" }, \ +{ "translate_latin1", "true", \ + "boolean", "htdig htsearch", "", "3.2.0b5", "Indexing:What", "translate_latin1: false", " \ + If set to false, the SGML entities for ISO-8859-1 (or \ + Latin 1) characters above &nbsp; (or &#160;) \ + will not be translated into their 8-bit equivalents. \ + This attribute should be set to false when using a \ + <a href=\"#locale\">locale</a> that doesn't use the \ + ISO-8859-1 character set, to avoid these entities \ + being mapped to inappropriate 8-bit characters, or \ + perhaps more importantly to avoid 8-bit characters from \ + your locale being mapped back to Latin 1 SGML entities \ + in search results. \ +" }, \ +{ "url_list", "${database_base}.urls", \ + "string", "htdig", "", "all", "Extra Output", "url_list: /tmp/urls", " \ + This file is only created if \ + <em><a href=\"#create_url_list\">create_url_list</a></em> is set to \ + true. It will contain a list of all URLs that were \ + seen. \ +" }, \ +{ "url_log", "${database_base}.log", \ + "string", "htdig", "", "3.1.0", "Extra Output", "url_log: /tmp/htdig.progress", " \ + If <a href=\"htdig.html\">htdig</a> is \ + interrupted, it will write out its progress to this \ + file. Note that if it has a large number of URLs to write, \ + it may take some time to exit. This can especially happen \ + when running update digs and the run is interrupted soon \ + after beginning. \ +" }, \ +{ "url_part_aliases", "", \ + "string list", "all", "", "3.1.0", "URLs", "url_part_aliases: \ + http://search.example.com/~htdig *site \\<br> \ + http://www.htdig.org/this/ *1 \\<br> \ + .html *2 \ +url_part_aliases: \ + http://www.htdig.org/ *site \\<br> \ + http://www.htdig.org/that/ *1 \\<br> \ + .htm *2 \ +", " \ + A list of translations pairs <em>from</em> and \ + <em>to</em>, used when accessing the database. 
\ + If a part of an URL matches with the \ + <em>from</em>-string of each pair, it will be \ + translated into the <em>to</em>-string just before \ + writing the URL to the database, and translated \ + back just after reading it from the database.<br> \ + This is primarily used to provide an easy way to \ + rename parts of URLs for e.g. changing \ + www.example.com/~htdig to www.htdig.org. Two \ + different configuration files for digging and \ + searching are then used, with url_part_aliases \ + having different <em>from</em> strings, but \ + identical <em>to</em>-strings.<br> \ + See also <a \ + href=\"#common_url_parts\">common_url_parts</a>.<br> \ + Strings that are normally incorrect in URLs or \ + very seldom used, should be used as \ + <em>to</em>-strings, since extra storage will be \ + used each time one is found as normal part of a \ + URL. Translations will be performed with priority \ + for the leftmost longest match. Each \ + <em>to</em>-string must be unique and not be a \ + part of any other <em>to</em>-string. It also helps \ + to keep the <em>to</em>-strings short to save space \ + in the database. Other than that, the choice of \ + <em>to</em>-strings is pretty arbitrary, as they \ + just provide a temporary, internal encoding in the \ + databases, and none of the characters in these \ + strings have any special meaning.<br> \ + Note that when this attribute is changed, the \ + database should be rebuilt, unless the effect of \ + \"moving\" the affected URLs in the database is \ + wanted, as described above.<br> \ + <strong>Please note:</strong> Don't just copy the \ + example below into a single configuration file. \ + There are two separate settings of \ + <em>url_part_aliases</em> below; the first one is \ + for the configuration file to be used by htdig, \ + htmerge, and htnotify, and the second one is for the \ + configuration file to be used by htsearch. 
\ + In this example, htdig will encode the URL \ + \"http://search.example.com/~htdig/contrib/stuff.html\" \ + as \"*sitecontrib/stuff*2\" in the databases, and \ + htsearch will decode it as \ + \"http://www.htdig.org/contrib/stuff.htm\".<br> \ + As of version 3.1.6, you can also do more complex \ + rewriting of URLs using \ + <a href=\"#url_rewrite_rules\">url_rewrite_rules</a> and \ + <a href=\"#search_rewrite_rules\">search_rewrite_rules</a>. \ +" }, \ +{ "url_rewrite_rules", "", \ + "string list", "htdig", "", "3.2.0b3", "URLs", "url_rewrite_rules: (.*)\\\\?JServSessionIdroot=.* \\\\1 \\<br> \ + (.*)\\\\&JServSessionIdroot=.* \\\\1 \\<br> \ + (.*)&context=.* \\\\1<br>", " \ + This is a list of pairs, <em>regex</em> <em>replacement</em> used to \ + permanently rewrite URLs as they are indexed. The left hand string is \ + a regular expression; the right hand string is a literal string with \ + embedded placeholders for fragments that matched inside brackets in \ + the regex. \\0 is the whole matched string, \\1 to \\9 are bracketted \ + substrings. Note that the <strong>entire</strong> URL is replaced by \ + the right hand string (not just the portion which matches the left hand\ + string). Thus, a leading and trailing (.*) should be included in the \ + pattern, with matching placeholders in the replacement string.<br> \ + Rewrite rules are applied sequentially to each \ + incoming URL before normalization occurs. Rewriting does not stop \ + once a match has been made, so multiple rules may affect a given URL. \ + See also <a href=\"#url_part_aliases\">url_part_aliases</a> which \ + allows URLs to be of one \ +form during indexing and translated for results. 
\ +"}, \ +{ "url_seed_score", "", \ + "string list", "htsearch", "", "3.2.0b2", "Searching::Ranking", "url_seed_score: \ + /mailinglist/ *.5-1e6 <br> \ + /docs/|/news/ *1.5 <br> \ + /testresults/ "*.7 -200" <br> \ + /faq-area/ *2+10000", " \ + This is a list of pairs, <em>pattern</em> \ + <em>formula</em>, used to weigh the score of \ + hits, depending on the URL of the document.<br> \ + The <em>pattern</em> part is a substring to match \ + against the URL. Pipe ('|') characters can be \ + used in the pattern to concatenate substrings for \ + web-areas that have the same formula.<br> \ + The formula describes a <em>factor</em> and a \ + <em>constant</em>, by which the hit score is \ + weighed. The <em>factor</em> part is multiplied \ + to the original score, then the <em>constant</em> \ + part is added.<br> \ + The format of the formula is the factor part: \ + "*<em>N</em>" optionally followed by comma and \ + spaces, followed by the constant part : \ + "+<em>M</em>", where the plus sign may be emitted \ + for negative numbers. Either part is optional, \ + but must come in this order.<br> \ + The numbers <em>N</em> and <em>M</em> are floating \ + point constants.<br> \ + More straightforward is to think of the format as \ + "newscore = oldscore*<em>N</em>+<em>M</em>", \ + but with the "newscore = oldscore" part left out. \ +" }, \ +{ "url_text_factor", "1", \ + "number", "htsearch", "", "??", "Searching:Ranking", "url_text_factor: 1", " \ + TO BE COMPLETED<br> \ + See also <a href=\"#heading_factor\">heading_factor</a>. \ +" }, \ +{ "use_doc_date", "false", \ + "boolean", "htdig", "", "3.2.0b1", "Indexing:How", "use_doc_date: true", " \ + If set to true, htdig will use META date tags in documents, \ + overriding the modification date returned by the server. \ + Any documents that do not have META date tags will retain \ + the last modified date returned by the server or found on \ + the local file system. 
\
	As of version 3.1.6, in addition to META date tags, htdig will also \
	recognize dc.date, dc.date.created and dc.date.modified. \
" }, \
{ "use_meta_description", "false", \
	"boolean", "htsearch", "", "3.1.0b1", "Presentation:How", "use_meta_description: true", " \
	If set to true, any META description tags will be used as \
	excerpts by htsearch. Any documents that do not have META \
	descriptions will retain their normal excerpts. \
" }, \
{ "use_star_image", "true", \
	"boolean", "htsearch", "", "all", "Presentation:How", "use_star_image: no", " \
	If set to true, the <em><a href=\"#star_image\"> \
	star_image</a></em> attribute is used to display up to \
	<em><a href=\"#max_stars\">max_stars</a></em> images for \
	each match. \
" }, \
{ "user_agent", "htdig", \
	"string", "htdig", "Server", "3.1.0b2", "Indexing:Out", "user_agent: htdig-digger", " \
	This allows customization of the user_agent: field sent when \
	the digger requests a file from a server. \
" }, \
{ "valid_extensions", "", \
	"string list", "htdig", "URL", "3.1.4", "Indexing:Where", "valid_extensions: .html .htm .shtml", " \
	This is a list of extensions on URLs which are \
	the only ones considered acceptable. This list is used to \
	supplement the MIME-types that the HTTP server provides \
	with documents. Some HTTP servers do not have a correct \
	list of MIME-types and so can advertise certain \
	documents as text while they are some binary format. \
	If the list is empty, then all extensions are acceptable, \
	provided they pass other criteria for acceptance or rejection. \
	If the list is not empty, only documents with one of the \
	extensions in the list are parsed. \
	See also <a href=\"#bad_extensions\">bad_extensions</a>. 
\ +" }, \ +{ "valid_punctuation", ".-_/!#\\$%^&'", \ + "string", "htdig htsearch", "", "all", "Indexing:What", "valid_punctuation: -'", " \ + This is the set of characters which may be deleted \ + from the document before determining what a word is. \ + This means that if a document contains something like \ + <code>half-hearted</code> the digger will see this as the three \ + words <code> half</code>, <code>hearted</code> and \ + <code>halfhearted</code>.<br> \ + These characters are also removed before keywords are passed to the \ + search engine, so a search for \"half-hearted\" works as expected.<br> \ + Note that the dollar sign ($) and backslash (\\) must be escaped by a \ + backslash in both valid_punctuation and extra_word_characters. \ + Moreover, the backslash should not be the last character on the line. \ + There is currently no way to include a back-quote (`) in \ + extra_word_characters or valid_punctuation.<br> \ + See also the \ + <a href=\"#extra_word_characters\">extra_word_characters</a> \ + and <a href=\"#allow_numbers\">allow_numbers</a> \ + attributes. \ +" }, \ +{ "version", VERSION, \ + "string", "htsearch", "", "all", "Presentation:Text", "version: 3.2.0", " \ + This specifies the value of the VERSION \ + variable which can be used in search templates. \ + The default value of this attribute is determined \ + at compile time, and will not normally be set \ + in configuration files. \ +" }, \ +{ "word_db", "${database_base}.words.db", \ + "string", "all", "", "all", "File Layout", "word_db: ${database_base}.allwords.db", " \ + This is the main word database. It is an index of all \ + the words to a list of documents that contain the \ + words. This database can grow large pretty quickly. 
\ +" }, \ +{ "word_dump", "${database_base}.worddump", \ + "string", "htdig htdump htload", "", "3.2.0b1", "File Layout", "word_dump: /tmp/words.txt", " \ + This file is basically a text version of the file \ + specified in <em><a href=\"#word_db\">word_db</a></em>. Its \ + only use is to have a human readable database of all \ + words. The file is easy to parse with tools like \ + perl or tcl. \ +" }, \ +{ "wordlist_cache_inserts", "false", \ + "boolean", "???", "", "???", "Indexing:How", "wordlist_cache_inserts: true", " \ + If true, create a cache of size wordlist_cache_size/2 for class \ + WordListOne. <em>I don't know what this is for. Does anyone?</em> \ +" }, \ +{ "wordlist_cache_size", "10000000", \ + "integer", "all", "", "3.2.0b1", "Indexing:How", "wordlist_cache_size: 40000000", " \ + Size (in bytes) of memory cache used by Berkeley DB (DB used by the indexer) \ + IMPORTANT: It makes a <strong>huge</strong> difference. The rule \ + is that the cache size should be at least 2% of the expected index size. The \ + Berkeley DB file has 1% of internal pages that <em>must</em> be cached for good \ + performances. Giving an additional 1% leaves room for caching leaf pages. \ +" }, \ +{ "wordlist_compress", "true", \ + "boolean", "all", "", "3.2.0b1", "Indexing:How", "wordlist_compress: false", " \ + Enables or disables the default compression system for the indexer. \ + This currently attempts to compress the index by a factor of 8. If the \ + Zlib library is not found on the system, the default is false. \ +" }, \ +{ "wordlist_compress_zlib", "true", \ + "boolean", "all", "", "3.2.0b4", "Indexing:How", "wordlist_compress_zlib: false", " \ + Enables or disables the zlib compression system for the indexer. 
\ + Both <a href=\"#wordlist_compress\">wordlist_compress</a> and \ + <a href=\"#compression_level\">compression_level</a> must be true \ + (non-zero) to use this option!\ +" }, \ +{ "wordlist_monitor", "false", \ + "boolean", "all", "", "3.2.0b1", "Extra Output", "wordlist_monitor: true", " \ + This enables monitoring of what's happening in the indexer. \ + It can help to detect performance/configuration problems. \ +" }, \ +{ "wordlist_monitor_period","0", \ + "number", "all", "", "3.2.0b1", "Extra Output", "wordlist_monitor_period: .1", " \ + Sets the number of seconds between each monitor output. \ +" }, \ +{ "wordlist_monitor_output","", \ + "string", "all", "", "3.2.0b1", "Extra Output", "wordlist_monitor_output: myfile", " \ + Print monitoring output on file instead of the default stderr. \ +" }, +{ "wordlist_page_size", "0", \ + "integer", "all", "", "3.2.0b1", "Indexing:How", "wordlist_page_size: 8192", " \ + Size (in bytes) of pages used by Berkeley DB (DB used by the indexer). \ + Must be a power of two. 
\ +" }, \ +{ "wordlist_verbose", "", \ + "integer", "", "", "", "", "wordlist_verbose: true", " \ + wordlist_verbose 1 walk logic<br> \ + wordlist_verbose 2 walk logic details<br> \ + wordlist_verbose 2 walk logic lots of details<br> \ +" }, \ +{ "wordlist_wordkey_description", "Word/DocID 32/Flags 8/Location 16", \ + "string", "all", "", "3.2.0b1", "Indexing:How", "**this should not be configured by user**", " \ + Internal key description: *not user configurable* \ +" }, \ +{ "wordlist_wordrecord_description", "DATA", \ + "string", "all", "", "3.2.0b1", "Indexing:How", "**this should not be configured by user**", " \ + Internal data description: *not user configurable* \ +" }, \ +{0, 0, 0, 0, 0, 0, 0, 0, 0} +}; + +HtConfiguration config; diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/defaults.h b/debian/htdig/htdig-3.2.0b6/htcommon/defaults.h new file mode 100644 index 00000000..35bf6da2 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/defaults.h @@ -0,0 +1,26 @@ +// +// defaults.h +// +// defaults: Default configuration values for the ht programs +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: defaults.h,v 1.7 2004/05/28 13:15:13 lha Exp $ +// +#ifndef _defaults_h_ +#define _defaults_h_ + +#include "HtConfiguration.h" + + +extern ConfigDefaults defaults[]; +//extern HtConfiguration config; + + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/defaults.xml b/debian/htdig/htdig-3.2.0b6/htcommon/defaults.xml new file mode 100644 index 00000000..f3fd2eb7 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/defaults.xml @@ -0,0 +1,4379 @@ +<!DOCTYPE HtdigAttributes SYSTEM "defaults.dtd" > +<HtdigAttributes> + <attribute name="accents_db" + type="string" + programs="htfuzzy htsearch" + 
version="all" + category="File Layout" > + <default>${database_base}.accents.db</default> + <example>${database_base}.uml.db</example> + <description> + The database file used for the fuzzy "accents" search + algorithm. This database is created by + <ref type="program">htfuzzy</ref> and used by + <ref type="program">htsearch</ref>. + </description> + </attribute> + + <attribute name="accept_language" + type="string_list" + programs="htdig" + version="3.2.0b4" + category="Indexing:Out" + block="Server" > + <default></default> + <example>en-us en it</example> + <description> + This attribute allows you to restrict the set of natural languages + that are preferred as a response to an HTTP request performed by the + digger. This can be done by putting one or more language tags + (as defined by RFC 1766) in the preferred order, separated by spaces. + By doing this, when the server performs a content negotiation based + on the 'accept-language' given by the HTTP user agent, a different + content can be shown depending on the value of this attribute. If + set to an empty list, no language will be sent and the server default + will be returned. + </description> + </attribute> + + <attribute name="add_anchors_to_excerpt" + type="boolean" + programs="htsearch" + version="3.1.0" + category="Presentation:How" > + <default>true</default> + <example>no</example> + <description> + If set to true, the first occurrence of each matched + word in the excerpt will be linked to the closest + anchor in the document. This only has effect if the + <strong>EXCERPT</strong> variable is used in the output + template and the excerpt is actually going to be displayed. 
+ </description>
+ </attribute>
+
+ <attribute name="allow_double_slash"
+ type="boolean"
+ programs="htdig"
+ version="3.2.0b4"
+ category="Indexing:Out" >
+ <default>false</default>
+ <example>true</example>
+ <description>
+ If set to true, strings of multiple slashes ('/') in URL paths
+ will be left intact, rather than being collapsed. This is necessary
+ for some search engine URLs which use slashes to separate fields rather
+ than to separate directory components. However, it can lead to multiple database
+ entries referring to the same file, and it causes '/foo//../' to
+ be equivalent to '/foo/', rather than to '/'.
+ </description>
+ </attribute>
+
+ <attribute name="allow_in_form"
+ type="string_list"
+ programs="htsearch"
+ version="3.1.0"
+ category="Searching:UI" >
+ <default></default>
+ <example>search_algorithm search_results_header</example>
+ <description> Allows the specified config file attributes to be specified
+ in search forms as separate fields. This could be used to
+ allow form writers to design their own headers and footers
+ and specify them in the search form. Another example would
+ be to offer a menu of search_algorithms in the form.
+ <codeblock>
+ <SELECT NAME="search_algorithm">
+ <OPTION VALUE="exact:1 prefix:0.6 synonyms:0.5 endings:0.1" SELECTED>fuzzy
+ <OPTION VALUE="exact:1">exact
+ </SELECT>
+ </codeblock>
+ The general idea behind this is to make an input parameter out
+ of any configuration attribute that's not already automatically
+ handled by an input parameter. You can even make up your own
+ configuration attribute names, for purposes of passing data from
+ the search form to the results output. You're not restricted to
+ the existing attribute names. The attributes listed in the
+ allow_in_form list will be settable in the search form using
+ input parameters of the same name, and will be propagated to
+ the follow-up search form in the results template using template
+ variables of the same name in upper-case. 
+ You can also make select lists out of any of these input + parameters, in the follow-up search form, using the + <ref type="attr">build_select_lists</ref> + configuration attribute. +</description> + </attribute> + + <attribute name="allow_numbers" + type="boolean" + programs="htdig htsearch" + version="all" + category="Indexing:What" > + <default>false</default> + <example>true</example> + <description> + If set to true, numbers are considered words. This + means that searches can be done on number as well as + regular words. All the same rules apply to numbers as + to words. See the description of + <ref type="attr">valid_punctuation</ref> for the + rules used to determine what a word is. + </description> + </attribute> + + <attribute name="allow_space_in_url" + type="boolean" + programs="htdig" + version="3.2.0b6" + category="Indexing:Where" > + <default>false</default> + <example>true</example> + <description> + If set to true, htdig will handle URLs that contain + embedded spaces. Technically, this is a violation of + <em>RFC 2396</em>, which says spaces should be stripped out + (as htdig does by default). However, many web browsers + and HTML code generators violate this standard already, + so enabling this attribute allows htdig to handle these + non-compliant URLs. Even with this attribute set, htdig + still strips out all white space (leading, trailing and + embedded), except that space characters embedded within + the URL will be encoded as %20. + </description> + </attribute> + + <attribute name="allow_virtual_hosts" + type="boolean" + programs="htdig" + version="3.0.8b2" + category="Indexing:Where" > + <default>true</default> + <example>false</example> + <description> + If set to true, htdig will index virtual web sites as + expected. If false, all URL host names will be + normalized into whatever the DNS server claims the IP + address to map to. If this option is set to false, + there is no way to index either "soft" or "hard" + virtual web sites. 
+ </description> + </attribute> + + <attribute name="anchor_target" + type="string" + programs="htdig" + version="3.1.6" + category="Presentation:How" > + <default></default> + <example>body</example> + <description> + When the first matched word in the excerpt is linked + to the closest anchor in the document, this string + can be set to specify a target in the link so the + resulting page is displayed in the desired frame. + This value will only be used if the + <ref type="attr">add_anchors_to_excerpt</ref> + attribute is set to true, the <strong>EXCERPT</strong> + variable is used in the output template and the + excerpt is actually displayed with a link. + </description> + </attribute> + + <attribute name="any_keywords" + type="boolean" + programs="htsearch" + version="3.2.0b2" + category="Searching:Method" > + <default>false</default> + <example>yes</example> + <description> + If set to true, the words in the <strong>keywords</strong> + input parameter in the search form will be joined with logical + ORs rather than ANDs, so that any of the words provided will do. + Note that this has nothing to do with limiting the search to + words in META keywords tags. See the <a href="hts_form.html"> + search form</a> documentation for details on this. + </description> + </attribute> + + <attribute name="author_factor" + type="number" + programs="htsearch" + version="??" + category="Searching:Ranking" > + <default>1</default> + <example>1</example> + <description> + TO BE COMPLETED<br/> + See also <ref type="attr">heading_factor</ref>. + </description> + </attribute> + + <attribute name="authorization" + type="string" + programs="htdig" + version="3.1.4" + category="Indexing:Out" + block="URL" > + <default></default> + <example>mypassword</example> + <description> + This tells htdig to send the supplied + <em>username</em><strong>:</strong><em>password</em> with each HTTP request. + The credentials will be encoded using the "Basic" authentication + scheme. 
There <em>must</em> be a colon (:) between the username and + password.<br/> + This attribute can also be specified on htdig's command line using + the -u option, and will be blotted out so it won't show up in a + process listing. If you use it directly in a configuration file, + be sure to protect it so it is readable only by you, and do not + use that same configuration file for htsearch. + </description> + </attribute> + + <attribute name="backlink_factor" + type="number" + programs="htsearch" + version="3.1.0" + category="Searching:Ranking" > + <default>1000</default> + <example>501.1</example> + <description> + This is a weight of "how important" a page is, based on + the number of URLs pointing to it. It's actually + multiplied by the ratio of the incoming URLs (backlinks) + and outgoing URLs (links on the page), to balance out pages + with lots of links to pages that link back to them. The ratio + gives lower weight to "link farms", which often have many + links to them. This factor can + be changed without changing the database in any way. + However, setting this value to something other than 0 + incurs a slowdown on search results. + </description> + </attribute> + + <attribute name="bad_extensions" + type="string_list" + programs="htdig" + version="all" + category="Indexing:Where" + block="URL" > + <default>.wav .gz .z .sit .au .zip .tar .hqx .exe .com .gif .jpg .jpeg .aiff .class .map .ram .tgz .bin .rpm .mpg .mov .avi .css</default> + <example>.foo .bar .bad</example> + <description> + This is a list of extensions on URLs which are + considered non-parsable. This list is used mainly to + supplement the MIME-types that the HTTP server provides + with documents. Some HTTP servers do not have a correct + list of MIME-types and so can advertise certain + documents as text while they are some binary format. + If the list is empty, then all extensions are acceptable, + provided they pass other criteria for acceptance or rejection. 
+ See also <ref type="attr">valid_extensions</ref>. + </description> + </attribute> + + <attribute name="bad_querystr" + type="pattern_list" + programs="htdig" + version="3.1.0" + category="Indexing:Where" + block="URL" > + <default></default> + <example>forum=private section=topsecret&passwd=required</example> + <description> + This is a list of CGI query strings to be excluded from + indexing. This can be used in conjunction with CGI-generated + portions of a website to control which pages are + indexed. + </description> + </attribute> + + <attribute name="bad_word_list" + type="string" + programs="htdig htsearch" + version="all" + category="Indexing:What,Searching:Method" > + <default>${common_dir}/bad_words</default> + <example>${common_dir}/badwords.txt</example> + <description> + This specifies a file which contains words which should + be excluded when digging or searching. This list should + include the most common words or other words that you + don't want to be able to search on (things like <em> + sex</em> or <em>smut</em> are examples of these.)<br/> + The file should contain one word per line. A sample + bad words file is located in the <code>contrib/examples</code> + directory. + </description> + </attribute> + + <attribute name="bin_dir" + type="string" + programs="all" + version="all" + category="File Layout" > + <default configmacro="true">BIN_DIR</default> + <example>/usr/local/bin</example> + <description> + This is the directory in which the executables + related to ht://Dig are installed. It is never used + directly by any of the programs, but other attributes + can be defined in terms of this one. + <p> + The default value of this attribute is determined at + compile time. 
+ </p>
+ </description>
+ </attribute>
+
+ <attribute name="boolean_keywords"
+ type="string list"
+ programs="htsearch"
+ version="3.1.6"
+ category="Presentation:How" >
+ <default configmacro="true">and or not</default>
+ <example>et ou non</example>
+ <description>
+ These three strings are used as the keywords used in
+ constructing the LOGICAL_WORDS template variable,
+ and in parsing the <a href="hts_form.html#words">words</a> input
+ parameter when the <a href="hts_form.html#method">method</a> parameter
+ or <ref type="attr">match_method</ref> attribute
+ is set to <code>boolean</code>.
+ See also the <ref type="attr">boolean_syntax_errors</ref> attribute.
+ </description>
+ </attribute>
+
+ <attribute name="boolean_syntax_errors"
+ type="quoted string list"
+ programs="htsearch"
+ version="3.1.6"
+ category="Presentation:How" >
+ <default configmacro="true">Expected 'a search word, a quoted phrase, a boolean expression between ()' 'at the end' 'instead of' 'end of expression' quotes</default>
+ <example> Attendait "un mot" "à la fin" "au lieu de" "fin d'expression" "points de quotation" </example>
+ <description>
+ These six strings are used as the keywords used to
+ construct various syntax error messages for errors encountered in
+ parsing the <a href="hts_form.html#words">words</a> input
+ parameter when the <a href="hts_form.html#method">method</a> parameter
+ or <ref type="attr">match_method</ref> attribute
+ is set to <code>boolean</code>.
+ They are used in conjunction with the
+ <ref type="attr">boolean_keywords</ref> attribute, and comprise all
+ English-specific parts of these error messages. The order in which
+ the strings are put together may not be ideal, or even grammatically
+ correct, for all languages, but they can be used to make fairly
+ intelligible messages in many languages. 
+ </description> + </attribute> + + <attribute name="build_select_lists" + type="quoted_string_list" + programs="htsearch" + version="3.2.0b1" + category="Searching:UI" > + <default></default> + <example>MATCH_LIST matchesperpage matches_per_page_list \ + 1 1 1 matches_per_page "Previous Amount" \ + RESTRICT_LIST,multiple restrict restrict_names 2 1 2 restrict "" \ + FORMAT_LIST,radio format template_map 3 2 1 template_name ""</example> + <description> + This list allows you to define any htsearch input parameter as + a select list for use in templates, provided you also define + the corresponding name list attribute which enumerates all the + choices to put in the list. It can be used for existing input + parameters, as well as any you define using the + <ref type="attr">allow_in_form</ref> + attribute. The entries in this list each consist of an octuple, + a set of eight strings defining the variables and how they are to + be used to build a select list. The attribute can contain many + of these octuples. The strings in the string list are merely + taken eight at a time. 
For each octuple of strings specified in + build_select_lists, the elements have the following meaning: + <ol> + <li>the name of the template variable to be defined as a list, + optionally followed by a comma and the type of list, and + optional formatting codes</li> + <li>the input parameter name that the select list will set</li> + <li>the name of the user-defined attribute containing the + name list</li> + <li>the tuple size used in the name list above</li> + <li>the index into a name list tuple for the value</li> + <li>the index for the corresponding label on the selector</li> + <li>the configuration attribute where the default value for + this input parameter is defined</li> + <li>the default label, if not an empty string, which will be + used as the label for an additional list item for the current + input parameter value if it doesn't match any value in the + given list</li> + </ol> + See the <a href="hts_selectors.html">select list documentation</a> + for more information on this attribute. + </description> + </attribute> + + <attribute name="caps_factor" + type="number" + programs="htsearch" + version="??" + category="Searching:Ranking" > + <default>1</default> + <example>1</example> + <description> + TO BE COMPLETED<br/> + See also <ref type="attr">heading_factor</ref>. + </description> + </attribute> + + <attribute name="case_sensitive" + type="boolean" + programs="htdig" + version="3.1.0b2" + category="Indexing:Where" > + <default>true</default> + <example>false</example> + <description> + This specifies whether ht://Dig should consider URLs + case-sensitive or not. If your server is case-insensitive, + you should probably set this to false. 
+ </description> + </attribute> + + <attribute name="check_unique_date" + type="boolean" + programs="htdig" + version="3.2.0b3" + category="" + block="Global" > + <default>false</default> + <example>false</example> + <description> + Include the modification date of the page in the MD5 hash, to reduce the + problem with identical but physically separate pages in different parts of the tree pointing to + different pages. + </description> + </attribute> + + <attribute name="check_unique_md5" + type="boolean" + programs="htdig" + version="3.2.0b3" + category="" + block="Global" > + <default>false</default> + <example>false</example> + <description> + Uses the MD5 hash of pages to reject aliases, prevents multiple entries + in the index caused by such things as symbolic links + Note: May not do the right thing for incremental update + </description> + </attribute> + + <attribute name="collection_names" + type="string_list" + programs="htsearch" + version="3.2.0b2" + category="" > + <default></default> + <example>htdig_docs htdig_bugs</example> + <description> + This is a list of config file names that are used for searching multiple databases. + Simply put, htsearch will loop through the databases specified by each of these config + files and present the result of the search on all of the databases. + The corresponding config files are looked up in the <ref type="attr">config_dir</ref> directory. + Each listed config file <strong>must</strong> exist, as well as the corresponding databases. + </description> + </attribute> + + <attribute name="common_dir" + type="string" + programs="all" + version="all" + category="File Layout" > + <default configmacro="true">COMMON_DIR</default> + <example>/tmp</example> + <description> + Specifies the directory for files that will or can be + shared among different search databases. The default + value for this attribute is defined at compile time. 
+ </description> + </attribute> + + <attribute name="common_url_parts" + type="string_list" + programs="all" + version="3.1.0" + category="URLs" > + <default>http:// http://www. ftp:// ftp://ftp. /pub/ .html .htm .gif .jpg .jpeg /index.html /index.htm .com/ .com mailto:</default> + <example>//www.htdig.org/ml/ \ +.html \ +http://dev.htdig.org/ \ +http://www.htdig.org/</example> + <description> + Sub-strings often found in URLs stored in the + database. These are replaced in the database by an + internal space-saving encoding. If a string + specified in <ref type="attr">url_part_aliases</ref>, + overlaps any string in common_url_parts, the + common_url_parts string is ignored.<br/> + Note that when this attribute is changed, the + database should be rebuilt, unless the effect of + "changing" the affected URLs in the database is + wanted.<br/> + </description> + </attribute> + + <attribute name="compression_level" + type="integer" + programs="htdig" + version="3.1.0" + category="Indexing:How" > + <default>0</default> + <example>6</example> + <description> + If specified and the <a + href="http://www.cdrom.com/pub/infozip/zlib/">zlib</a> + compression library was available when compiled, + this attribute controls + the amount of compression used in the <ref type="attr">doc_excerpt</ref> file. + </description> + </attribute> + + <attribute name="config" + type="string" + programs="all" + version="??" + category="File Layout" > + <default configmacro="true">DEFAULT_CONFIG_FILE</default> + <example></example> + <description> + Name of configuration file to load. + For security reasons, restrictions are placed on the values which + can be specified on the command line to + <ref type="program">htsearch</ref>. + The default value of this attribute is determined at + compile time. 
+ </description> + </attribute> + + <attribute name="config_dir" + type="string" + programs="all" + version="all" + category="File Layout" > + <default configmacro="true">CONFIG_DIR</default> + <example>/var/htdig/conf</example> + <description> + This is the directory which contains all configuration + files related to ht://Dig. It is never used + directly by any of the programs, but other attributes + or the <ref type="attr">include</ref> directive + can be defined in terms of this one. + <p> + The default value of this attribute is determined at + compile time. + </p> + </description> + </attribute> + + <attribute name="cookies_input_file" + type="string" + programs="htdig" + version="3.2.0b4" + category="Indexing:Connection" > + <default></default> + <example>${common_dir}/cookies.txt</example> + <description> + Specifies the location of the file used for importing cookies + for the crawl. These cookies will be preloaded into htdig's + in-memory cookie jar, but aren't written back to the file. + Cookies are specified according to Netscape's format + (tab-separated fields). If this attribute is left blank, + no cookie file will be read. + <p> + For more information, see the sample cookies.txt file in the + ht://Dig source distribution. + </p> + </description> + </attribute> + + <attribute name="create_image_list" + type="boolean" + programs="htdig" + version="all" + category="Extra Output" > + <default>false</default> + <example>yes</example> + <description> + If set to true, a file with all the image URLs that + were seen will be created, one URL per line. This list + will not be in any order and there will be lots of + duplicates, so after htdig has completed, it should be + piped through <code>sort -u</code> to get a unique list. 
+ </description> + </attribute> + + <attribute name="create_url_list" + type="boolean" + programs="htdig" + version="all" + category="Extra Output" > + <default>false</default> + <example>yes</example> + <description> + If set to true, a file with all the URLs that were seen + will be created, one URL per line. This list will not + be in any order and there will be lots of duplicates, + so after htdig has completed, it should be piped + through <code>sort -u</code> to get a unique list. + </description> + </attribute> + + <attribute name="database_base" + type="string" + programs="all" + version="all" + category="File Layout" > + <default>${database_dir}/db</default> + <example>${database_dir}/sales</example> + <description> + This is the common prefix for files that are specific + to a search database. Many different attributes use + this prefix to specify filenames. Several search + databases can share the same directory by just changing + this value for each of the databases. + </description> + </attribute> + + <attribute name="database_dir" + type="string" + programs="all" + version="all" + category="File Layout" > + <default configmacro="true">DATABASE_DIR</default> + <example>/var/htdig</example> + <description> + This is the directory which contains all database and + other files related to ht://Dig. It is never used + directly by any of the programs, but other attributes + are defined in terms of this one. + <p> + The default value of this attribute is determined at + compile time. + </p> + </description> + </attribute> + + <attribute name="date_factor" + type="number" + programs="htsearch" + version="3.1.0" + category="Searching:Ranking" > + <default>0</default> + <example>0.35</example> + <description> + This factor, gives higher + rankings to newer documents and lower rankings to older + documents. Before setting this factor, it's advised to + make sure your servers are returning accurate dates + (check the dates returned in the long format). 
+ Additionally, setting this to a nonzero value incurs a + small performance hit on searching. + </description> + </attribute> + + <attribute name="date_format" + type="string" + programs="htsearch" + version="3.1.2" + category="Presentation:How" > + <default></default> + <example>%Y-%m-%d</example> + <description> + This format string determines the output format for + modification dates of documents in the search results. + It is interpreted by your system's <em>strftime</em> + function. Please refer to your system's manual page + for this function, for a description of available + format codes. If this format string is empty, as it + is by default, + <ref type="program">htsearch</ref> + will pick a format itself. In this case, the <ref type="attr">iso_8601</ref> attribute can be used + to modify the appearance of the date. + </description> + </attribute> + + <attribute name="description_factor" + type="number" + programs="htsearch" + version="3.1.0b3" + category="Searching:Ranking" > + <default>150</default> + <example>350</example> + <description> + Plain old "descriptions" are the text of a link pointing + to a document. This factor gives weight to the words of + these descriptions of the document. Not surprisingly, + these can be pretty accurate summaries of a document's + content. See also <ref type="attr">heading_factor</ref> + and <ref type="attr">meta_description_factor</ref>. + </description> + </attribute> + + <attribute name="description_meta_tag_names" + type="number" + programs="htsearch" + version="3.1.6" + category="Searching:Ranking" > + <default>description</default> + <example>"description htdig-description"</example> + <description> + The words in this list are used to search for descriptions in HTML + <em>META</em> tags. This list can contain any number of strings + that each will be seen as the name for whatever description + convention is used. 
While words in any of the specified + description contents will be indexed, only the last meta tag + containing a description will be kept as the meta description + field for the document, for use in search results. The order in + which the names are specified in this configuration attribute + is irrelevant, as it is the order in which the tags appear in + the documents that matters.<br/> The <em>META</em> tags have the + following format:<br/> + <code> <META name="<em>somename</em>" + content="<em>somevalue</em>"> </code><br/> + See also <ref type="attr">meta_description_factor</ref>. + </description> + </attribute> + + <attribute name="disable_cookies" + type="boolean" + programs="htdig" + version="3.2.0b4" + category="Indexing:Connection" + block="Server" > + <default>true</default> + <example>true</example> + <description> + This option, if set to true, will disable HTTP cookies. + </description> + </attribute> + + <attribute name="doc_db" + type="string" + programs="all" + version="all" + category="File Layout" > + <default>${database_base}.docdb</default> + <example>${database_base}documents.db</example> + <description> + This file will contain a Berkeley database of documents + indexed by document number. It contains all the information + gathered for each document, except the document excerpts + which are stored in the <ref type="attr">doc_excerpt</ref> file. + </description> + </attribute> + + <attribute name="doc_excerpt" + type="string" + programs="all" + version="3.2.0b1" + category="File Layout" > + <default>${database_base}.excerpts</default> + <example>${database_base}excerpts.db</example> + <description> + This file will contain a Berkeley database of document excerpts + indexed by document number. It contains all the text + gathered for each document, so this file can become + rather large if <ref type="attr">max_head_length</ref> is set to a large value. 
+ The size can be reduced by setting the
+ <ref type="attr">compression_level</ref>,
+ if supported on your system.
+ </description>
+ </attribute>
+
+ <attribute name="doc_index"
+ type="string"
+ programs="htdig"
+ version="all"
+ category="File Layout" >
+ <default>${database_base}.docs.index</default>
+ <example>documents.index.db</example>
+ <description>
+ This file contains a mapping of document numbers to URLs and is
+ used by htdig during indexing. It is used on updates if it exists.
+ </description>
+ </attribute>
+
+ <attribute name="doc_list"
+ type="string"
+ programs="htdig htdump htload"
+ version="all"
+ category="File Layout" >
+ <default>${database_base}.docs</default>
+ <example>/tmp/documents.text</example>
+ <description>
+ This file is basically a text version of the file
+ specified in <ref type="attr">doc_db</ref>. Its
+ only use is to have a human readable database of all
+ documents. The file is easy to parse with tools like
+ perl or tcl.
+ </description>
+ </attribute>
+
+ <attribute name="endday"
+ type="integer"
+ programs="htsearch"
+ version="3.1.6"
+ category="Searching:Method" >
+ <default></default>
+ <example>31</example>
+ <description>
+ Day component of last date allowed as last-modified date
+ of returned documents.
+ This is most usefully specified as a
+ <a href="hts_form.html#startyear">CGI argument</a>.
+ See also <ref type="attr">startyear</ref>.
+ </description>
+ </attribute>
+
+ <attribute name="end_ellipses"
+ type="string"
+ programs="htsearch"
+ version="all"
+ category="Presentation:Text" >
+ <default><strong><code> ...</code></strong></default>
+ <example>...</example>
+ <description>
+ When excerpts are displayed in the search output, this
+ string will be appended to the excerpt if there is text
+ following the text displayed. This is just a visual
+ reminder to the user that the excerpt is only part of
+ the complete document. 
+ </description>
+ </attribute>
+
+ <attribute name="end_highlight"
+ type="string"
+ programs="htsearch"
+ version="3.1.4"
+ category="Presentation:Text" >
+ <default></strong></default>
+ <example></font></example>
+ <description>
+ When excerpts are displayed in the search output, matched
+ words will be highlighted using <ref type="attr">start_highlight</ref> and this string.
+ You should ensure that highlighting tags are balanced,
+ that is, this string should close any formatting
+ tag opened by start_highlight.
+ </description>
+ </attribute>
+
+ <attribute name="endings_affix_file"
+ type="string"
+ programs="htfuzzy"
+ version="all"
+ category="File Layout" >
+ <default>${common_dir}/english.aff</default>
+ <example>/var/htdig/affix_rules</example>
+ <description>
+ Specifies the location of the file which contains the
+ affix rules used to create the endings search algorithm
+ databases. Consult the documentation on
+ <ref type="program">htfuzzy</ref> for more information on the
+ format of this file.
+ </description>
+ </attribute>
+
+ <attribute name="endings_dictionary"
+ type="string"
+ programs="htfuzzy"
+ version="all"
+ category="File Layout" >
+ <default>${common_dir}/english.0</default>
+ <example>/var/htdig/dictionary</example>
+ <description>
+ Specifies the location of the file which contains the
+ dictionary used to create the endings search algorithm
+ databases. Consult the documentation on
+ <ref type="program">htfuzzy</ref> for more information on the
+ format of this file.
+ </description>
+ </attribute>
+
+ <attribute name="endings_root2word_db"
+ type="string"
+ programs="htfuzzy htsearch"
+ version="all"
+ category="File Layout" >
+ <default>${common_dir}/root2word.db</default>
+ <example>/var/htdig/r2w.db</example>
+ <description>
+ This attribute specifies the database filename to be
+ used in the 'endings' fuzzy search algorithm. The
+ database maps word roots to all legal words with that
+ root.
For more information about this and other fuzzy + search algorithms, consult the + <ref type="program">htfuzzy</ref> documentation.<br/> + Note that the default value uses the + <ref type="attr">common_dir</ref> attribute instead of the + <ref type="attr">database_dir</ref> attribute. + This is because this database can be shared with + different search databases. + </description> + </attribute> + + <attribute name="endings_word2root_db" + type="string" + programs="htfuzzy htsearch" + version="all" + category="File Layout" > + <default>${common_dir}/word2root.db</default> + <example>/var/htdig/w2r.bm</example> + <description> + This attributes specifies the database filename to be + used in the 'endings' fuzzy search algorithm. The + database maps words to their root. For more information + about this and other fuzzy search algorithms, consult + the <ref type="program">htfuzzy</ref> + documentation.<br/> + Note that the default value uses the + <ref type="attr">common_dir</ref> attribute instead of the + <ref type="attr">database_dir</ref> attribute. + This is because this database can be shared with + different search databases. + </description> + </attribute> + + <attribute name="endmonth" + type="integer" + programs="htsearch" + version="3.1.6" + category="Searching:Method" > + <default></default> + <example>12</example> + <description> + Month component of last date allowed as last-modified date + of returned docutments. + This is most usefully specified as a + <a href="hts_form.html#startyear">GCI argument</a>. + See also <ref type="attr">startyear</ref>. + </description> + </attribute> + + <attribute name="endyear" + type="integer" + programs="htsearch" + version="3.1.6" + category="Searching:Method" > + <default></default> + <example>2002</example> + <description> + Year component of last date allowed as last-modified date + of returned docutments. + This is most usefully specified as a + <a href="hts_form.html#startyear">GCI argument</a>. 
+ See also <ref type="attr">startyear</ref>. + </description> + </attribute> + + <attribute name="excerpt_length" + type="integer" + programs="htsearch" + version="all" + category="Presentation:How" > + <default>300</default> + <example>500</example> + <description> + This is the maximum number of characters the displayed + excerpt will be limited to. The first matched word will + be highlighted in the middle of the excerpt so that there is + some surrounding context.<br/> + The <ref type="attr">start_ellipses</ref> and + <ref type="attr">end_ellipses</ref> are used to + indicate that the document contains text before and + after the displayed excerpt respectively. + The <ref type="attr">start_highlight</ref> and + <ref type="attr">end_highlight</ref> are used to + specify what formatting tags are used to highlight matched words. + </description> + </attribute> + + <attribute name="excerpt_show_top" + type="boolean" + programs="htsearch" + version="all" + category="Presentation:How" > + <default>false</default> + <example>yes</example> + <description> + If set to true, the excerpt of a match will always show + the top of the matching document. If it is false (the + default), the excerpt will attempt to show the part of + the document that actually contains one of the words. + </description> + </attribute> + + <attribute name="exclude" + type="pattern_list" + programs="htsearch" + version="3.2.0b4" + category="Searching:Method" > + <default></default> + <example>myhost.com/mailarchive/</example> + <description> + If a URL contains any of the space separated patterns, it will be + discarded in the searching phase. This is used to exclude certain + URLs from search results. The list can be specified from within + the configuration file, and can be overridden with the "exclude" + input parameter in the search form. 
+ </description>
+ </attribute>
+
+ <attribute name="exclude_urls"
+ type="pattern_list"
+ programs="htdig"
+ version="all"
+ category="Indexing:Where"
+ block="URL" >
+ <default>/cgi-bin/ .cgi</default>
+ <example>students.html cgi-bin</example>
+ <description>
+ If a URL contains any of the space separated patterns,
+ it will be rejected. This is used to exclude such
+ common things as an infinite virtual web-tree
+ which starts with cgi-bin.
+ </description>
+ </attribute>
+
+ <attribute name="external_parsers"
+ type="quoted_string_list"
+ programs="htdig"
+ version="3.0.7"
+ category="External:Parsers" >
+ <default></default>
+ <example>text/html /usr/local/bin/htmlparser \
+ application/pdf /usr/local/bin/parse_doc.pl \
+ application/msword->text/plain "/usr/local/bin/mswordtotxt -w" \
+ application/x-gunzip->user-defined /usr/local/bin/ungzipper</example>
+ <description>
+ This attribute is used to specify a list of
+ content-type/parsers that are to be used to parse
+ documents that cannot be parsed by any of the internal
+ parsers. The list of external parsers is examined
+ before the builtin parsers are checked, so this can be
+ used to override the internal behavior without
+ recompiling htdig.<br/>
+ The external parsers are specified as pairs of
+ strings. The first string of each pair is the
+ content-type that the parser can handle while the
+ second string of each pair is the path to the external
+ parsing program. If quoted, it may contain parameters,
+ separated by spaces.<br/>
+ External parsing can also be done with external
+ converters, which convert one content-type to
+ another. To do this, instead of just specifying
+ a single content-type as the first string
+ of a pair, you specify two types, in the form
+ <em>type1</em><strong>-></strong><em>type2</em>,
+ as a single string with no spaces. The second
+ string will define an external converter
+ rather than an external parser, to convert
+ the first type to the second.
If the second + type is <strong>user-defined</strong>, then + it's up to the converter script to put out a + "Content-Type: <em>type</em>" header followed + by a blank line, to indicate to htdig what type it + should expect for the output, much like what a CGI + script would do. The resulting content-type must + be one that htdig can parse, either internally, + or with another external parser or converter.<br/> + Only one external parser or converter can be + specified for any given content-type. However, + an external converter for one content-type can be + chained to the internal parser for the same type, + by appending <strong>-internal</strong> to the + second type string (e.g. text/html->text/html-internal) + to perform external preprocessing on documents of + this type before internal parsing. + There are two internal parsers, for text/html and + text/plain.<p> + The parser program takes four command-line + parameters, not counting any parameters already + given in the command string:<br/> + <em>infile content-type URL configuration-file</em><br/> + </p> +<table border="1"> + <tr> + <th> + Parameter + </th> + <th> + Description + </th> + <th> + Example + </th> + </tr> + <tr> + <td valign="top"> + infile + </td> + <td> + A temporary file with the contents to be parsed. + </td> + <td> + /var/tmp/htdext.14242 + </td> + </tr> + <tr> + <td valign="top"> + content-type + </td> + <td> + The MIME-type of the contents. + </td> + <td> + text/html + </td> + </tr> + <tr> + <td valign="top"> + URL + </td> + <td> + The URL of the contents. + </td> + <td> + http://www.htdig.org/attrs.html + </td> + </tr> + <tr> + <td valign="top"> + configuration-file + </td> + <td> + The configuration-file in effect. + </td> + <td> + /etc/htdig/htdig.conf + </td> + </tr> + </table><p> + The external parser is to write information for + htdig on its standard output. 
Unless it is an + external converter, which will output a document + of a different content-type, then its output must + follow the format described here.<br/> + The output consists of records, each record terminated + with a newline. Each record is a series of (unless + expressively allowed to be empty) non-empty tab-separated + fields. The first field is a single character + that specifies the record type. The rest of the fields + are determined by the record type. + </p> +<table border="1"> + <tr> + <th> + Record type + </th> + <th> + Fields + </th> + <th> + Description + </th> + </tr> + <tr> + <th rowspan="3" valign="top"> + w + </th> + <td valign="top"> + word + </td> + <td> + A word that was found in the document. + </td> + </tr> + <tr> + <td valign="top"> + location + </td> + <td> + A number indicating the normalized location of + the word within the document. The number has to + fall in the range 0-1000 where 0 means the top of + the document. + </td> + </tr> + <tr> + <td valign="top"> + heading level + </td> + <td> + A heading level that is used to compute the + weight of the word depending on its context in + the document itself. The level is in the range of + 0-10 and are defined as follows: + <dl compact="true"> + <dt> + 0 + </dt> + <dd> + Normal text + </dd> + <dt> + 1 + </dt> + <dd> + Title text + </dd> + <dt> + 2 + </dt> + <dd> + Heading 1 text + </dd> + <dt> + 3 + </dt> + <dd> + Heading 2 text + </dd> + <dt> + 4 + </dt> + <dd> + Heading 3 text + </dd> + <dt> + 5 + </dt> + <dd> + Heading 4 text + </dd> + <dt> + 6 + </dt> + <dd> + Heading 5 text + </dd> + <dt> + 7 + </dt> + <dd> + Heading 6 text + </dd> + <dt> + 8 + </dt> + <dd> + <em>unused</em> + </dd> + <dt> + 9 + </dt> + <dd> + <em>unused</em> + </dd> + <dt> + 10 + </dt> + <dd> + Keywords + </dd> + </dl> + </td> + </tr> + <tr> + <th rowspan="2" valign="top"> + u + </th> + <td valign="top"> + document URL + </td> + <td> + A hyperlink to another document that is + referenced by the current document. 
It must be + complete and non-relative, using the URL parameter to + resolve any relative references found in the document. + </td> + </tr> + <tr> + <td valign="top"> + hyperlink description + </td> + <td> + For HTML documents, this would be the text + between the <a href...> and </a> + tags. + </td> + </tr> + <tr> + <th valign="top"> + t + </th> + <td valign="top"> + title + </td> + <td> + The title of the document + </td> + </tr> + <tr> + <th valign="top"> + h + </th> + <td valign="top"> + head + </td> + <td> + The top of the document itself. This is used to + build the excerpt. This should only contain + normal ASCII text + </td> + </tr> + <tr> + <th valign="top"> + a + </th> + <td valign="top"> + anchor + </td> + <td> + The label that identifies an anchor that can be + used as a target in an URL. This really only + makes sense for HTML documents. + </td> + </tr> + <tr> + <th valign="top"> + i + </th> + <td valign="top"> + image URL + </td> + <td> + An URL that points at an image that is part of + the document. + </td> + </tr> + <tr> + <th rowspan="3" valign="top"> + m + </th> + <td valign="top"> + http-equiv + </td> + <td> + The HTTP-EQUIV attribute of a + <a href="meta.html"><em>META</em> tag</a>. + May be empty. + </td> + </tr> + <tr> + <td valign="top"> + name + </td> + <td> + The NAME attribute of this + <a href="meta.html"><em>META</em> tag</a>. + May be empty. + </td> + </tr> + <tr> + <td valign="top"> + contents + </td> + <td> + The CONTENTS attribute of this + <a href="meta.html"><em>META</em> tag</a>. + May be empty. 
+ </td>
+ </tr>
+ </table>
+ <p><em>See also FAQ questions <ref type="faq">4.8</ref> and <ref type="faq">4.9</ref> for more
+ examples.</em></p>
+ </description>
+ </attribute>
+
+ <attribute name="external_protocols"
+ type="quoted_string_list"
+ programs="htdig"
+ version="3.2.0b1"
+ category="External:Protocols" >
+ <default></default>
+ <example>https /usr/local/bin/handler.pl \
+ ftp /usr/local/bin/ftp-handler.pl</example>
+ <description>
+ This attribute is a bit like <ref type="attr">external_parsers</ref>
+ since it specifies a list of protocols/handlers that are used to download documents
+ that cannot be retrieved using the internal methods. This enables htdig to index
+ documents with URL schemes it does not understand, or to use more advanced authentication
+ for the documents it is retrieving. This list is checked before HTTP or other methods,
+ so this can override the internal behavior without writing additional code for htdig.<br/>
+ The external protocols are specified as pairs of strings, the first being the URL scheme that
+ the script can handle while the second is the path to the script itself. If the second is
+ quoted, then additional command-line arguments may be given.<br/>
+ If the external protocol does not contain a colon (:), it is assumed
+ to have the standard format
+ "protocol://[usr[:password]@]address[:port]/path".
+ If it ends with a colon, then it is assumed to have the simpler format
+ "protocol:path". If it ends with "://" then the standard form is
+ again assumed.
<br/> + The program takes three command-line parameters, not counting any parameters already given + in the command string:<br/> + <em>protocol URL configuration-file</em><br/> + <table border="1"> + <tr> + <th> + Parameter + </th> + <th> + Description + </th> + <th> + Example + </th> + </tr> + <tr> + <td valign="top"> + protocol + </td> + <td> + The URL scheme to be used. + </td> + <td> + https + </td> + </tr> + <tr> + <td valign="top"> + URL + </td> + <td> + The URL to be retrieved. + </td> + <td> + https://www.htdig.org:8008/attrs.html + </td> + </tr> + <tr> + <td valign="top"> + configuration-file + </td> + <td> + The configuration-file in effect. + </td> + <td> + /etc/htdig/htdig.conf + </td> + </tr> + </table><p> + The external protocol script is to write information for htdig on the + standard output. The output must follow the form described here. The output + consists of a header followed by a blank line, followed by the contents of + the document. Each record in the header is terminated with a newline. + Each record is a series of (unless expressively allowed to be empty) non-empty + tab-separated fields. The first field is a single character that specifies the + record type. The rest of the fields are determined by the record type. + </p> +<table border="1"> + <tr> + <th> + Record type + </th> + <th> + Fields + </th> + <th> + Description + </th> + </tr> + <tr> + <th valign="top"> + s + </th> + <td valign="top"> + status code + </td> + <td> + An HTTP-style status code, e.g. 200, 404. 
Typical codes include:
+ <dl compact="true">
+ <dt>
+ 200
+ </dt>
+ <dd>
+ Successful retrieval
+ </dd>
+ <dt>
+ 304
+ </dt>
+ <dd>
+ Not modified (for example, if the document hasn't changed)
+ </dd>
+ <dt>
+ 301
+ </dt>
+ <dd>
+ Redirect (to another URL)
+ </dd>
+ <dt>
+ 401
+ </dt>
+ <dd>
+ Not authorized
+ </dd>
+ <dt>
+ 404
+ </dt>
+ <dd>
+ Not found
+ </dd>
+ </dl>
+ </td>
+ </tr>
+ <tr>
+ <th valign="top">
+ r
+ </th>
+ <td valign="top">
+ reason
+ </td>
+ <td>
+ A text string describing the status code, e.g. "Redirect" or "Not Found."
+ </td>
+ </tr>
+ <tr>
+ <th valign="top">
+ m
+ </th>
+ <td valign="top">
+ modification time
+ </td>
+ <td>
+ The modification time of this document. While the code is fairly flexible
+ about the time/date formats it accepts, it is recommended to use something
+ standard, like RFC1123: Sun, 06 Nov 1994 08:49:37 GMT, or ISO-8601:
+ 1994-11-06 08:49:37 GMT.
+ </td>
+ </tr>
+ <tr>
+ <th valign="top">
+ t
+ </th>
+ <td valign="top">
+ content-type
+ </td>
+ <td>
+ A valid MIME type for the document, like text/html or text/plain.
+ </td>
+ </tr>
+ <tr>
+ <th valign="top">
+ l
+ </th>
+ <td valign="top">
+ content-length
+ </td>
+ <td>
+ The length of the document on the server, which may not necessarily
+ be the length of the buffer returned.
+ </td>
+ </tr>
+ <tr>
+ <th valign="top">
+ u
+ </th>
+ <td valign="top">
+ url
+ </td>
+ <td>
+ The URL of the document, or in the case of a redirect, the URL
+ that should be indexed as a result of the redirect.
+ </td>
+ </tr>
+ </table>
+ </description>
+ </attribute>
+
+ <attribute name="extra_word_characters"
+ type="string"
+ programs="htdig htsearch"
+ version="3.1.2"
+ category="Indexing:What" >
+ <default></default>
+ <example>_</example>
+ <description>
+ These characters are considered part of a word.
+ In contrast to the characters in the + <ref type="attr">valid_punctuation</ref> + attribute, they are treated just like letter + characters.<br/> + Note that the <ref type="attr">locale</ref> attribute + is normally used to configure which characters + constitute letter characters. + </description> + </attribute> + + <attribute name="head_before_get" + type="boolean" + programs="htdig" + version="3.2.0b1" + category="Indexing:Connection" + block="Server" > + <default>false</default> + <example>true</example> + <description> + This option works only if we take advantage of persistent connections (see + persistent_connections attribute). If set to true an HTTP/1.1 <em>HEAD</em> + call is made in order to retrieve header information about a document. + If the status code and the content-type returned let the document be parsable, + then a following 'GET' call is made. + </description> + </attribute> + + <attribute name="heading_factor" + type="number" + programs="htsearch" + version="3.2.0b1" + category="Searching:Ranking" > + <default>5</default> + <example>20</example> + <description> + This is a factor which will be used to multiply the + weight of words between <h1> and </h1> + tags, as well as headings of levels <h2> through + <h6>. It is used to assign the level of importance + to headings. Setting a factor to 0 will cause words + in these headings to be ignored. The number may be a + floating point number. 
See also + <ref type="attr">author_factor</ref> + <ref type="attr">backlink_factor</ref> + <ref type="attr">caps_factor</ref> + <ref type="attr">date_factor</ref> + <ref type="attr">description_factor</ref> + <ref type="attr">keywords_factor</ref> + <ref type="attr">meta_description_factor</ref> + <ref type="attr">text_factor</ref> + <ref type="attr">title_factor</ref> + <ref type="attr">url_text_factor</ref> + </description> + </attribute> + + <attribute name="htnotify_prefix_file" + type="string" + programs="htnotify" + version="3.2.0b3" + category="Extra Output" > + <default></default> + <example>${common_dir}/notify_prefix.txt</example> + <description> + Specifies the file containing text to be inserted in each mail + message sent by htnotify before the list of expired webpages. If omitted, + nothing is inserted. + </description> + </attribute> + + <attribute name="htnotify_replyto" + type="string" + programs="htnotify" + version="3.2.0b3" + category="Extra Output" > + <default></default> + <example>[email protected]</example> + <description> + This specifies the email address that htnotify email messages + include in the Reply-to: field. + </description> + </attribute> + + <attribute name="htnotify_sender" + type="string" + programs="htnotify" + version="all" + category="Extra Output" > + <default>webmaster@www</default> + <example>[email protected]</example> + <description> + This specifies the email address that htnotify email + messages get sent out from. The address is forged using + /usr/lib/sendmail. Check htnotify/htnotify.cc for + detail on how this is done. + </description> + </attribute> + + <attribute name="htnotify_suffix_file" + type="string" + programs="htnotify" + version="3.2.0b3" + category="Extra Output" > + <default></default> + <example>${common_dir}/notify_suffix.txt</example> + <description> + Specifies the file containing text to be inserted in each mail message + sent by htnotify after the list of expired webpages. 
+ If omitted, htnotify
+ will insert a standard message.
+ </description>
+ </attribute>
+
+ <attribute name="htnotify_webmaster"
+ type="string"
+ programs="htnotify"
+ version="3.2.0b3"
+ category="Extra Output" >
+ <default>ht://Dig Notification Service</default>
+ <example>Notification Service</example>
+ <description>
+ This provides a name for the From field, in addition to the email address
+ for the email messages sent out by htnotify.
+ </description>
+ </attribute>
+
+ <attribute name="http_proxy"
+ type="string"
+ programs="htdig"
+ version="3.0"
+ category="Indexing:Connection"
+ block="URL" >
+ <default></default>
+ <example>http://proxy.bigbucks.com:3128</example>
+ <description>
+ When this attribute is set, all HTTP document
+ retrievals will be done using the HTTP-PROXY protocol.
+ The URL specified in this attribute points to the host
+ and port where the proxy server resides.<br/>
+ The use of a proxy server greatly improves performance
+ of the indexing process.
+ </description>
+ </attribute>
+
+ <attribute name="http_proxy_authorization"
+ type="string"
+ programs="htdig"
+ version="3.2.0b4"
+ category="Indexing:Connection"
+ block="URL" >
+ <default></default>
+ <example>myusername:mypassword</example>
+ <description>
+ This tells htdig to send the supplied
+ <em>username</em><strong>:</strong><em>password</em> with each HTTP request,
+ when using a proxy with authorization requested.
+ The credentials will be encoded using the "Basic" authentication
+ scheme. There <em>must</em> be a colon (:) between the username and
+ password.
+ </description>
+ </attribute>
+
+ <attribute name="http_proxy_exclude"
+ type="pattern_list"
+ programs="htdig"
+ version="3.1.0b3"
+ category="Indexing:Connection" >
+ <default></default>
+ <example>//intranet.foo.com/</example>
+ <description>
+ When this is set, URLs matching this will not use the
+ proxy. This is useful when you have a mixture of sites
+ near to the digging server and far away.
+ </description> + </attribute> + + <attribute name="ignore_alt_text" + type="boolean" + programs="htdig" + version="3.1.6" + category="Indexing:What" > + <default>false</default> + <example>true</example> + <description> + If set, this causes the text of the ALT field in an <IMG...> tag + not to be indexed as part of the text of the document, nor included in + excerpts. + </description> + </attribute> + + <attribute name="ignore_dead_servers" + type="boolean" + programs="htdig" + version="3.1.6" + category="Indexing:Connection" > + <default>true</default> + <example>false</example> + <description> + Determines whether htdig will continue to index URLs from a + server after an attempted connection to the server fails as + "no host found" or "host not found (port)." If + set to false, htdig will try <em>every</em> URL from that server. + </description> + </attribute> + + <attribute name="image_list" + type="string" + programs="htdig" + version="all" + category="Extra Output" > + <default>${database_base}.images</default> + <example>allimages</example> + <description> + This is the file that a list of image URLs gets written + to by <ref type="program">htdig</ref> when the + <ref type="attr">create_image_list</ref> is set to + true. As image URLs are seen, they are just appended to + this file, so after htdig finishes it is probably a + good idea to run <code>sort -u</code> on the file to + eliminate duplicates from the file. + </description> + </attribute> + + <attribute name="image_url_prefix" + type="string" + programs="htsearch" + version="all" + category="Presentation:Text" > + <default configmacro="true">IMAGE_URL_PREFIX</default> + <example>/images/htdig</example> + <description> + This specifies the directory portion of the URL used + to display star images. 
This attribute isn't directly + used by htsearch, but is used in the default URL for + the <ref type="attr">star_image</ref> and + <ref type="attr">star_blank</ref> attributes, and + other attributes may be defined in terms of this one. + <p> + The default value of this attribute is determined at + compile time. + </p> + </description> + </attribute> + + <attribute name="include" + type="string" + programs="all" + version="3.1.0" + category="" > + <default></default> + <example>${config_dir}/htdig.conf</example> + <description> + This is not quite a configuration attribute, but + rather a directive. It can be used within one + configuration file to include the definitions of + another file. The last definition of an attribute + is the one that applies, so after including a file, + any of its definitions can be overridden with + subsequent definitions. This can be useful when + setting up many configurations that are mostly the + same, so all the common attributes can be maintained + in a single configuration file. The include directives + can be nested, but watch out for nesting loops. + </description> + </attribute> + + <attribute name="iso_8601" + type="boolean" + programs="htsearch htnotify" + version="3.1.0b2" + category="Presentation:How,Extra Output" > + <default>false</default> + <example>true</example> + <description> + This sets whether dates should be output in ISO 8601 + format. For example, this was written on: 1998-10-31 11:28:13 EST. + See also the <ref type="attr">date_format</ref> attribute, which + can override any date format that + <ref type="program">htsearch</ref> + picks by default.<br/> + This attribute also affects the format of the date + <ref type="program">htnotify</ref> expects to find + in a <strong>htdig-notification-date</strong> field. + </description> + </attribute> + + <attribute name="keywords" + type="string_list" + programs="htsearch" + version="??" 
+ category="Searching:Method" >
+ <default></default>
+ <example>documentation</example>
+ <description>
+ Keywords which <strong>must</strong> be found on all pages returned,
+ even if the "or" ("Any") <ref type="attr">method</ref> is
+ selected.
+ </description>
+ </attribute>
+
+ <attribute name="keywords_factor"
+ type="number"
+ programs="htsearch"
+ version="all"
+ category="Searching:Ranking" >
+ <default>100</default>
+ <example>12</example>
+ <description>
+ This is a factor which will be used to multiply the
+ weight of words in the list of keywords of a document.
+ The number may be a floating point number. See also the
+ <ref type="attr">heading_factor</ref> attribute.
+ </description>
+ </attribute>
+
+ <attribute name="keywords_meta_tag_names"
+ type="string_list"
+ programs="htdig"
+ version="3.0.6"
+ category="Indexing:What" >
+ <default>keywords htdig-keywords</default>
+ <example>keywords description</example>
+ <description> The words in this list are used to search for keywords
+ in HTML <em>META</em> tags. This list can contain any
+ number of strings that each will be seen as the name
+ for whatever keyword convention is used.<br/>
+ The <em>META</em> tags have the following format:
+ <codeblock>
+ <META name="<em>somename</em>" content="<em>somevalue</em>">
+ </codeblock>
+</description>
+ </attribute>
+
+ <attribute name="limit_normalized"
+ type="pattern_list"
+ programs="htdig"
+ version="3.1.0b2"
+ category="Indexing:Where" >
+ <default></default>
+ <example>//www.mydomain.com</example>
+ <description>
+ This specifies a set of patterns that all URLs have to
+ match against in order for them to be included in the
+ search. Unlike the limit_urls_to attribute, this is done
+ <strong>after</strong> the URL is normalized and the
+ <ref type="attr">server_aliases</ref>
+ attribute is applied. This allows filtering after any
+ hostnames and DNS aliases are resolved.
Otherwise, this + attribute is the same as the <ref type="attr">limit_urls_to</ref> attribute. + </description> + </attribute> + + <attribute name="limit_urls_to" + type="pattern_list" + programs="htdig" + version="all" + category="Indexing:Where" > + <default>${start_url}</default> + <example>.sdsu.edu kpbs [.*\.html]</example> + <description> + This specifies a set of patterns that all URLs have to + match against in order for them to be included in the + search. Any number of strings can be specified, + separated by spaces. If multiple patterns are given, at + least one of the patterns has to match the URL.<br/> + Matching, by default, is a case-insensitive string match on the URL + to be used, unless the <ref type="attr">case_sensitive</ref> + attribute is set. The match will be performed <em>after</em> + the relative references have been converted to a valid + URL. This means that the URL will <em>always</em> start + with <code>http://</code>.<br/> + Granted, this is not the perfect way of doing this, + but it is simple enough and it covers most cases. + </description> + </attribute> + + <attribute name="local_default_doc" + type="string_list" + programs="htdig" + version="3.0.8b2" + category="Indexing:Where" + block="Server" > + <default>index.html</default> + <example>default.html default.htm index.html index.htm</example> + <description> + Set this to the default documents in a directory used by the + server. This is used for local filesystem access to + translate URLs like http://foo.com/ into something like + /home/foo.com/index.html<br/> + The list should only contain names that the local server + recognizes as default documents for directory URLs, as defined + by the DirectoryIndex setting in Apache's srm.conf, for example. + As of version 3.1.5, this can be a string list rather than a single name, + and htdig will use the first name that works. Since this requires a + loop, setting the most common name first will improve performance. 
+ Special characters can be embedded in these names using %xx hex encoding. + </description> + </attribute> + + <attribute name="local_urls" + type="string_list" + programs="htdig" + version="3.0.8b2" + category="Indexing:Where" > + <default></default> + <example>//www.foo.com/=/usr/www/htdocs/</example> + <description> + Set this to tell ht://Dig to access certain URLs through + local filesystems. At first ht://Dig will try to access + pages with URLs matching the patterns through the + filesystems specified. If it cannot find the file, or + if it doesn't recognize the file name extension, it will + try the URL through HTTP instead. Note the example--the + equal sign and the final slashes in both the URL and the + directory path are critical. + <br/>The fallback to HTTP can be disabled by setting the + <ref type="attr">local_urls_only</ref> attribute to true. + To access user directory URLs through the local filesystem, + set <ref type="attr">local_user_urls</ref>. The only + file name extensions currently recognized for local filesystem + access are .html, .htm, .txt, .asc, .ps, .eps and .pdf. For + anything else, htdig must ask the HTTP server for the file, + so it can determine the MIME content-type of it. + As of version 3.1.5, you can provide multiple mappings of a given + URL to different directories, and htdig will use the first + mapping that works. + Special characters can be embedded in these names using %xx hex encoding. + For example, you can use %3D to embed an "=" sign in an URL pattern. + </description> + </attribute> + + <attribute name="local_urls_only" + type="boolean" + programs="htdig" + version="3.1.4" + category="Indexing:Where" > + <default>false</default> + <example>true</example> + <description> + Set this to tell ht://Dig to access files only through the + local filesystem, for URLs matching the patterns in the + <ref type="attr">local_urls</ref> or + <ref type="attr">local_user_urls</ref> attribute. 
If it cannot
+ find the file, it will give up rather than trying HTTP or another protocol.
+ </description>
+ </attribute>
+
+ <attribute name="local_user_urls"
+ type="string_list"
+ programs="htdig"
+ version="3.0.8b2"
+ category="Indexing:Where" >
+ <default></default>
+ <example>//www.my.org/=/home/,/www/</example>
+ <description>
+ Set this to access user directory URLs through the local
+ filesystem. If you leave the "path" portion out, it will
+ look up the user's home directory in /etc/passwd (or NIS
+ or whatever). As with <ref type="attr">local_urls</ref>,
+ if the files are not found, ht://Dig will try with HTTP or the
+ appropriate protocol. Again, note the
+ example's format. To map http://www.my.org/~joe/foo/bar.html
+ to /home/joe/www/foo/bar.html, try the example below.
+ <br/>The fallback to HTTP can be disabled by setting the
+ <ref type="attr">local_urls_only</ref> attribute to true.
+ As of version 3.1.5, you can provide multiple mappings of a given
+ URL to different directories, and htdig will use the first
+ mapping that works.
+ Special characters can be embedded in these names using %xx hex encoding.
+ For example, you can use %3D to embed an "=" sign in an URL pattern.
+ </description>
+ </attribute>
+
+ <attribute name="locale"
+ type="string"
+ programs="htdig"
+ version="3.0"
+ category="Indexing:What,Presentation:How" >
+ <default>C</default>
+ <example>en_US</example>
+ <description>
+ Set this to whatever locale you want your search
+ database to cover. It affects the way international
+ characters are dealt with. On most systems a list of
+ legal locales can be found in /usr/lib/locale. Also
+ check the <strong>setlocale(3C)</strong> man page.
+ Note that depending on the locale you choose, and whether
+ your system's locale implementation affects floating
+ point input, you may need to specify the decimal point
+ as a comma rather than a period. 
This will affect + settings of <ref type="attr">search_algorithm</ref> + and any of the scoring factors. + </description> + </attribute> + + <attribute name="logging" + type="boolean" + programs="htsearch" + version="3.1.0b2" + category="Extra Output" > + <default>false</default> + <example>true</example> + <description> + This sets whether htsearch should use the syslog() to log + search requests. If set, this will log requests with a + default level of LOG_INFO and a facility of LOG_LOCAL5. For + details on redirecting the log into a separate file or other + actions, see the <strong>syslog.conf(5)</strong> man + page. To set the level and facility used in logging, change + LOG_LEVEL and LOG_FACILITY in the include/htconfig.h file + before compiling. + <dl> + <dt> + Each line logged by htsearch contains the following: + </dt> + <dd> + REMOTE_ADDR [config] (match_method) [words] + [logicalWords] (matches/matches_per_page) - + page, HTTP_REFERER + </dd> + </dl> + where any of the above are null or empty, it + either puts in '-' or 'default' (for config). + </description> + </attribute> + + <attribute name="maintainer" + type="string" + programs="htdig" + version="all" + category="Indexing:Out" + block="Server" > + <default>[email protected]</default> + <example>[email protected]</example> + <description> + This should be the email address of the person in + charge of the digging operation. This string is added + to the user-agent: field when the digger sends a + request to a server. + </description> + </attribute> + + <attribute name="match_method" + type="string" + programs="htsearch" + version="3.0" + category="Searching:Method" > + <default>and</default> + <example>boolean</example> + <description> + This is the default method for matching that htsearch + uses. 
The valid choices are: + <ul> + <li> or </li> + <li> and </li> + <li> boolean </li> + </ul> + This attribute will only be used if the HTML form that + calls htsearch didn't have the <a href="hts_form.html#method">method</a> + value set. + </description> + </attribute> + + <attribute name="matches_per_page" + type="integer" + programs="htsearch" + version="3.0" + category="Searching:Method" > + <default>10</default> + <example>999</example> + <description> + If this is set to a relatively small number, the + matches will be shown in pages instead of all at once. + This attribute will only be used if the HTML form that + calls htsearch didn't have the + <a href="hts_form.html#matchesperpage">matchesperpage</a> value set. + </description> + </attribute> + + <attribute name="max_connection_requests" + type="integer" + programs="htdig" + version="3.2.0b1" + category="Indexing:Connection" > + <default>-1</default> + <example>100</example> + <description> + This attribute tells htdig to limit the number of requests it will + send to a server using a single, persistent HTTP connection. This + only applies when the + <ref type="attr">persistent_connections</ref> + attribute is set. You may set the limit as high as you want, + but it must be at least 1. A value of -1 specifies no limit. + Requests in the queue for a server will be combined until either + the limit is reached, or the queue is empty. + </description> + </attribute> + + <attribute name="max_description_length" + type="integer" + programs="htdig" + version="all" + category="Indexing:What" > + <default>60</default> + <example>40</example> + <description> + While gathering descriptions of URLs, + <ref type="program">htdig</ref> will only record those + descriptions which are shorter than this length. This + is used mostly to deal with broken HTML. (If a + hyperlink is not terminated with a </a> the + description will go on until the end of the document.) 
+ </description> + </attribute> + + <attribute name="max_descriptions" + type="integer" + programs="htdig" + version="all" + category="Indexing:What" > + <default>5</default> + <example>15</example> + <description> + While gathering descriptions of URLs, + <ref type="program">htdig</ref> will only record up to this + number of descriptions, in the order in which it encounters + them. This is used to prevent the database entry for a document + from growing out of control if the document has a huge number + of links to it. + </description> + </attribute> + + <attribute name="max_doc_size" + type="integer" + programs="htdig" + version="3.0" + category="Indexing:What" + block="URL" > + <default>100000</default> + <example>5000000</example> + <description> + This is the upper limit to the amount of data retrieved + for documents. This is mainly used to prevent + unreasonable memory consumption since each document + will be read into memory by <ref type="program">htdig</ref>. + </description> + </attribute> + + <attribute name="max_excerpts" + type="integer" + programs="htsearch" + version="3.1.6" + category="Presentation:How" + block="URL" > + <default>1</default> + <example>10</example> + <description> + This value determines the maximum number of excerpts + that can be displayed for one matching document in the + search results. + </description> + </attribute> + + <attribute name="max_head_length" + type="integer" + programs="htdig" + version="all" + category="Indexing:How" > + <default>512</default> + <example>50000</example> + <description> + For each document retrieved, the top of the document is + stored. This attribute determines the size of this + block. The text that will be stored is only the text; + no markup is stored.<br/> + We found that storing 50,000 bytes will store about + 95% of all the documents completely. This really + depends on how much storage is available and how much + you want to show. 
+ </description> + </attribute> + + <attribute name="max_hop_count" + type="integer" + programs="htdig" + version="all" + category="Indexing:Where" > + <default>999999</default> + <example>4</example> + <description> + Instead of limiting the indexing process by URL + pattern, it can also be limited by the number of hops + or clicks a document is removed from the starting URL. + <br/> + The starting page or pages will have hop count 0. + </description> + </attribute> + + <attribute name="max_keywords" + type="integer" + programs="htdig" + version="3.2.0b1" + category="Indexing:What" > + <default>-1</default> + <example>10</example> + <description> + This attribute can be used to limit the number of keywords + per document that htdig will accept from meta keywords tags. + A value of -1 or less means no limit. This can help combat meta + keyword spamming, by limiting the amount of keywords that will be + indexed, but it will not completely prevent irrelevant matches + in a search if the first few keywords in an offending document + are not relevant to its contents. + </description> + </attribute> + + <attribute name="max_meta_description_length" + type="integer" + programs="htdig" + version="3.1.0b1" + category="Indexing:How" > + <default>512</default> + <example>1000</example> + <description> + While gathering descriptions from meta description tags, + <ref type="program">htdig</ref> will only store up to + this much of the text for each document. + </description> + </attribute> + + <attribute name="max_prefix_matches" + type="integer" + programs="htsearch" + version="3.1.0b1" + category="Searching:Method" > + <default>1000</default> + <example>100</example> + <description> + The Prefix fuzzy algorithm could potentially match a + very large number of words. This value limits the + number of words each prefix can match. Note + that this does not limit the number of documents that + are matched in any way. 
+ </description>
+ </attribute>
+
+ <attribute name="max_retries"
+ type="number"
+ programs="htdig"
+ version="3.2.0b1"
+ category="Indexing:Connection" >
+ <default>3</default>
+ <example>6</example>
+ <description>
+ This option sets the maximum number of retries when retrieving a document
+ fails (mainly for connection-related reasons).
+ </description>
+ </attribute>
+
+ <attribute name="max_stars"
+ type="number"
+ programs="htsearch"
+ version="all"
+ category="Presentation:How" >
+ <default>4</default>
+ <example>6</example>
+ <description>
+ When stars are used to display the score of a match,
+ this value determines the maximum number of stars that
+ can be displayed.
+ </description>
+ </attribute>
+
+ <attribute name="maximum_page_buttons"
+ type="integer"
+ programs="htsearch"
+ version="3.2.0b3"
+ category="Presentation:How" >
+ <default>${maximum_pages}</default>
+ <example>20</example>
+ <description>
+ This value limits the number of page links that will be
+ included in the page list at the bottom of the search
+ results page. By default, it takes on the value of the
+ <ref type="attr">maximum_pages</ref>
+ attribute, but you can set it to something lower to allow
+ more pages than buttons. In this case, pages above this
+ number will have no corresponding button.
+ </description>
+ </attribute>
+
+ <attribute name="maximum_pages"
+ type="integer"
+ programs="htsearch"
+ version="all"
+ category="Presentation:How" >
+ <default>10</default>
+ <example>20</example>
+ <description>
+ This value limits the number of page links that will be
+ included in the page list at the bottom of the search
+ results page. As of version 3.1.4, this will limit the
+ total number of matching documents that are shown.
+ You can make the number of page buttons smaller than the
+ number of allowed pages by setting the
+ <ref type="attr">maximum_page_buttons</ref>
+ attribute. 
+ </description> + </attribute> + + <attribute name="maximum_word_length" + type="integer" + programs="htdig htsearch" + version="3.1.3" + category="Indexing:What" > + <default>32</default> + <example>15</example> + <description> + This sets the maximum length of words that will be + indexed. Words longer than this value will be silently + truncated when put into the index, or searched in the + index. + </description> + </attribute> + + <attribute name="md5_db" + type="string" + programs="htdig" + version="3.2.0b3" + category="File Layout" > + <default>${database_base}.md5hash.db</default> + <example>${database_base}.md5.db</example> + <description> + This file holds a database of md5 and date hashes of pages to + catch and eliminate duplicates of pages. See also the + <ref type="attr">check_unique_md5</ref> and + <ref type="attr">check_unique_date</ref> attributes. + </description> + </attribute> + + <attribute name="meta_description_factor" + type="number" + programs="htsearch" + version="3.1.0b1" + category="Searching:Ranking" > + <default>50</default> + <example>20</example> + <description> + This is a factor which will be used to multiply the + weight of words in any META description tags in a document. + The number may be a floating point number. See also the + <ref type="attr">heading_factor</ref> attribute and the + <ref type="attr">description_factor</ref> attribute. + </description> + </attribute> + + <attribute name="metaphone_db" + type="string" + programs="htfuzzy htsearch" + version="all" + category="File Layout" > + <default>${database_base}.metaphone.db</default> + <example>${database_base}.mp.db</example> + <description> + The database file used for the fuzzy "metaphone" search + algorithm. This database is created by + <ref type="program">htfuzzy</ref> and used by + <ref type="program">htsearch</ref>. 
+ </description> + </attribute> + + <attribute name="method_names" + type="quoted_string_list" + programs="htsearch" + version="all" + category="Searching:UI" > + <default>and All or Any boolean Boolean</default> + <example>or Or and And</example> + <description> + These values are used to create the <strong> + method</strong> menu. It consists of pairs. The first + element of each pair is one of the known methods, the + second element is the text that will be shown in the + menu for that method. This text needs to be quoted if + it contains spaces. + See the <a href="hts_selectors.html">select list documentation</a> + for more information on how this attribute is used. + </description> + </attribute> + + <attribute name="mime_types" + type="string" + programs="htdig" + version="3.2.0b1" + category="Indexing:Where" > + <default>${config_dir}/mime.types</default> + <example>/etc/mime.types</example> + <description> + This file is used by htdig for local file access and resolving + file:// URLs to ensure the files are parsable. If you are running + a webserver with its own MIME file, you should set this attribute + to point to that file. + </description> + </attribute> + + <attribute name="minimum_prefix_length" + type="integer" + programs="htsearch" + version="3.1.0b1" + category="Searching:Method" > + <default>1</default> + <example>2</example> + <description> + This sets the minimum length of prefix matches used by the + "prefix" fuzzy matching algorithm. Words shorter than this + will not be used in prefix matching. + </description> + </attribute> + + <attribute name="minimum_speling_length" + type="integer" + programs="htsearch" + version="3.2.0b1" + category="Searching:Method" > + <default>5</default> + <example>3</example> + <description> + This sets the minimum length of words used by the + "speling" fuzzy matching algorithm. Words shorter than this + will not be used in this fuzzy matching. 
+ </description> + </attribute> + + <attribute name="minimum_word_length" + type="integer" + programs="htdig htsearch" + version="all" + category="Indexing:What" > + <default>3</default> + <example>2</example> + <description> + This sets the minimum length of words that will be + indexed. Words shorter than this value will be silently + ignored but still put into the excerpt.<br/> + Note that by making this value less than 3, a lot more + words that are very frequent will be indexed. It might + be advisable to add some of these to the + <ref type="attr">bad_word_list</ref>. + </description> + </attribute> + + <attribute name="multimatch_factor" + type="number" + programs="htsearch" + version="3.1.6" + category="Searching:Ranking" > + <default>1</default> + <example>1000</example> + <description> + This factor gives higher rankings to documents that have more than + one matching search word when the <strong>or</strong> + <ref type="attr">match_method</ref> is used. + In version 3.1.6, the matching words' combined scores were multiplied + by this factor for each additional matching word. Currently, this + multiplier is applied at most once. + </description> + </attribute> + + <attribute name="next_page_text" + type="string" + programs="htsearch" + version="3.1.0" + category="Presentation:Text" > + <default>[next]</default> + <example><img src="/htdig/buttonr.gif"></example> + <description> + The text displayed in the hyperlink to go to the next + page of matches. + </description> + </attribute> + + <attribute name="no_excerpt_show_top" + type="boolean" + programs="htsearch" + version="3.1.0b3" + category="Presentation:How" > + <default>false</default> + <example>yes</example> + <description> + If no excerpt is available, this option will act the + same as <ref type="attr">excerpt_show_top</ref>, that is, + it will show the top of the document. 
+ </description>
+ </attribute>
+
+ <attribute name="no_excerpt_text"
+ type="string"
+ programs="htsearch"
+ version="3.0"
+ category="Presentation:Text" >
+ <default><em>(None of the search words were found in the top of this document.)</em></default>
+ <example></example>
+ <description>
+ This text will be displayed in place of the excerpt if
+ there is no excerpt available. If this attribute is set
+ to nothing (blank), the excerpt label will not be
+ displayed in this case.
+ </description>
+ </attribute>
+
+ <attribute name="no_next_page_text"
+ type="string"
+ programs="htsearch"
+ version="3.0"
+ category="Presentation:Text" >
+ <default>[next]</default>
+ <example></example>
+ <description>
+ The text displayed where there would normally be a
+ hyperlink to go to the next page of matches.
+ </description>
+ </attribute>
+
+ <attribute name="no_page_list_header"
+ type="string"
+ programs="htsearch"
+ version="3.0"
+ category="Presentation:Text" >
+ <default></default>
+ <example><hr noshade size=2>All results on this page.<br></example>
+ <description>
+ This text will be used as the value of the PAGEHEADER
+ variable, for use in templates or the
+ <ref type="attr">search_results_footer</ref>
+ file, when all search results fit on a single page.
+ </description>
+ </attribute>
+
+ <attribute name="no_page_number_text"
+ type="quoted_string_list"
+ programs="htsearch"
+ version="3.0"
+ category="Presentation:Text" >
+ <default></default>
+ <example><strong>1</strong> <strong>2</strong> \
+ <strong>3</strong> <strong>4</strong> \
+ <strong>5</strong> <strong>6</strong> \
+ <strong>7</strong> <strong>8</strong> \
+ <strong>9</strong> <strong>10</strong>
+</example>
+ <description>
+ The text strings in this list will be used when putting
+ together the PAGELIST variable, for use in templates or
+ the <ref type="attr">search_results_footer</ref>
+ file, when search results fit on more than one page. 
The PAGELIST + is the list of links at the bottom of the search results page. + There should be as many strings in the list as there are + pages allowed by the <ref type="attr">maximum_page_buttons</ref> + attribute. If there are not enough, or the list is empty, + the page numbers alone will be used as the text for the links. + An entry from this list is used for the current page, as the + current page is shown in the page list without a hypertext link, + while entries from the <ref type="attr">page_number_text</ref> list are used for the links to other pages. + The text strings can contain HTML tags to highlight page numbers + or embed images. The strings need to be quoted if they contain + spaces. + </description> + </attribute> + + <attribute name="no_prev_page_text" + type="string" + programs="htsearch" + version="3.0" + category="Presentation:Text" > + <default>[prev]</default> + <example></example> + <description> + The text displayed where there would normally be a + hyperlink to go to the previous page of matches. + </description> + </attribute> + + <attribute name="no_title_text" + type="string" + programs="htsearch" + version="3.1.0" + category="Presentation:Text" > + <default>filename</default> + <example>"No Title Found"</example> + <description> + This specifies the text to use in search results when no + title is found in the document itself. If it is set to + filename, htsearch will use the name of the file itself, + enclosed in brackets (e.g. [index.html]). + </description> + </attribute> + + <attribute name="noindex_end" + type="string" + programs="htdig" + version="3.1.0" + category="Indexing:What" > + <default><!--/htdig_noindex--></default> + <example></SCRIPT></example> + <description> + This string marks the end of a section of an HTML file that should be + completely ignored when indexing. It works together with + <ref type="attr">noindex_start</ref>. 
+ As in the defaults, this can be SGML comment + declarations that can be inserted anywhere in the documents to exclude + different sections from being indexed. However, existing tags can also be + used; this is especially useful to exclude some sections from being indexed + where the files to be indexed can not be edited. The example shows how + SCRIPT sections in 'uneditable' documents can be skipped. + Note that the match for this string is case insensitive. + </description> + </attribute> + + <attribute name="noindex_start" + type="string" + programs="htdig" + version="3.1.0" + category="Indexing:What" > + <default><!--htdig_noindex--></default> + <example><SCRIPT</example> + <description> + This string marks the start of a section of an HTML file that should be + completely ignored when indexing. It works together with + <ref type="attr">noindex_end</ref>. + As in the defaults, this can be SGML comment + declarations that can be inserted anywhere in the documents to exclude + different sections from being indexed. However, existing tags can also be + used; this is especially useful to exclude some sections from being indexed + where the files to be indexed can not be edited. The example shows how + SCRIPT sections in 'uneditable' documents can be skipped; note how + noindex_start does not contain an ending >: this allows for all SCRIPT + tags to be matched regardless of attributes defined (different types or + languages). Note that the match for this string is case insensitive. + </description> + </attribute> + + <attribute name="nothing_found_file" + type="string" + programs="htsearch" + version="all" + category="Presentation:Files" > + <default>${common_dir}/nomatch.html</default> + <example>/www/searching/nothing.html</example> + <description> + This specifies the file which contains the <code> + HTML</code> text to display when no matches were found. 
+ The file should contain a complete <code>HTML</code>
+ document.<br/>
+ Note that this attribute could also be defined in
+ terms of <ref type="attr">database_base</ref> to
+ make it specific to the current search database.
+ </description>
+ </attribute>
+
+ <attribute name="nph"
+ type="boolean"
+ programs="htsearch"
+ version="3.2.0b2"
+ category="Presentation:How" >
+ <default>false</default>
+ <example>true</example>
+ <description>
+ This attribute determines whether htsearch sends out full HTTP
+ headers as required for an NPH (non-parsed header) CGI. Some
+ servers assume CGIs will act in this fashion, for example MS
+ IIS. If your server does not send out full HTTP headers, you
+ should set this to true.
+ </description>
+ </attribute>
+
+ <attribute name="page_list_header"
+ type="string"
+ programs="htsearch"
+ version="3.0"
+ category="Presentation:Text" >
+ <default><hr noshade size=2>Pages:<br></default>
+ <example></example>
+ <description>
+ This text will be used as the value of the PAGEHEADER
+ variable, for use in templates or the
+ <ref type="attr">search_results_footer</ref>
+ file, when all search results fit on more than one page.
+ </description>
+ </attribute>
+
+ <attribute name="page_number_separator"
+ type="quoted_string_list"
+ programs="htsearch"
+ version="3.1.4"
+ category="Presentation:Text" >
+ <default>" "</default>
+ <example>"</td> <td>"</example>
+ <description>
+ The text strings in this list will be used when putting
+ together the PAGELIST variable, for use in templates or
+ the <ref type="attr">search_results_footer</ref>
+ file, when search results fit on more than one page. The PAGELIST
+ is the list of links at the bottom of the search results page.
+ The strings in the list will be used in rotation, and will
+ separate individual entries taken from
+ <ref type="attr">page_number_text</ref> and
+ <ref type="attr">no_page_number_text</ref>.
+ There can be as many or as few strings in the list as you like. 
+ If there are not enough for the number of pages listed, it goes
+ back to the start of the list. If the list is empty, a space is
+ used. The text strings can contain HTML tags. The strings need
+ to be quoted if they contain spaces, or to specify an empty string.
+ </description>
+ </attribute>
+
+ <attribute name="page_number_text"
+ type="quoted_string_list"
+ programs="htsearch"
+ version="3.0"
+ category="Presentation:Text" >
+ <default></default>
+ <example><em>1</em> <em>2</em> \
+ <em>3</em> <em>4</em> \
+ <em>5</em> <em>6</em> \
+ <em>7</em> <em>8</em> \
+ <em>9</em> <em>10</em>
+</example>
+ <description>
+ The text strings in this list will be used when putting
+ together the PAGELIST variable, for use in templates or
+ the <ref type="attr">search_results_footer</ref>
+ file, when search results fit on more than one page. The PAGELIST
+ is the list of links at the bottom of the search results page.
+ There should be as many strings in the list as there are
+ pages allowed by the <ref type="attr">maximum_page_buttons</ref>
+ attribute. If there are not enough, or the list is empty,
+ the page numbers alone will be used as the text for the links.
+ Entries from this list are used for the links to other pages,
+ while an entry from the <ref type="attr">no_page_number_text</ref> list is used for the current page, as the
+ current page is shown in the page list without a hypertext link.
+ The text strings can contain HTML tags to highlight page numbers
+ or embed images. The strings need to be quoted if they contain
+ spaces.
+ </description>
+ </attribute>
+
+ <attribute name="persistent_connections"
+ type="boolean"
+ programs="htdig"
+ version="3.2.0b1"
+ category="Indexing:Connection"
+ block="Server" >
+ <default>true</default>
+ <example>false</example>
+ <description>
+ If set to true, when servers make it possible, htdig can take advantage
+ of persistent connections, as defined by HTTP/1.1 (<em>RFC2616</em>). 
This makes it possible
+ to reduce the number of connection open/close operations when retrieving
+ a document with HTTP.
+ </description>
+ </attribute>
+
+ <attribute name="plural_suffix"
+ type="string"
+ programs="htsearch"
+ version="3.2.0b2"
+ category="Presentation: Text" >
+ <default>s</default>
+ <example>en</example>
+ <description>
+ Specifies the value of the PLURAL_MATCHES template
+ variable used in the header, footer and template files.
+ This can be used for localization for non-English languages
+ where 's' is not the appropriate suffix.
+ </description>
+ </attribute>
+
+ <attribute name="prefix_match_character"
+ type="string"
+ programs="htsearch"
+ version="3.1.0b1"
+ category="Searching:Method" >
+ <default>*</default>
+ <example>ing</example>
+ <description>
+ A null prefix character means that prefix matching should be
+ applied to every search word. Otherwise a match is
+ returned only if the word does not end in the characters specified.
+ </description>
+ </attribute>
+
+ <attribute name="prev_page_text"
+ type="string"
+ programs="htsearch"
+ version="3.0"
+ category="Presentation:Text" >
+ <default>[prev]</default>
+ <example><img src="/htdig/buttonl.gif"></example>
+ <description>
+ The text displayed in the hyperlink to go to the
+ previous page of matches.
+ </description>
+ </attribute>
+
+ <attribute name="regex_max_words"
+ type="integer"
+ programs="htsearch"
+ version="3.2.0b1"
+ category="Searching:Method" >
+ <default>25</default>
+ <example>10</example>
+ <description>
+ The "regex" fuzzy algorithm could potentially match a
+ very large number of words. This value limits the
+ number of words each regular expression can match. Note
+ that this does not limit the number of documents that
+ are matched in any way. 
+ </description> + </attribute> + + <attribute name="remove_bad_urls" + type="boolean" + programs="htpurge" + version="all" + category="Indexing:How" + block="Server" > + <default>true</default> + <example>true</example> + <description> + If TRUE, htpurge will remove any URLs which were marked + as unreachable by htdig from the database. If FALSE, it + will not do this. When htdig is run in initial mode, + documents which were referred to but could not be + accessed should probably be removed, and hence this + option should then be set to TRUE, however, if htdig is + run to update the database, this may cause documents on + a server which is temporarily unavailable to be + removed. This is probably NOT what was intended, so + hence this option should be set to FALSE in that case. + </description> + </attribute> + + <attribute name="remove_default_doc" + type="string_list" + programs="htdig" + version="3.1.0" + category="Indexing:How" > + <default>index.html</default> + <example>default.html default.htm index.html index.htm</example> + <description> + Set this to the default documents in a directory used by the + servers you are indexing. These document names will be stripped + off of URLs when they are normalized, if one of these names appears + after the final slash, to translate URLs like + http://foo.com/index.html into http://foo.com/<br/> + Note that you can disable stripping of these names during + normalization by setting the list to an empty string. + The list should only contain names that all servers you index + recognize as default documents for directory URLs, as defined + by the DirectoryIndex setting in Apache's srm.conf, for example. + This only applies to http:// and https:// URLS. 
+ </description> + </attribute> + + <attribute name="remove_unretrieved_urls" + type="boolean" + programs="htpurge" + version="3.2.0b1" + category="Indexing:How" + block="Server" > + <default>false</default> + <example>true</example> + <description> + If TRUE, htpurge will remove any URLs which were discovered + and included as stubs in the database but not yet retrieved. If FALSE, it + will not do this. When htdig is run in initial mode with no restrictions + on hopcount or maximum documents, these should probably be removed and set + to true. However, if you are hoping to index a small set of documents and + eventually get to the rest, you should probably leave this as false. + </description> + </attribute> + + <attribute name="restrict" + type="pattern_list" + programs="htsearch" + version="3.2.0b4" + category="Searching:Method" > + <default></default> + <example>//www.acme.com/widgets/</example> + <description> + This specifies a set of patterns that all URLs have to + match against in order for them to be included in the search + results. Any number of strings can be specified, separated by + spaces. If multiple patterns are given, at least one of the + patterns has to match the URL. The list can be specified + from within the configuration file, and can be overridden + with the "restrict" input parameter in the search form. Note + that the restrict list does not take precedence over the + <ref type="attr">exclude</ref> list - if a URL matches patterns + in both lists it is still excluded from the search results. + </description> + </attribute> + + <attribute name="robotstxt_name" + type="string" + programs="htdig" + version="3.0.7" + category="Indexing:Out" + block="Server" > + <default>htdig</default> + <example>myhtdig</example> + <description> + Sets the name that htdig will look for when parsing + robots.txt files. This can be used to make htdig appear + as a different spider than ht://Dig. Useful to + distinguish between a private and a global index. 
+ </description> + </attribute> + + <attribute name="script_name" + type="string" + programs="htsearch" + version="3.1.4" + category="Presentation:Text" > + <default></default> + <example>/search/results.shtml</example> + <description> + Overrides the value of the SCRIPT_NAME + environment attribute. This is useful if + htsearch is not being called directly as a CGI + program, but indirectly from within a dynamic + .shtml page using SSI directives. Previously, + you needed a wrapper script to do this, but + this configuration attribute makes wrapper + scripts obsolete for SSI and possibly for + other server scripting languages, as + well. (You still need a wrapper script when + using PHP, though.)<br/> + Check out the <code>contrib/scriptname</code> + directory for a small example. Note that this + attribute also affects the value of the <a + href="hts_templates.html#CGI">CGI</a> variable + used in htsearch templates. + </description> + </attribute> + + <attribute name="search_algorithm" + type="string_list" + programs="htsearch" + version="all" + category="Searching:Method" > + <default>exact:1</default> + <example>0.3</example> + <description> + Specifies the search algorithms and their weight to use + when searching. Each entry in the list consists of the + algorithm name, followed by a colon (:) followed by a + weight multiplier. The multiplier is a floating point + number between 0 and 1. Note that depending on your + <ref type="attr">locale</ref> setting, and whether your + system's locale implementation affects floating point + input, you may need to specify the decimal point as a + comma rather than a period.<br/> + <strong>Note:</strong>If the exact + method is not listed, the search may not work since the + original terms will not be used.<br/> + Current algorithms supported are: + <dl> + <dt> + exact + </dt> + <dd> + The default exact word matching algorithm. This + will find only exactly matched words. 
+ </dd> + <dt> + soundex + </dt> + <dd> + Uses a slightly modified soundex algorithm to match + words. This requires that the soundex database be + present. It is generated with the + <ref type="program">htfuzzy</ref> program. + </dd> + <dt> + metaphone + </dt> + <dd> + Uses the metaphone algorithm for matching words. + This algorithm is more specific to the english + language than soundex. It requires the metaphone + database, which is generated with the <ref type="program">htfuzzy</ref> program. + </dd> + <dt> + accents + </dt> + <dd> + Uses the accents algorithm for matching words. + This algorithm will treat all accented letters + as equivalent to their unaccented counterparts. + It requires the accents database, which is + generated with the <ref type="program">htfuzzy</ref> program. + </dd> + <dt> + endings + </dt> + <dd> + This algorithm uses language specific word endings + to find matches. Each word is first reduced to its + word root and then all known legal endings are used + for the matching. This algorithm uses two databases + which are generated with <ref type="program">htfuzzy</ref>. + </dd> + <dt> + synonyms + </dt> + <dd> + Performs a dictionary lookup on all the words. This + algorithm uses a database generated with the <ref type="program">htfuzzy</ref> program. + </dd> + <dt> + substring + </dt> + <dd> + Matches all words containing the queries as + substrings. Since this requires checking every word in + the database, this can really slow down searches + considerably. + </dd> + <dt> + prefix + </dt> + <dd> + Matches all words beginning with the query + strings. Uses the option <ref type="attr">prefix_match_character</ref> + to decide whether a query requires prefix + matching. For example "abc*" would perform prefix + matching on "abc" since * is the default + prefix_match_character. + </dd> + <dt> + regex + </dt> + <dd> + Matches all words that match the patterns given as regular + expressions. 
Since this requires checking every word in + the database, this can really slow down searches + considerably. + </dd> + <dt> + speling + </dt> + <dd> + A simple fuzzy algorithm that tries to find one-off spelling + mistakes, such as transposition of two letters or an extra character. + Since this usually generates just a few possibilities, it is + relatively quick. + </dd> + </dl> + </description> + </attribute> + + <attribute name="search_results_footer" + type="string" + programs="htsearch" + version="all" + category="Presentation:Files" > + <default>${common_dir}/footer.html</default> + <example>/usr/local/etc/ht/end-stuff.html</example> + <description> + This specifies a filename to be output at the end of + search results. While outputting the footer, some + variables will be expanded. Variables use the same + syntax as the Bourne shell. If there is a variable VAR, + the following will all be recognized: + <ul> + <li> + $VAR + </li> + <li> + $(VAR) + </li> + <li> + ${VAR} + </li> + </ul> + The following variables are available. See + <a href="hts_template.html">hts_template.html</a> for a complete + list. + <dl> + <dt> + MATCHES + </dt> + <dd> + The number of documents that were matched. + </dd> + <dt> + PLURAL_MATCHES + </dt> + <dd> + If MATCHES is not 1, this will be the string "s", + else it is an empty string. This can be used to say + something like "$(MATCHES) + document$(PLURAL_MATCHES) were found" + </dd> + <dt> + MAX_STARS + </dt> + <dd> + The value of the <ref type="attr">max_stars</ref> + attribute. + </dd> + <dt> + LOGICAL_WORDS + </dt> + <dd> + A string of the search words with either "and" or + "or" between the words, depending on the type of + search. + </dd> + <dt> + WORDS + </dt> + <dd> + A string of the search words with spaces in + between. 
+ </dd> + <dt> + PAGEHEADER + </dt> + <dd> + This expands to either the value of the + <ref type="attr">page_list_header</ref> or + <ref type="attr">no_page_list_header</ref> + attribute depending on how many pages there are. + </dd> + </dl> + Note that this file will <strong>NOT</strong> be output + if no matches were found. In this case the + <ref type="attr">nothing_found_file</ref> + attribute is used instead. + Also, this file will not be output if it is + overridden by defining the + <ref type="attr">search_results_wrapper</ref> + attribute. + </description> + </attribute> + + <attribute name="search_results_header" + type="string" + programs="htsearch" + version="all" + category="Presentation:Files" > + <default>${common_dir}/header.html</default> + <example>/usr/local/etc/ht/start-stuff.html</example> + <description> + This specifies a filename to be output at the start of + search results. While outputting the header, some + variables will be expanded. Variables use the same + syntax as the Bourne shell. If there is a variable VAR, + the following will all be recognized: + <ul> + <li> + $VAR + </li> + <li> + $(VAR) + </li> + <li> + ${VAR} + </li> + </ul> + The following variables are available. See + <a href="hts_template.html">hts_template.html</a> for a complete + list. + <!-- Do these need to be listed for both _footer and _header? --> + <dl> + <dt> + MATCHES + </dt> + <dd> + The number of documents that were matched. + </dd> + <dt> + PLURAL_MATCHES + </dt> + <dd> + If MATCHES is not 1, this will be the string "s", + else it is an empty string. This can be used to say + something like "$(MATCHES) + document$(PLURAL_MATCHES) were found" + </dd> + <dt> + MAX_STARS + </dt> + <dd> + The value of the <ref type="attr">max_stars</ref> + attribute. + </dd> + <dt> + LOGICAL_WORDS + </dt> + <dd> + A string of the search words with either "and" or + "or" between the words, depending on the type of + search. 
+ </dd> + <dt> + WORDS + </dt> + <dd> + A string of the search words with spaces in + between. + </dd> + </dl> + Note that this file will <strong>NOT</strong> be output + if no matches were found. In this case the + <ref type="attr">nothing_found_file</ref> + attribute is used instead. + Also, this file will not be output if it is + overridden by defining the + <ref type="attr">search_results_wrapper</ref> + attribute. + </description> + </attribute> + + <attribute name="search_results_order" + type="string_list" + programs="htsearch" + version="3.2.0b2" + category="Searching:Ranking" > + <default></default> + <example>/docs/|faq.html * /maillist/ /testresults/</example> + <description> + This specifies a list of patterns for URLs in + search results. Results will be displayed in the + specified order, with the search algorithm result + as the second order. Remaining areas, that do not + match any of the specified patterns, can be placed + by using * as the pattern. If no * is specified, + one will be implicitly placed at the end of the + list.<br/> + See also <ref type="attr">url_seed_score</ref>. + </description> + </attribute> + + <attribute name="search_results_wrapper" + type="string" + programs="htsearch" + version="3.1.0" + category="Presentation:Files" > + <default></default> + <example>${common_dir}/wrapper.html</example> + <description> + This specifies a filename to be output at the start and + end of search results. This file replaces the + <ref type="attr">search_results_header</ref> and + <ref type="attr">search_results_footer</ref> + files, with the contents of both in one file, and uses the + pseudo-variable <strong>$(HTSEARCH_RESULTS)</strong> as a + separator for the header and footer sections. + If the filename is not specified, the file is unreadable, + or the pseudo-variable above is not found, htsearch reverts + to the separate header and footer files instead. 
+                While outputting the wrapper,
+                some variables will be expanded, just as for the
+                <ref type="attr">search_results_header</ref> and
+                <ref type="attr">search_results_footer</ref>
+                files.<br/>
+                Note that this file will <strong>NOT</strong> be output
+                if no matches were found. In this case the
+                <ref type="attr">nothing_found_file</ref>
+                attribute is used instead.
+            </description>
+        </attribute>
+
+        <attribute name="search_rewrite_rules"
+                   type="string_list"
+                   programs="htsearch"
+                   version="3.1.6"
+                   category="URLs" >
+            <default></default>
+            <example> http://(.*)\\.mydomain\\.org/([^/]*) http://\\2.\\1.com \
+                http://www\\.myschool\\.edu/myorgs/([^/]*) http://\\1.org
+            </example>
+            <description>
+                This is a list of pairs, <em>regex</em> <em>replacement</em>, used
+                to rewrite URLs in the search results. The left hand string is a
+                regular expression; the right hand string is a literal string with
+                embedded placeholders for fragments that matched inside brackets in the
+                regular expression. \0 is the whole matched string, \1 to \9 are
+                bracketed substrings. The backslash must be doubled-up in the
+                attribute setting to get past the variable expansion parsing. Rewrite
+                rules are applied sequentially to each URL before it is displayed
+                or checked against the <ref type="attr">restrict</ref> or
+                <ref type="attr">exclude</ref> lists. Rewriting does not stop once a
+                match has been made, so multiple rules may affect a given URL. See
+                also <ref type="attr">url_part_aliases</ref> which allows URLs
+                to be of one form during indexing and translated for results,
+                and <ref type="attr">url_rewrite_rules</ref> which allows URLs
+                to be rewritten while indexing. 
+ </description> + </attribute> + + <attribute name="server_aliases" + type="string_list" + programs="htdig" + version="3.1.0b2" + category="Indexing:Where" > + <default></default> + <example>foo.mydomain.com:80=www.mydomain.com:80 \ + bar.mydomain.com:80=www.mydomain.com:80 +</example> + <description> + This attribute tells the indexer that servers have several + DNS aliases, which all point to the same machine and are NOT + virtual hosts. This allows you to ensure pages are indexed + only once on a given machine, despite the alias used in a URL. + As shown in the example, the mapping goes from left to right, + so the server name on the right hand side is the one that is + used. As of version 3.1.3, the port number is optional, and is + assumed to be 80 if omitted. There is no easy way to map all + ports from one alias to another without listing them all. + </description> + </attribute> + + <attribute name="server_max_docs" + type="integer" + programs="htdig" + version="3.1.0b3" + category="Indexing:Where" + block="Server" > + <default>-1</default> + <example>50</example> + <description> + This attribute tells htdig to limit the dig to retrieve a maximum + number of documents from each server. This can cause + unusual behavior on update digs since the old URLs are + stored alphabetically. Therefore, update digs will add + additional URLs in pseudo-alphabetical order, up to the + limit of the attribute. However, it is most useful to + partially index a server as the URLs of additional + documents are entered into the database, marked as never + retrieved.<br/> + A value of -1 specifies no limit. + </description> + </attribute> + + <attribute name="server_wait_time" + type="integer" + programs="htdig" + version="3.1.0b3" + category="Indexing:Connection" + block="Server" > + <default>0</default> + <example>20</example> + <description> + This attribute tells htdig to ensure a server has had a + delay (in seconds) from the beginning of the last + connection. 
This can be used to prevent "server abuse" + by digging without delay. It's recommended to set this + to 10-30 (seconds) when indexing servers that you don't + monitor yourself. Additionally, this attribute can slow + down local indexing if set, which may or may not be what + you intended. + </description> + </attribute> + + <attribute name="sort" + type="string" + programs="htsearch" + version="3.1.0" + category="Presentation:How" > + <default>score</default> + <example>revtime</example> + <description> + This is the default sorting method that htsearch + uses to determine the order in which matches are displayed. + The valid choices are: + <table border="0"> + <tr> + <td> + <ul> + <li> score </li> + <li> time </li> + <li> title </li> + </ul> + </td> + <td> + <ul> + <li> revscore </li> + <li> revtime </li> + <li> revtitle </li> + </ul> + </td> + </tr> + </table> + This attribute will only be used if the HTML form that + calls htsearch didn't have the <strong>sort</strong> + value set. The words date and revdate can be used instead + of time and revtime, as both will sort by the time that + the document was last modified, if this information is + given by the server. The default is to sort by the score, + which ranks documents by best match. The sort methods that + begin with "rev" simply reverse the order of the + sort. Note that setting this to something other than + "score" will incur a slowdown in searches. + </description> + </attribute> + + <attribute name="sort_names" + type="quoted_string_list" + programs="htsearch" + version="3.1.0" + category="Searching:UI" > + <default>score Score time Time title Title revscore 'Reverse Score' revtime 'Reverse Time' revtitle 'Reverse Title'</default> + <example>score 'Best Match' time Newest title A-Z \ + revscore 'Worst Match' revtime Oldest revtitle Z-A +</example> + <description> + These values are used to create the <strong> + sort</strong> menu. It consists of pairs. 
The first + element of each pair is one of the known sort methods, the + second element is the text that will be shown in the + menu for that sort method. This text needs to be quoted if + it contains spaces. + See the <a href="hts_selectors.html">select list documentation</a> + for more information on how this attribute is used. + </description> + </attribute> + + <attribute name="soundex_db" + type="string" + programs="htfuzzy htsearch" + version="all" + category="File Layout" > + <default>${database_base}.soundex.db</default> + <example>${database_base}.snd.db</example> + <description> + The database file used for the fuzzy "soundex" search + algorithm. This database is created by + <ref type="program">htfuzzy</ref> and used by + <ref type="program">htsearch</ref>. + </description> + </attribute> + + <attribute name="star_blank" + type="string" + programs="htsearch" + version="all" + category="Presentation:Text" > + <default>${image_url_prefix}/star_blank.gif</default> + <example>//www.somewhere.org/icons/noelephant.gif</example> + <description> + This specifies the URL to use to display a blank of the + same size as the star defined in the + <ref type="attr">star_image</ref> attribute or in the + <ref type="attr">star_patterns</ref> attribute. + </description> + </attribute> + + <attribute name="star_image" + type="string" + programs="htsearch" + version="all" + category="Presentation:Text" > + <default>${image_url_prefix}/star.gif</default> + <example>//www.somewhere.org/icons/elephant.gif</example> + <description> + This specifies the URL to use to display a star. This + allows you to use some other icon instead of a star. 
+                (We like the star...)<br/>
+                The display of stars can be turned on or off with the
+                <ref type="attr">use_star_image</ref>
+                attribute and the maximum number of stars that can be
+                displayed is determined by the
+                <ref type="attr">max_stars</ref> attribute.<br/>
+                Even though the image can be changed, the ALT value
+                for the image will always be a '*'.
+            </description>
+        </attribute>
+
+        <attribute name="star_patterns"
+                   type="string_list"
+                   programs="htsearch"
+                   version="3.0"
+                   category="Presentation:How" >
+            <default></default>
+            <example>http://www.sdsu.edu /sdsu.gif \
+    http://www.ucsd.edu /ucsd.gif
+</example>
+            <description>
+                This attribute allows the star image to be changed
+                depending on the URL or the match it is used for. This
+                is mainly to make a visual distinction between matches
+                on different web sites. The star image could be
+                replaced with the logo of the company the match refers
+                to.<br/>
+                It is advisable to keep all the images the same size
+                in order to line things up properly in a short result
+                listing.<br/>
+                The format is simple. It is a list of pairs. The first
+                element of each pair is a pattern, the second element
+                is a URL to the image for that pattern.
+            </description>
+        </attribute>
+
+        <attribute name="startday"
+                   type="integer"
+                   programs="htsearch"
+                   version="3.1.6"
+                   category="Searching:Method" >
+            <default></default>
+            <example>1</example>
+            <description>
+                Day component of first date allowed as last-modified date
+                of returned documents.
+                This is most usefully specified as a
+                <a href="hts_form.html#startyear">CGI argument</a>.
+                See also <ref type="attr">startyear</ref>.
+            </description>
+        </attribute>
+
+        <attribute name="start_ellipses"
+                   type="string"
+                   programs="htsearch"
+                   version="all"
+                   category="Presentation:Text" >
+            <default><strong><code>... 
+                </code></strong></default>
+            <example>...</example>
+            <description>
+                When excerpts are displayed in the search output, this
+                string will be prepended to the excerpt if there is
+                text before the text displayed. This is just a visual
+                reminder to the user that the excerpt is only part of
+                the complete document.
+            </description>
+        </attribute>
+
+        <attribute name="start_highlight"
+                   type="string"
+                   programs="htsearch"
+                   version="3.1.4"
+                   category="Presentation:Text" >
+            <default><strong></default>
+            <example><font color="#FF0000"></example>
+            <description>
+                When excerpts are displayed in the search output, matched
+                words will be highlighted using this string and
+                <ref type="attr">end_highlight</ref>.
+                You should ensure that highlighting tags are balanced,
+                that is, any formatting tags that this string
+                opens should be closed by end_highlight.
+            </description>
+        </attribute>
+
+        <attribute name="startmonth"
+                   type="integer"
+                   programs="htsearch"
+                   version="3.1.6"
+                   category="Searching:Method" >
+            <default></default>
+            <example>1</example>
+            <description>
+                Month component of first date allowed as last-modified date
+                of returned documents.
+                This is most usefully specified as a
+                <a href="hts_form.html#startyear">CGI argument</a>.
+                See also <ref type="attr">startyear</ref>.
+            </description>
+        </attribute>
+
+        <attribute name="start_url"
+                   type="string_list"
+                   programs="htdig"
+                   version="all"
+                   category="Indexing:Where" >
+            <default>http://www.htdig.org/</default>
+            <example>//www.somewhere.org/alldata/index.html</example>
+            <description>
+                This is the list of URLs that will be used to start a
+                dig when there was no existing database. Note that
+                multiple URLs can be given here. 
+                <br/>Note also that the value of <em>start_url</em>
+                will be the default value for
+                <ref type="attr">limit_urls_to</ref>, so if
+                you set start_url to the URLs for specific files,
+                rather than a site or subdirectory URL, you may need
+                to set limit_urls_to to something less restrictive
+                so htdig doesn't reject links in the documents.
+            </description>
+        </attribute>
+
+        <attribute name="startyear"
+                   type="integer"
+                   programs="htsearch"
+                   version="3.1.6"
+                   category="Searching:Method" >
+            <default>1970</default>
+            <example>2001</example>
+            <description>
+                This specifies the year of the cutoff start date for
+                search results. If the start or end date are specified,
+                only results with a last modified date within this
+                range are shown.
+                See also <ref type="attr">startday</ref>,
+                <ref type="attr">startmonth</ref>,
+                <ref type="attr">endday</ref>,
+                <ref type="attr">endmonth</ref>,
+                <ref type="attr">endyear</ref>.
+                These are most usefully specified as a
+                <a href="hts_form.html#startyear">CGI argument</a>.<br/>
+                For each component, if a negative number is given,
+                it is taken as relative to the current date.
+                Relative days can span several months or even years if desired,
+                and relative months can span several years. A startday of
+                -90 will select matching documents modified within
+                the last 90 days.
+            </description>
+        </attribute>
+
+        <attribute name="substring_max_words"
+                   type="integer"
+                   programs="htsearch"
+                   version="3.0.8b1"
+                   category="Searching:Method" >
+            <default>25</default>
+            <example>100</example>
+            <description>
+                The Substring fuzzy algorithm could potentially match a
+                very large number of words. This value limits the
+                number of words each substring pattern can match. Note
+                that this does not limit the number of documents that
+                are matched in any way. 
+            </description>
+        </attribute>
+
+        <attribute name="synonym_db"
+                   type="string"
+                   programs="htsearch htfuzzy"
+                   version="3.0"
+                   category="File Layout" >
+            <default>${common_dir}/synonyms.db</default>
+            <example>${database_base}.syn.db</example>
+            <description>
+                Points to the database that <ref type="program">htfuzzy</ref> creates when the <strong>synonyms</strong>
+                algorithm is used.<br/>
+                <ref type="program">htsearch</ref>
+                uses this to perform synonym dictionary lookups.
+            </description>
+        </attribute>
+
+        <attribute name="synonym_dictionary"
+                   type="string"
+                   programs="htfuzzy"
+                   version="3.0"
+                   category="File Layout" >
+            <default>${common_dir}/synonyms</default>
+            <example>/usr/dict/synonyms</example>
+            <description>
+                This points to a text file containing the synonym
+                dictionary used for the synonyms search algorithm.<br/>
+                Each line of this file has at least two words. The
+                first word is the word to replace, the rest of the
+                words are synonyms for that word.
+            </description>
+        </attribute>
+
+        <attribute name="syntax_error_file"
+                   type="string"
+                   programs="htsearch"
+                   version="all"
+                   category="Presentation:Files" >
+            <default>${common_dir}/syntax.html</default>
+            <example>${common_dir}/synerror.html</example>
+            <description>
+                This points to the file which will be displayed if a
+                boolean expression syntax error was found.
+            </description>
+        </attribute>
+
+        <attribute name="tcp_max_retries"
+                   type="integer"
+                   programs="htdig"
+                   version="3.2.0b1"
+                   category="Indexing:Connection"
+                   block="Server" >
+            <default>1</default>
+            <example>6</example>
+            <description>
+                This option sets the maximum number of attempts when a connection
+                <ref type="attr">timeout</ref> occurs.
+                After all these retries, the connection attempt is reported as timed out. 
+            </description>
+        </attribute>
+
+        <attribute name="tcp_wait_time"
+                   type="integer"
+                   programs="htdig"
+                   version="3.2.0b1"
+                   category="Indexing:Connection"
+                   block="Server" >
+            <default>5</default>
+            <example>10</example>
+            <description>
+                This attribute sets the wait time (in seconds) after a connection
+                fails and the <ref type="attr">timeout</ref> is raised.
+            </description>
+        </attribute>
+
+        <attribute name="template_map"
+                   type="quoted_string_list"
+                   programs="htsearch"
+                   version="3.0"
+                   category="Presentation:Files,Searching:UI" >
+            <default>Long builtin-long builtin-long Short builtin-short builtin-short</default>
+            <example>Short short ${common_dir}/short.html \
+    Normal normal builtin-long \
+    Detailed detail ${common_dir}/detail.html
+</example>
+            <description>
+                This maps match template names to internal names and
+                template file names. It is a list of triplets. The
+                first element in each triplet is the name that will be
+                displayed in the FORMAT menu. The second element is the
+                name used internally and the third element is a
+                filename of the template to use.<br/>
+                There are two predefined templates, namely <strong>
+                builtin-long</strong> and <strong>
+                builtin-short</strong>. If the filename is one of
+                those, they will be used instead.<br/>
+                More information about templates can be found in the
+                <ref type="program">htsearch</ref>
+                documentation. The particular template is selected by the
+                <a href="hts_form.html#format">format</a> cgi argument, and the
+                default is given by <ref type="attr">template_name</ref> in
+                the config file.
+            </description>
+        </attribute>
+
+        <attribute name="template_name"
+                   type="string"
+                   programs="htsearch"
+                   version="3.0"
+                   category="Searching:UI,Presentation:How" >
+            <default>builtin-long</default>
+            <example>long</example>
+            <description>
+                Specifies the default template if no
+                <a href="hts_form.html#format">format</a> field is given by the
+                search form. 
This needs to map to the + <ref type="attr">template_map</ref>. + </description> + </attribute> + + <attribute name="template_patterns" + type="string_list" + programs="htsearch" + version="3.1.4" + category="Presentation:How" > + <default></default> + <example>http://www.sdsu.edu ${common_dir}/sdsu.html \ + http://www.ucsd.edu ${common_dir}/ucsd.html +</example> + <description> + This attribute allows the results template to be changed + depending on the URL or the match it is used for. This + is mainly to make a visual distinction between matches + on different web sites. The results for each site could + thus be shown in a style matching that site.<br/> + The format is simply a list of pairs. The first + element of each pair is a pattern, the second element + is the name of the template file for that pattern.<br/> + More information about templates can be found in the + <ref type="program">htsearch</ref> + documentation.<br/> + Normally, when using this template selection method, you + would disable user selection of templates via the <strong>format</strong> + input parameter in search forms, as the two methods were not + really designed to interact. Templates selected by URL patterns + would override any user selection made in the form. If you want + to use the two methods together, see the notes on + <a href="hts_selectors.html#template_patterns">combining</a> + them for an example of how to do this. + </description> + </attribute> + + <attribute name="text_factor" + type="number" + programs="htsearch" + version="3.0" + category="Searching:Ranking" > + <default>1</default> + <example>0</example> + <description> + This is a factor which will be used to multiply the + weight of words that are not in any special part of a + document. Setting a factor to 0 will cause normal words + to be ignored. The number may be a floating point + number. See also the <ref type="attr">heading_factor</ref> + attribute. 
+ </description> + </attribute> + + <attribute name="timeout" + type="integer" + programs="htdig" + version="all" + category="Indexing:Connection" + block="Server" > + <default>30</default> + <example>42</example> + <description> + Specifies the time the digger will wait to complete a + network read. This is just a safeguard against + unforeseen things like the all too common + transformation from a network to a notwork.<br/> + The timeout is specified in seconds. + </description> + </attribute> + + <attribute name="title_factor" + type="number" + programs="htsearch" + version="all" + category="Searching:Ranking" > + <default>100</default> + <example>12</example> + <description> + This is a factor which will be used to multiply the + weight of words in the title of a document. Setting a + factor to 0 will cause words in the title to be + ignored. The number may be a floating point number. See + also the <ref type="attr">heading_factor</ref> attribute. + </description> + </attribute> + + <attribute name="url_list" + type="string" + programs="htdig" + version="all" + category="Extra Output" > + <default>${database_base}.urls</default> + <example>/tmp/urls</example> + <description> + This file is only created if + <ref type="attr">create_url_list</ref> is set to + true. It will contain a list of all URLs that were + seen. + </description> + </attribute> + + <attribute name="url_log" + type="string" + programs="htdig" + version="3.1.0" + category="Extra Output" > + <default>${database_base}.log</default> + <example>/tmp/htdig.progress</example> + <description> + If <ref type="program">htdig</ref> is run with the -l option + and interrupted, it will write out its progress to this + file. Note that if it has a large number of URLs to write, + it may take some time to exit. This can especially happen + when running update digs and the run is interrupted soon + after beginning. 
+ </description> + </attribute> + + <attribute name="url_part_aliases" + type="string_list" + programs="all" + version="3.1.0" + category="URLs" > + <default></default> + <example>http://search.example.com/~htdig *site \ + http://www.htdig.org/this/ *1 \ + .html *2 + </example> + <example>http://www.htdig.org/ *site \ + http://www.htdig.org/that/ *1 \ + .htm *2 +</example> + <description> + A list of translations pairs <em>from</em> and + <em>to</em>, used when accessing the database. + If a part of an URL matches with the + <em>from</em>-string of each pair, it will be + translated into the <em>to</em>-string just before + writing the URL to the database, and translated + back just after reading it from the database.<br/> + This is primarily used to provide an easy way to + rename parts of URLs for e.g. changing + www.example.com/~htdig to www.htdig.org. Two + different configuration files for digging and + searching are then used, with url_part_aliases + having different <em>from</em> strings, but + identical <em>to</em>-strings.<br/> + See also <ref type="attr">common_url_parts</ref>.<br/> + Strings that are normally incorrect in URLs or + very seldom used, should be used as + <em>to</em>-strings, since extra storage will be + used each time one is found as normal part of a + URL. Translations will be performed with priority + for the leftmost longest match. Each + <em>to</em>-string must be unique and not be a + part of any other <em>to</em>-string.<br/> + Note that when this attribute is changed, the + database should be rebuilt, unless the effect of + "moving" the affected URLs in the database is + wanted, as described above.<br/> + <strong>Please note:</strong> Don't just copy the + example below into a single configuration file. 
+                There are two separate settings of
+                <em>url_part_aliases</em> below; the first one is
+                for the configuration file to be used by htdig,
+                htmerge, and htnotify, and the second one is for the
+                configuration file to be used by htsearch.
+            </description>
+        </attribute>
+
+        <attribute name="url_rewrite_rules"
+                   type="string_list"
+                   programs="htdig"
+                   version="3.2.0b3"
+                   category="URLs" >
+            <default></default>
+            <example>(.*)\\?JServSessionIdroot=.* \\1 \
+                (.*)\\&JServSessionIdroot=.* \\1 \
+                (.*)&context=.* \\1</example>
+            <description>
+                This is a list of pairs, <em>regex</em> <em>replacement</em> used to
+                permanently rewrite URLs as they are indexed. The left hand string is
+                a regex; the right hand string is a literal string with embedded
+                placeholders for fragments that matched inside brackets in the
+                regex. \0 is the whole matched string, \1 to \9 are bracketed
+                substrings. Rewrite rules are applied sequentially to each
+                incoming URL before normalization occurs. Rewriting does not stop
+                once a match has been made, so multiple rules may affect a given URL.
+                See also <ref type="attr">url_part_aliases</ref> which
+                allows URLs to be of one form during indexing and translated
+                for results.
+            </description>
+        </attribute>
+
+        <attribute name="url_seed_score"
+                   type="string_list"
+                   programs="htsearch"
+                   version="3.2.0b2"
+                   category="Searching:Ranking" >
+            <default></default>
+            <example>/mailinglist/ *.5-1e6
+                /docs/|/news/ *1.5
+                /testresults/ "*.7 -200"
+                /faq-area/ *2+10000</example>
+            <description>
+                This is a list of pairs, <em>pattern</em>
+                <em>formula</em>, used to weigh the score of
+                hits, depending on the URL of the document.<br/>
+                The <em>pattern</em> part is a substring to match
+                against the URL. Pipe ('|') characters can be
+                used in the pattern to concatenate substrings for
+                web-areas that have the same formula.<br/>
+                The formula describes a <em>factor</em> and a
+                <em>constant</em>, by which the hit score is
+                weighed. 
The <em>factor</em> part is multiplied
+ to the original score, then the <em>constant</em>
+ part is added.<br/>
+ The format of the formula is the factor part:
+ "*<em>N</em>" optionally followed by comma and
+ spaces, followed by the constant part :
+ "+<em>M</em>", where the plus sign may be omitted
+ for negative numbers. Either part is optional,
+ but must come in this order.<br/>
+ The numbers <em>N</em> and <em>M</em> are floating
+ point constants.<br/>
+ More straightforward is to think of the format as
+ "newscore = oldscore*<em>N</em>+<em>M</em>",
+ but with the "newscore = oldscore" part left out.
+ </description>
+ </attribute>
+
+ <attribute name="url_text_factor"
+ type="number"
+ programs="htsearch"
+ version="??"
+ category="Searching:Ranking" >
+ <default>1</default>
+ <example>1</example>
+ <description>
+ TO BE COMPLETED<br/>
+ See also <ref type="attr">heading_factor</ref>.
+ </description>
+ </attribute>
+
+ <attribute name="use_doc_date"
+ type="boolean"
+ programs="htdig"
+ version="3.2.0b1"
+ category="Indexing:How" >
+ <default>false</default>
+ <example>true</example>
+ <description>
+ If set to true, htdig will use META date tags in documents,
+ overriding the modification date returned by the server.
+ Any documents that do not have META date tags will retain
+ the last modified date returned by the server or found on
+ the local file system.
+ </description>
+ </attribute>
+
+ <attribute name="use_meta_description"
+ type="boolean"
+ programs="htsearch"
+ version="3.1.0b1"
+ category="Presentation:How" >
+ <default>false</default>
+ <example>true</example>
+ <description>
+ If set to true, any META description tags will be used as
+ excerpts by htsearch. Any documents that do not have META
+ descriptions will retain their normal excerpts. 
+ </description>
+ </attribute>
+
+ <attribute name="use_star_image"
+ type="boolean"
+ programs="htsearch"
+ version="all"
+ category="Presentation:How" >
+ <default>true</default>
+ <example>no</example>
+ <description>
+ If set to true, the <ref type="attr">star_image</ref> attribute is used to display up to
+ <ref type="attr">max_stars</ref> images for
+ each match.
+ </description>
+ </attribute>
+
+ <attribute name="user_agent"
+ type="string"
+ programs="htdig"
+ version="3.1.0b2"
+ category="Indexing:Out"
+ block="Server" >
+ <default>htdig</default>
+ <example>htdig-digger</example>
+ <description>
+ This allows customization of the user_agent: field sent when
+ the digger requests a file from a server.
+ </description>
+ </attribute>
+
+ <attribute name="valid_extensions"
+ type="string_list"
+ programs="htdig"
+ version="3.1.4"
+ category="Indexing:Where"
+ block="URL" >
+ <default></default>
+ <example>.html .htm .shtml</example>
+ <description>
+ This is a list of extensions on URLs which are
+ the only ones considered acceptable. This list is used to
+ supplement the MIME-types that the HTTP server provides
+ with documents. Some HTTP servers do not have a correct
+ list of MIME-types and so can advertise certain
+ documents as text while they are some binary format.
+ If the list is empty, then all extensions are acceptable,
+ provided they pass other criteria for acceptance or rejection.
+ If the list is not empty, only documents with one of the
+ extensions in the list are parsed.
+ See also <ref type="attr">bad_extensions</ref>.
+ </description>
+ </attribute>
+
+ <attribute name="valid_punctuation"
+ type="string"
+ programs="htdig htsearch"
+ version="all"
+ category="Indexing:What" >
+ <default>.-_/!#$%^&'</default>
+ <example>-'</example>
+ <description>
+ This is the set of characters which will be deleted
+ from the document before determining what a word is. 
+ This means that if a document contains something like + <code>Andrew's</code> the digger will see this as <code> + Andrews</code>.<br/> + The same transformation is performed on the keywords + the search engine gets.<br/> + See also the <ref type="attr">extra_word_characters</ref> + attribute. + </description> + </attribute> + + <attribute name="version" + type="string" + programs="htsearch" + version="all" + category="Presentation:Text" > + <default configmacro="true">VERSION</default> + <example>3.2.0</example> + <description> + This specifies the value of the VERSION + variable which can be used in search templates. + The default value of this attribute is determined + at compile time, and will not normally be set + in configuration files. + </description> + </attribute> + + <attribute name="word_db" + type="string" + programs="all" + version="all" + category="File Layout" > + <default>${database_base}.words.db</default> + <example>${database_base}.allwords.db</example> + <description> + This is the main word database. It is an index of all + the words to a list of documents that contain the + words. This database can grow large pretty quickly. + </description> + </attribute> + + <attribute name="word_dump" + type="string" + programs="htdig htdump htload" + version="3.2.0b1" + category="File Layout" > + <default>${database_base}.worddump</default> + <example>/tmp/words.txt</example> + <description> + This file is basically a text version of the file + specified in <ref type="attr">word_db</ref>. Its + only use is to have a human readable database of all + words. The file is easy to parse with tools like + perl or tcl. 
+ </description>
+ </attribute>
+
+ <attribute name="wordlist_cache_size"
+ type="integer"
+ programs="all"
+ version="3.2.0b1"
+ category="Indexing:How" >
+ <default>10000000</default>
+ <example>40000000</example>
+ <description>
+ Size of memory cache used by Berkeley DB (DB used by the indexer)
+ IMPORTANT: It makes a <strong>huge</strong> difference. The rule
+ is that the cache size should be at least 2% of the expected index size. The
+ Berkeley DB file has 1% of internal pages that *must* be cached for good
+ performances. Giving an additional 1% leaves room for caching leaf pages.
+ </description>
+ </attribute>
+
+ <attribute name="wordlist_compress"
+ type="boolean"
+ programs="all"
+ version="3.2.0b1"
+ category="Indexing:How" >
+ <default>true</default>
+ <example>true</example>
+ <description>
+ Enables or disables the default compression system for the indexer.
+ This currently compresses the index by a factor of 8. If the
+ Zlib library is not found on the system, the default is false.
+ </description>
+ </attribute>
+
+ <attribute name="wordlist_compress_zlib"
+ type="boolean"
+ programs="all"
+ version="3.2.0b4"
+ category="Indexing:How" >
+ <default>true</default>
+ <example>true</example>
+ <description>
+ Enables or disables the zlib compression system for the indexer.
+ wordlist_compress must be true to use this option!
+ </description>
+ </attribute>
+
+ <attribute name="wordlist_monitor"
+ type="boolean"
+ programs="all"
+ version="3.2.0b1"
+ category="Extra Output" >
+ <default>false</default>
+ <example>true</example>
+ <description>
+ This enables monitoring of what's happening in the indexer.
+ It can help to detect performance/configuration problems.
+ </description>
+ </attribute>
+
+ <attribute name="wordlist_monitor_period"
+ type="number"
+ programs="all"
+ version="3.2.0b1"
+ category="Extra Output" >
+ <default>0</default>
+ <example>.1</example>
+ <description>
+ Sets the number of seconds between each monitor output. 
+ </description>
+ </attribute>
+
+ <attribute name="wordlist_monitor_output"
+ type="string"
+ programs="all"
+ version="3.2.0b1"
+ category="Extra Output" >
+ <default></default>
+ <example>myfile</example>
+ <description>
+ Print monitoring output on file instead of the default stderr.
+ </description>
+ </attribute>
+
+ <attribute name="wordlist_page_size"
+ type="integer"
+ programs="all"
+ version="3.2.0b1"
+ category="Indexing:How" >
+ <default>0</default>
+ <example>8192</example>
+ <description>
+ Size of pages used by Berkeley DB (DB used by the indexer)
+ </description>
+ </attribute>
+
+ <attribute name="wordlist_verbose"
+ type="integer"
+ programs=""
+ version=""
+ category="" >
+ <default></default>
+ <example>true</example>
+ <description>
+ wordlist_verbose 1 walk logic<br/>
+ wordlist_verbose 2 walk logic details<br/>
+ wordlist_verbose 3 walk logic lots of details<br/>
+ </description>
+ </attribute>
+
+ <attribute name="wordlist_wordkey_description"
+ type="string"
+ programs="all"
+ version="3.2.0b1"
+ category="Indexing:How" >
+ <default>Word/DocID 32/Flags 8/Location 16</default>
+ <nodocs/>
+ </attribute>
+
+ <attribute name="wordlist_wordrecord_description"
+ type="string"
+ programs="all"
+ version="3.2.0b1"
+ category="Indexing:How" >
+ <default>DATA</default>
+ <nodocs/>
+ </attribute>
+
+</HtdigAttributes> |