diff options
author | Slávek Banko <[email protected]> | 2021-11-05 13:28:23 +0100 |
---|---|---|
committer | Slávek Banko <[email protected]> | 2021-11-05 13:28:23 +0100 |
commit | 8c787c3591c1c885b91a54128835b400858c5cca (patch) | |
tree | eca1b776912a305c4d45b3964038278a2fae1ead /debian/htdig/htdig-3.2.0b6/htfuzzy | |
parent | fe188b907cdf30dfdfe0eba9412e7f8749fec158 (diff) | |
download | extra-dependencies-8c787c3591c1c885b91a54128835b400858c5cca.tar.gz extra-dependencies-8c787c3591c1c885b91a54128835b400858c5cca.zip |
DEB htdig: Added to repository.
Signed-off-by: Slávek Banko <[email protected]>
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htfuzzy')
31 files changed, 3965 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/.cvsignore b/debian/htdig/htdig-3.2.0b6/htfuzzy/.cvsignore new file mode 100644 index 00000000..c0c9bc11 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/.cvsignore @@ -0,0 +1,8 @@ +Makefile +*.lo +*.la +.purify +.pure +.deps +.libs +htfuzzy diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Accents.cc b/debian/htdig/htdig-3.2.0b6/htfuzzy/Accents.cc new file mode 100644 index 00000000..619615a6 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Accents.cc @@ -0,0 +1,168 @@ +// +// Accents.cc +// +// Accents: A fuzzy matching algorithm by Robert Marchand, to treat all +// ISO-8859-1 accented letters as equivalent to their unaccented +// counterparts. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 2000-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Accents.cc,v 1.5 2004/05/28 13:15:19 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "Configuration.h" +#include "Accents.h" +#include "Dictionary.h" +#include <ctype.h> + +#ifdef HAVE_STD +#include <fstream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <fstream.h> +#endif /* HAVE_STD */ + +extern int debug; + +/*-------------------------------------------------------------------. +| Ajoute par Robert Marchand pour permettre le traitement adequat de | +| l'ISO-LATIN (provient du code de Pierre Rosa) | +`-------------------------------------------------------------------*/ + +/*--------------------------------------------------. 
+| table iso-latin1 "minusculisee" et "de-accentuee" | +`--------------------------------------------------*/ + +static char MinusculeISOLAT1[256] = { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 'a', 'b', 'c', 'd', 'e', 'f', 'g', + 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', + 'x', 'y', 'z', 91, 92, 93, 94, 95, + 96, 'a', 'b', 'c', 'd', 'e', 'f', 'g', + 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', + 'x', 'y', 'z', 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, + 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, + 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, + 168, 168, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, + 184, 185, 186, 187, 188, 189, 190, 191, + 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', + 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i', + 208, 'n', 'o', 'o', 'o', 'o', 'o', 'o', + 'o', 'u', 'u', 'u', 'u', 'y', 222, 223, + 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', + 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i', + 240, 'n', 'o', 'o', 'o', 'o', 'o', 'o', + 'o', 'u', 'u', 'u', 'u', 'y', 254, 255}; + + +//***************************************************************************** +// Accents::Accents(const HtConfiguration& config_arg) +// +Accents::Accents(const HtConfiguration& config_arg) : + Fuzzy(config_arg) +{ + name = "accents"; +} + + +//***************************************************************************** +// Accents::~Accents() +// +Accents::~Accents() +{ +} + +//***************************************************************************** +// void Accents::generateKey(char *word, String &key) +// +void +Accents::generateKey(char *word, String &key) +{ 
+ static int maximum_word_length = config.Value("maximum_word_length", 12); + + if (!word || !*word) + return; + + String temp(word); + if (temp.length() > maximum_word_length) + temp.chop(temp.length()-maximum_word_length); + word = temp.get(); + key = '0'; + while (*word) { + key << MinusculeISOLAT1[ (unsigned char) *word++ ]; + } +} + + +//***************************************************************************** +// void Accents::addWord(char *word) +// +void +Accents::addWord(char *word) +{ + if (!dict) + { + dict = new Dictionary; + } + + String key; + generateKey(word, key); + + // Do not add fuzzy key as a word, will be added at search time. + if (mystrcasecmp(word, key.get()) == 0) + return; + + String *s = (String *) dict->Find(key); + if (s) + { + // if (mystrcasestr(s->get(), word) != 0) + (*s) << ' ' << word; + } + else + { + dict->Add(key, new String(word)); + } +} + + +//***************************************************************************** +// void Accents::getWords(char *word, List &words) +// +void +Accents::getWords(char *word, List &words) +{ + + if (!word || !*word) + return; + + Fuzzy::getWords(word, words); + + // fuzzy key itself is always searched. + String fuzzyKey; + generateKey(word, fuzzyKey); + if (mystrcasecmp(fuzzyKey.get(), word) != 0) + words.Add(new String(fuzzyKey)); +} diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Accents.h b/debian/htdig/htdig-3.2.0b6/htfuzzy/Accents.h new file mode 100644 index 00000000..b736c682 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Accents.h @@ -0,0 +1,41 @@ +// +// Accents.h +// +// Accents: A fuzzy matching algorithm by Robert Marchand, to treat all +// ISO-8859-1 accented letters as equivalent to their unaccented +// counterparts. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 2000-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Accents.h,v 1.4 2004/05/28 13:15:20 lha Exp $ +// +// +#ifndef _Accents_h_ +#define _Accents_h_ + +#include "Fuzzy.h" + +class Accents : public Fuzzy +{ +public: + // + // Construction/Destruction + // + Accents(const HtConfiguration& config_arg); + virtual ~Accents(); + + virtual void generateKey(char *word, String &key); + + virtual void addWord(char *word); + + virtual void getWords(char *word, List &words); + +private: +}; + +#endif + diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Endings.cc b/debian/htdig/htdig-3.2.0b6/htfuzzy/Endings.cc new file mode 100644 index 00000000..e8176677 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Endings.cc @@ -0,0 +1,175 @@ +// +// Endings.cc +// +// Endings: A fuzzy matching algorithm to match the grammatical endings rules +// used by the ispell dictionary files. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Endings.cc,v 1.12 2004/05/28 13:15:20 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <fcntl.h> + +#include "StringList.h" +#include "Endings.h" +#include "htfuzzy.h" +#include "HtConfiguration.h" + + +//***************************************************************************** +// Endings::Endings() +// +Endings::Endings(const HtConfiguration& config_arg) : + Fuzzy(config_arg) +{ + root2word = 0; + word2root = 0; + name = "endings"; +} + + +//***************************************************************************** +// Endings::~Endings() +// +Endings::~Endings() +{ + if (root2word) + { + root2word->Close(); + delete root2word; + root2word = 0; + } + + if (word2root) + { + word2root->Close(); + delete word2root; + word2root = 0; + } +} + + +//***************************************************************************** +// void Endings::getWords(char *word, String &words) +// Return a list of words with some common English word endings. +// +void +Endings::getWords(char *w, List &words) +{ + if (!word2root || !root2word) + return; + + String data; + + String word = w; + word.lowercase(); + HtStripPunctuation(word); + String saveword = word.get(); + + // + // Look for word's root(s). Some words may have more than one root, + // so handle them all. Whether or not a word has a root, it's assumed + // to be root in itself. + // + if (word2root->Get(word, data) == OK) + word << ' ' << data; + + StringList roots(word, " "); + Object *root; + roots.Start_Get(); + while ((root = roots.Get_Next()) != 0) + { + // + // Found a root. Look for new words that have this root. 
+ // + word = ((String *)root)->get(); + if (root2word->Get(word, data) == OK) + word << ' ' << data; + + // + // Iterate through the root's permutations + // + char *token = strtok(word.get(), " "); + while (token) + { + if (mystrcasecmp(token, saveword.get()) != 0) + { + // + // This permutation isn't the original word, so we add it + // to the list if it's not already there. + // + Object *obj; + words.Start_Get(); + while((obj = words.Get_Next()) != 0) + { + if (mystrcasecmp(token, ((String *)obj)->get()) == 0) + break; + } + if (obj == 0) + words.Add(new String(token)); + } + token = strtok(0, " "); + } + } +} + + +//***************************************************************************** +// void Endings::generateKey(char *word, String &key) +// Not needed. +void +Endings::generateKey(char *, String &) +{ +} + + +//***************************************************************************** +// void Endings::addWord(char *word) +// Not needed. +void +Endings::addWord(char *) +{ +} + + +//***************************************************************************** +// int Endings::openIndex() +// Dummy method. Just make sure we don't actually create a database. +// +int +Endings::openIndex() +{ + String filename = config["endings_word2root_db"]; + word2root = Database::getDatabaseInstance(DB_BTREE); + if (word2root->OpenRead((char*)filename) == NOTOK) + return NOTOK; + + filename = config["endings_root2word_db"]; + root2word = Database::getDatabaseInstance(DB_BTREE); + if (root2word->OpenRead((char*)filename) == NOTOK) + return NOTOK; + + return OK; +} + + +//***************************************************************************** +// int Endings::writeDB() +// Dummy method. Just making sure we don't actually write anything. 
+// +int +Endings::writeDB() +{ + return OK; +} + + diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Endings.h b/debian/htdig/htdig-3.2.0b6/htfuzzy/Endings.h new file mode 100644 index 00000000..fa00b09e --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Endings.h @@ -0,0 +1,58 @@ +// +// Endings.h +// +// Endings: A fuzzy matching algorithm to match the grammatical endings rules +// used by the ispell dictionary files. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Endings.h,v 1.7 2004/05/28 13:15:20 lha Exp $ +// + +#ifndef _Endings_h_ +#define _Endings_h_ + +#include "Fuzzy.h" + +class Dictionary; +class String; +class List; + + +class Endings : public Fuzzy +{ +public: + // + // Construction/Destruction + // + Endings(const HtConfiguration& config_arg); + virtual ~Endings(); + + virtual void getWords(char *word, List &words); + virtual void generateKey(char *word, String &key); + virtual void addWord(char *word); + virtual int openIndex(); + virtual int writeDB(); + + // + // Special member which will create the two databases needed for this + // algorithm. 
+ // + int createDB(const HtConfiguration &config); + + static void mungeWord(char *, String &); + +private: + Database *root2word; + Database *word2root; + + int createRoot(Dictionary &, char *, char *, const String&); + int readRules(Dictionary &, const String&); + void expandWord(String &, List &, Dictionary &, char *, char *); +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/EndingsDB.cc b/debian/htdig/htdig-3.2.0b6/htfuzzy/EndingsDB.cc new file mode 100644 index 00000000..81dec74b --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/EndingsDB.cc @@ -0,0 +1,441 @@ +// +// EndingsDB.cc +// +// EndingsDB: Implementation of the private endings database +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: EndingsDB.cc,v 1.17 2004/05/28 13:15:20 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <fcntl.h> + +#include "Endings.h" +#include "htfuzzy.h" +#include "SuffixEntry.h" +#include "Dictionary.h" +#include "List.h" +#include "HtConfiguration.h" + +#include "filecopy.h" + +// This is an attempt to get around compatibility problems +// with the included regex +#ifdef _MSC_VER /* _WIN32 */ +#include "regex_win32.h" +#else +# ifdef USE_RX +# include <rxposix.h> +# else // Use regex +# ifdef HAVE_BROKEN_REGEX +# include <regex.h> +# else // include regex code and header +# include "gregex.h" +# endif +# endif +#endif //_MSC_VER /* _WIN32 */ + +#include <stdio.h> +#include <stdlib.h> +#include <sys/stat.h> + +#ifdef HAVE_STD +#include <fstream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <fstream.h> +#endif /* HAVE_STD */ + +//***************************************************************************** +// +int 
+Endings::createDB(const HtConfiguration &config) +{ + Dictionary rules; + String tmpdir = getenv("TMPDIR"); + String word2root, root2word; + +#if defined(LIBHTDIG) || defined(LIBHTDIGPHP) || defined(_MSC_VER) //WIN32 + int ret = -1; + char * source = NULL; + char * dest = NULL; +#endif + + if (tmpdir.length()) + { + word2root = tmpdir; + root2word = tmpdir; + } + else + { + word2root = "/tmp"; + root2word = "/tmp"; + } + + word2root << "/word2root.db"; + root2word << "/root2word.db"; + + if (debug) + cout << "htfuzzy/endings: Reading rules\n"; + + if (readRules(rules, config["endings_affix_file"]) == NOTOK) + return NOTOK; + + if (debug) + cout << "htfuzzy/endings: Creating databases\n"; + + if (createRoot(rules, word2root, root2word, + config["endings_dictionary"]) == NOTOK) + return NOTOK; + + // + // Since we used files in TMPDIR for our temporary databases, we need + // to now move them to the correct location as defined in the config + // database. + // + +#if defined(LIBHTDIG) || defined(LIBHTDIGPHP) || defined(_MSC_VER) //WIN32 + + //Uses file_copy function - works on Unix/Linux & WinNT + source = root2word.get(); + dest = (char *)config["endings_root2word_db"].get(); + + //Attempt rename, if fail attempt copy & delete. + ret = rename(source, dest); + if (ret < 0) + { + ret = file_copy(source, dest, FILECOPY_OVERWRITE_ON); + if (ret == TRUE) + unlink(source); + else + return NOTOK; + } + + source = word2root.get(); + dest = (char *)config["endings_word2root_db"].get(); + + //Attempt rename, if fail attempt copy & delete. 
+ ret = rename(source, dest); + if (ret < 0) + { + ret = file_copy(source, dest, FILECOPY_OVERWRITE_ON); + if (ret == TRUE) + unlink(source); + else + return NOTOK; + } + +#else //This code uses a system call - Phase this out + + struct stat stat_buf; + String mv("mv"); // assume it's in the PATH if predefined setting fails + if ((stat(MV, &stat_buf) != -1) && S_ISREG(stat_buf.st_mode)) + mv = MV; + system(form("%s %s %s;%s %s %s", + mv.get(), root2word.get(), config["endings_root2word_db"].get(), + mv.get(), word2root.get(), config["endings_word2root_db"].get())); + +#endif + + return OK; + +} + + +//***************************************************************************** +int +Endings::readRules(Dictionary &rules, const String& rulesFile) +{ + FILE *fl = fopen(rulesFile, "r"); + + if (fl == NULL) + return NOTOK; + + int inSuffixes = 0; + char currentSuffix[2] = " "; + char *p; + char input[1024]; + String line; + + while (fgets(input, sizeof(input), fl)) + { + if (input[0] == '\n' || input[0] == '#') + continue; + + if (mystrncasecmp(input, "suffixes", 8) == 0) + { + inSuffixes = 1; + continue; + } + else if (mystrncasecmp(input, "prefixes", 8) == 0) + { + inSuffixes = 0; + continue; + } + if (!inSuffixes) + continue; + + if (mystrncasecmp(input, "flag ", 5) == 0) + { + p = input + 5; + while (*p == '*' || *p == ' ' || *p == '\t') + p++; + currentSuffix[0] = *p; + } + else + { + line << input; + line.chop("\r\n"); + if (line.indexOf('>') > 0) + { + List *list; + SuffixEntry *se = new SuffixEntry(line); + + if (rules.Exists(currentSuffix)) + { + list = (List *) rules[currentSuffix]; + } + else + { + list = new List; + rules.Add(currentSuffix, list); + } + list->Add(se); + line = 0; + } + } + } + + fclose(fl); + return OK; +} + + +//***************************************************************************** +int +Endings::createRoot(Dictionary &rules, char *word2root, char *root2word, const String& dictFile) +{ + FILE *fl = fopen(dictFile, "r"); + if (fl == 
NULL) + return NOTOK; + + Database *w2r = Database::getDatabaseInstance(DB_BTREE); + Database *r2w = Database::getDatabaseInstance(DB_BTREE); + + w2r->OpenReadWrite(word2root, 0664); + r2w->OpenReadWrite(root2word, 0664); + + char input[1024]; + char *p; + String words; + String word; + List wordList; + int count = 0; + String data; + + while (fgets(input, sizeof(input), fl)) + { + if ((count % 100) == 0 && debug == 1) + { + cout << "htfuzzy/endings: words: " << count << '\n'; + cout.flush(); + } + count++; + + p = strchr(input, '/'); + if (p == NULL) + continue; // Only words that have legal endings are used + + *p++ = '\0'; + + mungeWord(input, word); + expandWord(words, wordList, rules, word, p); + + if (debug > 1) + cout << "htfuzzy/endings: " << word << " --> " << words << endl; + + // + // Store the root mapped to the list of expanded words. + // + r2w->Put(word, words); + + // + // For each of the expanded words, build a map to its root. + // + for (int i = 0; i < wordList.Count(); i++) + { + // + // Append to existing record if there is one. + // + data = ""; + if (w2r->Get(*(String *)wordList[i], data) == OK) + data << ' '; + data << word; + w2r->Put(*(String *)wordList[i], data); + } + } + + if (debug == 1) + cout << endl; + + fclose(fl); + w2r->Close(); + r2w->Close(); + delete w2r; + delete r2w; + + return OK; +} + + +//***************************************************************************** +// Convert a word from the dictionary format into something we can actually +// use. This means that the word will be converted to lowercase and that +// any accents will be combined into single characters. 
+// +void +Endings::mungeWord(char *input, String &word) +{ + char *p = input + 1; + + word = 0; + while (*input) + { + p = input + 1; + switch (*p) + { + case '"': // The previous character needs to get an umlaut + switch (*input) + { + case 'a': + case 'A': + word << char(228); + input += 2; + continue; + break; + case 'e': + case 'E': + word << char(235); + input += 2; + continue; + break; + case 'i': + case 'I': + word << char(239); + input += 2; + continue; + break; + case 'o': + case 'O': + word << char(246); + input += 2; + continue; + break; + case 'u': + case 'U': + word << char(252); + input += 2; + continue; + break; + } + break; + + case 'S': // See if the previous character needs to be an sz + if (*input == 's') + { + word << char(223); + input += 2; + continue; + } + else + { + word << *input; + } + break; + + default: + word << *input; + break; + } + input++; + } + word.lowercase(); +} + + +//***************************************************************************** +void +Endings::expandWord(String &words, List &wordList, + Dictionary &rules, char *word, char *suffixes) +{ + char suffix[2] = " "; + String root; + SuffixEntry *entry; + List *suffixRules; + char *p; + String rule; + + words = 0; + wordList.Destroy(); + + while (*suffixes > ' ') + { + suffix[0] = *suffixes++; + if (!rules.Exists(suffix)) + continue; + + suffixRules = (List *) rules[suffix]; + for (int i = 0; i < suffixRules->Count(); i++) + { + entry = (SuffixEntry *) (*suffixRules)[i]; + root = word; + regex_t reg; + rule = entry->rule; + if (strchr((char*)rule, '\'')) + continue; + if (debug > 2) + cout << "Applying regex '" << entry->expression << "' to " << word << endl; + regcomp(&reg, (char*)entry->expression, REG_ICASE | REG_NOSUB | REG_EXTENDED); + if (regexec(&reg, word, 0, NULL, 0) == 0) + { + // + // Matched + // + if (rule[0] == '-') + { + // + // We need to remove something... 
+ // + p = strchr((char*)rule, ','); + if (p) + { + *p++ = '\0'; + root.chop((int)strlen(rule.get()) - 1); + root << p; + } + } + else + { + root << rule; + } + root.lowercase(); + if (debug > 2) + cout << word << " with " << rule << " --> '" << root << "'\n"; + wordList.Add(new String(root)); + words << root << ' '; + } + regfree(&reg); + } + } + words.chop(1); +} diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Exact.cc b/debian/htdig/htdig-3.2.0b6/htfuzzy/Exact.cc new file mode 100644 index 00000000..d433d413 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Exact.cc @@ -0,0 +1,78 @@ +// +// Exact.cc +// +// Exact: The exact-match "fuzzy" matching. Simply returns the word (minus punctuation) +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Exact.cc,v 1.11 2004/05/28 13:15:20 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <fcntl.h> + +#include "Exact.h" +#include "htString.h" +#include "List.h" + + +//***************************************************************************** +// Exact::Exact() +// +Exact::Exact(const HtConfiguration& config_arg) : + Fuzzy(config_arg) +{ + name = "exact"; +} + + +//***************************************************************************** +// Exact::~Exact() +// +Exact::~Exact() +{ +} + + +//***************************************************************************** +void +Exact::getWords(char *w, List &words) +{ + String stripped = w; + HtStripPunctuation(stripped); + + words.Add(new String(stripped)); +} + + +//***************************************************************************** +int +Exact::openIndex() +{ + return 0; +} + + +//***************************************************************************** 
+void +Exact::generateKey(char *, String &) +{ +} + + +//***************************************************************************** +void +Exact::addWord(char *) +{ +} + + + + diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Exact.h b/debian/htdig/htdig-3.2.0b6/htfuzzy/Exact.h new file mode 100644 index 00000000..0c664ccd --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Exact.h @@ -0,0 +1,45 @@ +// +// Exact.h +// +// Exact: The exact-match "fuzzy" matching. Simply returns the word (minus punctuation) +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Exact.h,v 1.7 2004/05/28 13:15:20 lha Exp $ +// + +#ifndef _Exact_h_ +#define _Exact_h_ + +#include "Fuzzy.h" + +class Dictionary; +class String; +class List; + + +class Exact : public Fuzzy +{ +public: + // + // Construction/Destruction + // + Exact(const HtConfiguration& config_arg); + virtual ~Exact(); + + virtual void getWords(char *word, List &words); + virtual int openIndex(); + + virtual void generateKey(char *, String &); + virtual void addWord(char *); + +private: +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Fuzzy.cc b/debian/htdig/htdig-3.2.0b6/htfuzzy/Fuzzy.cc new file mode 100644 index 00000000..5777f4af --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Fuzzy.cc @@ -0,0 +1,229 @@ +// +// Fuzzy.cc +// +// Fuzzy: This is the base class for all the different types of fuzzy searches. +// We only define the interface. 
+// +// There are two main uses of classes derived from this class: +// 1) Creation of a fuzzy index +// 2) Searching for a word using the fuzzy index +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Fuzzy.cc,v 1.20 2004/05/28 13:15:20 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <fcntl.h> + +#include "Fuzzy.h" +#include "htfuzzy.h" +#include "HtConfiguration.h" +#include "List.h" +#include "StringList.h" +#include "Endings.h" +#include "Exact.h" +#include "Metaphone.h" +#include "Soundex.h" +#include "Synonym.h" +#include "Substring.h" +#include "Prefix.h" +#include "Regexp.h" +#include "Speling.h" +#include "Accents.h" + +//***************************************************************************** +// Fuzzy::Fuzzy(const HtConfiguration& config) +// +Fuzzy::Fuzzy(const HtConfiguration& config_arg) : + config(config_arg) +{ + dict = 0; + index = 0; +} + + +//***************************************************************************** +// Fuzzy::~Fuzzy() +// +Fuzzy::~Fuzzy() +{ + if (index) + { + index->Close(); + delete index; + index = 0; + } + delete dict; +} + + +//***************************************************************************** +// void Fuzzy::getWords(char *word, List &words) +// +void +Fuzzy::getWords(char *word, List &words) +{ + if (!index) + return; + if (!word || !*word) + return; + + // + // Convert the word to a fuzzy key + // + String fuzzyKey; + String data; + String stripped = word; + HtStripPunctuation(stripped); + generateKey(stripped, fuzzyKey); + if (debug > 2) + cout << "\n\tkey: " << fuzzyKey << endl; + + words.Destroy(); + + if (index->Get(fuzzyKey, data) == OK) + { + // + // Found the entry + // + char *token = 
strtok(data.get(), " "); + while (token) + { + if (mystrcasecmp(token, word) != 0) + { + words.Add(new String(token)); + } + token = strtok(0, " "); + } + } + else + { + // + // The key wasn't found. + // + } +} + + +//***************************************************************************** +// int Fuzzy::openIndex(const HtConfiguration &config) +// +int +Fuzzy::openIndex() +{ + String var = name; + var << "_db"; + const String filename = config[var]; + + index = Database::getDatabaseInstance(DB_HASH); + if (index->OpenRead(filename) == NOTOK) + { + delete index; + index = 0; + return NOTOK; + } + + return OK; +} + + +//***************************************************************************** +// int Fuzzy::writeDB(HtConfiguration &config) +// +int +Fuzzy::writeDB() +{ + String var = name; + var << "_db"; + const String filename = config[var]; + + index = Database::getDatabaseInstance(DB_HASH); + if (index->OpenReadWrite(filename, 0664) == NOTOK) + return NOTOK; + + String *s; + char *fuzzyKey; + + int count = 0; + + dict->Start_Get(); + while ((fuzzyKey = dict->Get_Next())) + { + s = (String *) dict->Find(fuzzyKey); + index->Put(fuzzyKey, *s); + + if (debug > 1) + { + cout << "htfuzzy: '" << fuzzyKey << "' ==> '" << s->get() << "'\n"; + } + count++; + if ((count % 100) == 0 && debug == 1) + { + cout << "htfuzzy: keys: " << count << '\n'; + cout.flush(); + } + } + if (debug == 1) + { + cout << "htfuzzy:Total keys: " << count << "\n"; + } + return OK; +} + + +//***************************************************************************** +// Fuzzy algorithm factory. 
+// +Fuzzy * +Fuzzy::getFuzzyByName(char *name, const HtConfiguration& config) +{ + if (mystrcasecmp(name, "exact") == 0) + return new Exact(config); + else if (mystrcasecmp(name, "soundex") == 0) + return new Soundex(config); + else if (mystrcasecmp(name, "metaphone") == 0) + return new Metaphone(config); + else if (mystrcasecmp(name, "accents") == 0) + return new Accents(config); + else if (mystrcasecmp(name, "endings") == 0) + return new Endings(config); + else if (mystrcasecmp(name, "synonyms") == 0) + return new Synonym(config); + else if (mystrcasecmp(name, "substring") == 0) + return new Substring(config); + else if (mystrcasecmp(name, "prefix") == 0) + return new Prefix(config); + else if (mystrcasecmp(name, "regex") == 0) + return new Regexp(config); + else if (mystrcasecmp(name, "speling") == 0) + return new Speling(config); + else + return 0; +} + +//***************************************************************************** +int +Fuzzy::createDB(const HtConfiguration &) +{ + return OK; +} + +void +Fuzzy::generateKey(char *, String &) +{ +} + + +void +Fuzzy::addWord(char *) +{ +} + diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Fuzzy.h b/debian/htdig/htdig-3.2.0b6/htfuzzy/Fuzzy.h new file mode 100644 index 00000000..825e357f --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Fuzzy.h @@ -0,0 +1,112 @@ +// +// Fuzzy.h +// +// Fuzzy: This is the base class for all the different types of fuzzy searches. +// We only define the interface. +// +// There are two main uses of classes derived from this class: +// 1) Creation of a fuzzy index +// 2) Searching for a word using the fuzzy index +// +// The Fuzzy classes take the raw words from the user's query and generate +// a list of words to be looked up in the database. These words are created +// using the getWords call and can either be picked off from a separate fuzzy +// database specific to the method, or by generating words on the fly. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Fuzzy.h,v 1.12 2004/05/28 13:15:20 lha Exp $ +// + +#ifndef _Fuzzy_h_ +#define _Fuzzy_h_ + +#include "Object.h" +#include "htString.h" +#include "Database.h" +#include "HtWordType.h" +#include "HtWordList.h" + +class HtConfiguration; +class Dictionary; +class List; + + +class Fuzzy : public Object +{ +public: + // + // Construction/Destruction + // + Fuzzy(const HtConfiguration& config); + virtual ~Fuzzy(); + + // + // Given a single word, generate a list of replacement words using + // the current algorithm. + // + virtual void getWords(char *word, List &words); + + // + // For the current algorithm, open the key database + // + virtual int openIndex(); + + // + // For searching, we will need to keep track of the weight associated + // with a particular fuzzy algorithm. + // + void setWeight(double w) {weight = w;} + double getWeight() {return weight;} + + //******************************************************************* + // The following are used in the creation of the fuzzy databases. + // + // For the current algorithm, write the database to disk. + // + virtual int writeDB(); + + // + // For the current algorithm, create the database. + // This is for those algorithms that don't need a list of words + // to work. + // + virtual int createDB(const HtConfiguration &config); + + // + // Given a word from the htdig word database, create the appropriate + // entries into memory which will later be written out with writeDB(). + // + virtual void addWord(char *word); + + // + // Each algorithm has a name... + // + char *getName() {return name;} + + // + // Fuzzy algorithm factory. This returns a new Fuzzy algorithm + // object that belongs to the given name. 
+ // + static Fuzzy *getFuzzyByName(char *name, const HtConfiguration& config); + +protected: + // + // Given a single word, generate a database key + // + virtual void generateKey(char *word, String &key); + + char *name; + Database *index; + Dictionary *dict; + double weight; + const HtConfiguration& config; +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Makefile.am b/debian/htdig/htdig-3.2.0b6/htfuzzy/Makefile.am new file mode 100644 index 00000000..632a5be5 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Makefile.am @@ -0,0 +1,34 @@ + +include $(top_srcdir)/Makefile.config + +LOCAL_DEFINES= -DMV=\"$(MV)\" + +pkglib_LTLIBRARIES = libfuzzy.la + +libfuzzy_la_SOURCES = Endings.cc EndingsDB.cc Exact.cc \ + Fuzzy.cc Metaphone.cc Soundex.cc Accents.cc \ + SuffixEntry.cc Synonym.cc \ + Substring.cc Prefix.cc Regexp.cc Speling.cc + +libfuzzy_la_LDFLAGS = -release $(HTDIG_MAJOR_VERSION).$(HTDIG_MINOR_VERSION).$(HTDIG_MICRO_VERSION) ${extra_ldflags} + +noinst_HEADERS = Accents.h \ + Endings.h \ + Exact.h \ + Fuzzy.h \ + Metaphone.h \ + Prefix.h \ + Regexp.h \ + Soundex.h \ + Speling.h \ + Substring.h \ + SuffixEntry.h \ + Synonym.h \ + htfuzzy.h + +bin_PROGRAMS = htfuzzy + +htfuzzy_SOURCES = htfuzzy.cc +htfuzzy_DEPENDENCIES = libfuzzy.la $(HTLIBS) +htfuzzy_LDFLAGS = $(PROFILING) ${extra_ldflags} +htfuzzy_LDADD = libfuzzy.la $(HTLIBS) diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Makefile.in b/debian/htdig/htdig-3.2.0b6/htfuzzy/Makefile.in new file mode 100644 index 00000000..6e52aba9 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Makefile.in @@ -0,0 +1,542 @@ +# Makefile.in generated by automake 1.7.9 from Makefile.am. +# @configure_input@ + +# Copyright 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003 +# Free Software Foundation, Inc. 
+# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +# +# To compile with profiling do the following: +# +# make CFLAGS=-g CXXFLAGS=-g PROFILING=-p all +# + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ +pkgdatadir = $(datadir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +top_builddir = .. + +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +INSTALL = @INSTALL@ +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +host_triplet = @host@ +ACLOCAL = @ACLOCAL@ +ALLOCA = @ALLOCA@ +AMDEP_FALSE = @AMDEP_FALSE@ +AMDEP_TRUE = @AMDEP_TRUE@ +AMTAR = @AMTAR@ +APACHE = @APACHE@ +APACHE_MODULES = @APACHE_MODULES@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CGIBIN_DIR = @CGIBIN_DIR@ +COMMON_DIR = @COMMON_DIR@ +CONFIG_DIR = @CONFIG_DIR@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DATABASE_DIR = @DATABASE_DIR@ +DEFAULT_CONFIG_FILE = @DEFAULT_CONFIG_FILE@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +ECHO = @ECHO@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +F77 = @F77@ +FFLAGS = @FFLAGS@ +FIND = @FIND@ +GUNZIP = @GUNZIP@ +HAVE_SSL = 
@HAVE_SSL@ +HTDIG_MAJOR_VERSION = @HTDIG_MAJOR_VERSION@ +HTDIG_MICRO_VERSION = @HTDIG_MICRO_VERSION@ +HTDIG_MINOR_VERSION = @HTDIG_MINOR_VERSION@ +IMAGE_DIR = @IMAGE_DIR@ +IMAGE_URL_PREFIX = @IMAGE_URL_PREFIX@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +MAINT = @MAINT@ +MAINTAINER_MODE_FALSE = @MAINTAINER_MODE_FALSE@ +MAINTAINER_MODE_TRUE = @MAINTAINER_MODE_TRUE@ +MAKEINFO = @MAKEINFO@ +MV = @MV@ +OBJEXT = @OBJEXT@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PERL = @PERL@ +RANLIB = @RANLIB@ +RRDTOOL = @RRDTOOL@ +SEARCH_DIR = @SEARCH_DIR@ +SEARCH_FORM = @SEARCH_FORM@ +SED = @SED@ +SENDMAIL = @SENDMAIL@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +TAR = @TAR@ +TESTS_FALSE = @TESTS_FALSE@ +TESTS_TRUE = @TESTS_TRUE@ +TIME = @TIME@ +TIMEV = @TIMEV@ +USER = @USER@ +VERSION = @VERSION@ +YACC = @YACC@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_F77 = @ac_ct_F77@ +ac_ct_RANLIB = @ac_ct_RANLIB@ +ac_ct_STRIP = @ac_ct_STRIP@ +am__fastdepCC_FALSE = @am__fastdepCC_FALSE@ +am__fastdepCC_TRUE = @am__fastdepCC_TRUE@ +am__fastdepCXX_FALSE = @am__fastdepCXX_FALSE@ +am__fastdepCXX_TRUE = @am__fastdepCXX_TRUE@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +datadir = @datadir@ +exec_prefix = @exec_prefix@ +extra_ldflags = @extra_ldflags@ +host = @host@ +host_alias = 
@host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +oldincludedir = @oldincludedir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +subdirs = @subdirs@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ + +AUTOMAKE_OPTIONS = foreign no-dependencies + +INCLUDES = -DDEFAULT_CONFIG_FILE=\"$(DEFAULT_CONFIG_FILE)\" \ + -I$(top_srcdir)/include -I$(top_srcdir)/htlib \ + -I$(top_srcdir)/htnet -I$(top_srcdir)/htcommon \ + -I$(top_srcdir)/htword \ + -I$(top_srcdir)/db -I$(top_builddir)/db \ + $(LOCAL_DEFINES) $(PROFILING) + + +HTLIBS = $(top_builddir)/htnet/libhtnet.la \ + $(top_builddir)/htcommon/libcommon.la \ + $(top_builddir)/htword/libhtword.la \ + $(top_builddir)/htlib/libht.la \ + $(top_builddir)/htcommon/libcommon.la \ + $(top_builddir)/htword/libhtword.la \ + $(top_builddir)/db/libhtdb.la \ + $(top_builddir)/htlib/libht.la + + +LOCAL_DEFINES = -DMV=\"$(MV)\" + +pkglib_LTLIBRARIES = libfuzzy.la + +libfuzzy_la_SOURCES = Endings.cc EndingsDB.cc Exact.cc \ + Fuzzy.cc Metaphone.cc Soundex.cc Accents.cc \ + SuffixEntry.cc Synonym.cc \ + Substring.cc Prefix.cc Regexp.cc Speling.cc + + +libfuzzy_la_LDFLAGS = -release $(HTDIG_MAJOR_VERSION).$(HTDIG_MINOR_VERSION).$(HTDIG_MICRO_VERSION) ${extra_ldflags} + +noinst_HEADERS = Accents.h \ + Endings.h \ + Exact.h \ + Fuzzy.h \ + Metaphone.h \ + Prefix.h \ + Regexp.h \ + Soundex.h \ + Speling.h \ + Substring.h \ + SuffixEntry.h \ + Synonym.h \ + htfuzzy.h + + +bin_PROGRAMS = htfuzzy + +htfuzzy_SOURCES = htfuzzy.cc +htfuzzy_DEPENDENCIES = libfuzzy.la $(HTLIBS) +htfuzzy_LDFLAGS = $(PROFILING) ${extra_ldflags} +htfuzzy_LDADD = libfuzzy.la $(HTLIBS) +subdir = htfuzzy +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +mkinstalldirs = $(SHELL) 
$(top_srcdir)/mkinstalldirs +CONFIG_HEADER = $(top_builddir)/include/config.h +CONFIG_CLEAN_FILES = +LTLIBRARIES = $(pkglib_LTLIBRARIES) + +libfuzzy_la_LIBADD = +am_libfuzzy_la_OBJECTS = Endings.lo EndingsDB.lo Exact.lo Fuzzy.lo \ + Metaphone.lo Soundex.lo Accents.lo SuffixEntry.lo Synonym.lo \ + Substring.lo Prefix.lo Regexp.lo Speling.lo +libfuzzy_la_OBJECTS = $(am_libfuzzy_la_OBJECTS) +bin_PROGRAMS = htfuzzy$(EXEEXT) +PROGRAMS = $(bin_PROGRAMS) + +am_htfuzzy_OBJECTS = htfuzzy.$(OBJEXT) +htfuzzy_OBJECTS = $(am_htfuzzy_OBJECTS) + +DEFAULT_INCLUDES = -I. -I$(srcdir) -I$(top_builddir)/include +depcomp = +am__depfiles_maybe = +CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) +LTCXXCOMPILE = $(LIBTOOL) --mode=compile $(CXX) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CXXFLAGS) $(CXXFLAGS) +CXXLD = $(CXX) +CXXLINK = $(LIBTOOL) --mode=link $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +DIST_SOURCES = $(libfuzzy_la_SOURCES) $(htfuzzy_SOURCES) +HEADERS = $(noinst_HEADERS) + +DIST_COMMON = $(noinst_HEADERS) $(srcdir)/Makefile.in \ + $(top_srcdir)/Makefile.config Makefile.am +SOURCES = $(libfuzzy_la_SOURCES) $(htfuzzy_SOURCES) + +all: all-am + +.SUFFIXES: +.SUFFIXES: .cc .lo .o .obj +$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/Makefile.config $(top_srcdir)/configure.in $(ACLOCAL_M4) + cd $(top_srcdir) && \ + $(AUTOMAKE) --foreign htfuzzy/Makefile +Makefile: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe) +pkglibLTLIBRARIES_INSTALL = $(INSTALL) +install-pkglibLTLIBRARIES: $(pkglib_LTLIBRARIES) + @$(NORMAL_INSTALL) + $(mkinstalldirs) $(DESTDIR)$(pkglibdir) + @list='$(pkglib_LTLIBRARIES)'; for p in $$list; do \ + if test -f $$p; then \ + f="`echo $$p | sed -e 's|^.*/||'`"; \ + echo " $(LIBTOOL) --mode=install 
$(pkglibLTLIBRARIES_INSTALL) $(INSTALL_STRIP_FLAG) $$p $(DESTDIR)$(pkglibdir)/$$f"; \ + $(LIBTOOL) --mode=install $(pkglibLTLIBRARIES_INSTALL) $(INSTALL_STRIP_FLAG) $$p $(DESTDIR)$(pkglibdir)/$$f; \ + else :; fi; \ + done + +uninstall-pkglibLTLIBRARIES: + @$(NORMAL_UNINSTALL) + @list='$(pkglib_LTLIBRARIES)'; for p in $$list; do \ + p="`echo $$p | sed -e 's|^.*/||'`"; \ + echo " $(LIBTOOL) --mode=uninstall rm -f $(DESTDIR)$(pkglibdir)/$$p"; \ + $(LIBTOOL) --mode=uninstall rm -f $(DESTDIR)$(pkglibdir)/$$p; \ + done + +clean-pkglibLTLIBRARIES: + -test -z "$(pkglib_LTLIBRARIES)" || rm -f $(pkglib_LTLIBRARIES) + @list='$(pkglib_LTLIBRARIES)'; for p in $$list; do \ + dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \ + test "$$dir" = "$$p" && dir=.; \ + echo "rm -f \"$${dir}/so_locations\""; \ + rm -f "$${dir}/so_locations"; \ + done +libfuzzy.la: $(libfuzzy_la_OBJECTS) $(libfuzzy_la_DEPENDENCIES) + $(CXXLINK) -rpath $(pkglibdir) $(libfuzzy_la_LDFLAGS) $(libfuzzy_la_OBJECTS) $(libfuzzy_la_LIBADD) $(LIBS) +binPROGRAMS_INSTALL = $(INSTALL_PROGRAM) +install-binPROGRAMS: $(bin_PROGRAMS) + @$(NORMAL_INSTALL) + $(mkinstalldirs) $(DESTDIR)$(bindir) + @list='$(bin_PROGRAMS)'; for p in $$list; do \ + p1=`echo $$p|sed 's/$(EXEEXT)$$//'`; \ + if test -f $$p \ + || test -f $$p1 \ + ; then \ + f=`echo "$$p1" | sed 's,^.*/,,;$(transform);s/$$/$(EXEEXT)/'`; \ + echo " $(INSTALL_PROGRAM_ENV) $(LIBTOOL) --mode=install $(binPROGRAMS_INSTALL) $$p $(DESTDIR)$(bindir)/$$f"; \ + $(INSTALL_PROGRAM_ENV) $(LIBTOOL) --mode=install $(binPROGRAMS_INSTALL) $$p $(DESTDIR)$(bindir)/$$f || exit 1; \ + else :; fi; \ + done + +uninstall-binPROGRAMS: + @$(NORMAL_UNINSTALL) + @list='$(bin_PROGRAMS)'; for p in $$list; do \ + f=`echo "$$p" | sed 's,^.*/,,;s/$(EXEEXT)$$//;$(transform);s/$$/$(EXEEXT)/'`; \ + echo " rm -f $(DESTDIR)$(bindir)/$$f"; \ + rm -f $(DESTDIR)$(bindir)/$$f; \ + done + +clean-binPROGRAMS: + @list='$(bin_PROGRAMS)'; for p in $$list; do \ + f=`echo $$p|sed 's/$(EXEEXT)$$//'`; \ + echo " rm -f $$p 
$$f"; \ + rm -f $$p $$f ; \ + done +htfuzzy$(EXEEXT): $(htfuzzy_OBJECTS) $(htfuzzy_DEPENDENCIES) + @rm -f htfuzzy$(EXEEXT) + $(CXXLINK) $(htfuzzy_LDFLAGS) $(htfuzzy_OBJECTS) $(htfuzzy_LDADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) core *.core + +distclean-compile: + -rm -f *.tab.c + +.cc.o: + $(CXXCOMPILE) -c -o $@ `test -f '$<' || echo '$(srcdir)/'`$< + +.cc.obj: + $(CXXCOMPILE) -c -o $@ `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` + +.cc.lo: + $(LTCXXCOMPILE) -c -o $@ `test -f '$<' || echo '$(srcdir)/'`$< + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +distclean-libtool: + -rm -f libtool +uninstall-info-am: + +ETAGS = etags +ETAGSFLAGS = + +CTAGS = ctags +CTAGSFLAGS = + +tags: TAGS + +ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + mkid -fID $$unique + +TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(ETAGS_ARGS)$$tags$$unique" \ + || $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$tags $$unique + +ctags: CTAGS +CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(CTAGS_ARGS)$$tags$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) 
$(CTAGS_ARGS) \ + $$tags $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && cd $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) $$here + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) + +top_distdir = .. +distdir = $(top_distdir)/$(PACKAGE)-$(VERSION) + +distdir: $(DISTFILES) + $(mkinstalldirs) $(distdir)/.. + @srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's|.|.|g'`; \ + list='$(DISTFILES)'; for file in $$list; do \ + case $$file in \ + $(srcdir)/*) file=`echo "$$file" | sed "s|^$$srcdirstrip/||"`;; \ + $(top_srcdir)/*) file=`echo "$$file" | sed "s|^$$topsrcdirstrip/|$(top_builddir)/|"`;; \ + esac; \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + dir=`echo "$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test "$$dir" != "$$file" && test "$$dir" != "."; then \ + dir="/$$dir"; \ + $(mkinstalldirs) "$(distdir)$$dir"; \ + else \ + dir=''; \ + fi; \ + if test -d $$d/$$file; then \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \ + fi; \ + cp -pR $$d/$$file $(distdir)$$dir || exit 1; \ + else \ + test -f $(distdir)/$$file \ + || cp -p $$d/$$file $(distdir)/$$file \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) $(PROGRAMS) $(HEADERS) + +installdirs: + $(mkinstalldirs) $(DESTDIR)$(pkglibdir) $(DESTDIR)$(bindir) +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + `test -z '$(STRIP)' || \ + echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install 
+mostlyclean-generic: + +clean-generic: + +distclean-generic: + -rm -f $(CONFIG_CLEAN_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." +clean: clean-am + +clean-am: clean-binPROGRAMS clean-generic clean-libtool \ + clean-pkglibLTLIBRARIES mostlyclean-am + +distclean: distclean-am + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-libtool distclean-tags + +dvi: dvi-am + +dvi-am: + +info: info-am + +info-am: + +install-data-am: + +install-exec-am: install-binPROGRAMS install-pkglibLTLIBRARIES + +install-info: install-info-am + +install-man: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: uninstall-binPROGRAMS uninstall-info-am \ + uninstall-pkglibLTLIBRARIES + +.PHONY: CTAGS GTAGS all all-am check check-am clean clean-binPROGRAMS \ + clean-generic clean-libtool clean-pkglibLTLIBRARIES ctags \ + distclean distclean-compile distclean-generic distclean-libtool \ + distclean-tags distdir dvi dvi-am info info-am install \ + install-am install-binPROGRAMS install-data install-data-am \ + install-exec install-exec-am install-info install-info-am \ + install-man install-pkglibLTLIBRARIES install-strip \ + installcheck installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-compile \ + mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ + tags uninstall uninstall-am uninstall-binPROGRAMS \ + uninstall-info-am uninstall-pkglibLTLIBRARIES + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. 
+.NOEXPORT: diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Makefile.win32 b/debian/htdig/htdig-3.2.0b6/htfuzzy/Makefile.win32 new file mode 100644 index 00000000..2d2f9a9e --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Makefile.win32 @@ -0,0 +1,35 @@ + +TARGET1 = $(LIBDIR)/libfuzzy$(LIBSFX) +TARGET2 = $(BINDIR)/htfuzzy$(EXESFX) + +# ---------------------------------------------------------------------------- +# add new library members to this list + +# ---------------------------------------------------------------------------- + +include ../Makedefs.win32 + +CXXSRC = Endings.cc EndingsDB.cc Exact.cc Fuzzy.cc Metaphone.cc \ + Soundex.cc Accents.cc SuffixEntry.cc Synonym.cc \ + Substring.cc Prefix.cc Regexp.cc Speling.cc + +CPPFLAGS += -DHAVE_CONFIG_H -I. -I../htlib -I../db -I../htcommon -I../htword + +OBJS2 = win32/htfuzzy.obj + +LDLIBS = ../lib/$(ARCH)/libhtnet.lib ../lib/$(ARCH)/libcommon.lib ../lib/$(ARCH)/libhtword.lib ../lib/$(ARCH)/libht.lib ../lib/$(ARCH)/libcommon.lib ../lib/$(ARCH)/libhtword.lib ../lib/$(ARCH)/libht.lib ../lib/$(ARCH)/libfuzzy.lib ../lib/$(ARCH)/libhtdb.lib +OTHERLIBS = ws2_32.lib L:/win32/lib/zlib114/zlib.lib + +DEPLIBS += $(LDLIBS) + +all: $(TARGET1) $(TARGET2) + +$(TARGET1): $(OBJDIRDEP) $(LIBDIRDEP) $(OBJS) + $(AR) $(ARFLAGS) $(OBJS) + +$(TARGET2): $(OBJDIRDEP) $(BINDIRDEP) $(OBJS2) $(DEPLIBS) + $(EXELD) $(LDFLAGS) $(OBJS2) $(LDLIBS) $(OTHERLIBS) + + +include ../Makerules.win32 + diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Metaphone.cc b/debian/htdig/htdig-3.2.0b6/htfuzzy/Metaphone.cc new file mode 100644 index 00000000..1f066a9d --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Metaphone.cc @@ -0,0 +1,330 @@ +// +// Metaphone.cc +// +// Metaphone: A fuzzy matching algorithm used to match words that +// sound alike in the English language. Probably not so +// good for foreign languages. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Metaphone.cc,v 1.12 2004/05/28 13:15:20 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <fcntl.h> + +#include "Metaphone.h" +#include "Dictionary.h" + +#include <ctype.h> + + +//***************************************************************************** +// Metaphone::Metaphone(const HtConfiguration& config_arg) +// +Metaphone::Metaphone(const HtConfiguration& config_arg) : + Fuzzy(config_arg) +{ + name = "metaphone"; +} + + +//***************************************************************************** +// Metaphone::~Metaphone() +// +Metaphone::~Metaphone() +{ +} + + +//***************************************************************************** +// void Metaphone::generateKey(char *word, String &key) +// +// Appends the Metaphone code of `word` to `key`. Non-alphabetic characters are +// skipped and the rest upper-cased before encoding; a null or empty word leaves +// `key` untouched. Encoding stops at the end of the word or once `key` holds +// MAXPHONEMELEN characters (a final "KS" can take it one character past that). +// +/* + * This code was copied from the slapd package developed at umich. + * it was debugged and cleaned up in February 1999 by Geoffrey Hutchison + * for the ht://Dig Project. + */ +/* + * Metaphone copied from C Gazette, June/July 1991, pp 56-57, + * author Gary A. Parker, with changes by Bernard Tiffany of the + * University of Michigan, and more changes by Tim Howes of the + * University of Michigan. + */ + +/* Character coding array */ +static char vsvfn[26] = { + 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, + /* A B C D E F G H I J K L M */ + 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0}; + /* N O P Q R S T U V W X Y Z */ + +/* Macros to access character coding array */ +#define vscode(x) ((x) >= 'A' && (x) <= 'Z' ? 
vsvfn[(x) - 'A'] : 0) +#define vowel(x) ((x) != '\0' && vscode(x) & 1) /* AEIOU */ +#define same(x) ((x) != '\0' && vscode(x) & 2) /* FJLMNR */ +#define varson(x) ((x) != '\0' && vscode(x) & 4) /* CGPST */ +#define frontv(x) ((x) != '\0' && vscode(x) & 8) /* EIY */ +#define noghf(x) ((x) != '\0' && vscode(x) & 16) /* BDH */ + +#define MAXPHONEMELEN 6 + +void +Metaphone::generateKey(char *word, String &key) +{ + if (!word || !*word) + return; + + char *n; + String ntrans; + + /* + * Copy Word to internal buffer, dropping non-alphabetic characters + * and converting to upper case + */ + + /* + * Four leading pad bytes (zeroed below) keep the *(n - 1) .. *(n - 4) + * look-behind reads of the encoder inside the buffer + */ + ntrans << "0000"; + + for (; *word; word++) + { + /* + * <ctype.h> functions are only defined for values representable as + * unsigned char (or EOF); plain char may be negative for 8-bit input + */ + if (isalpha((unsigned char) *word)) + ntrans << *word; + } + ntrans.uppercase(); + + /* ntrans[0] will always be == 0 */ + n = ntrans.get(); + *n++ = 0; + *n++ = 0; + *n++ = 0; + *n = 0; /* Pad with nulls */ + n = ntrans.get() + 4; /* Assign pointer to start */ + + /* Check for PN, KN, GN, AE, WR, WH, and X at start */ + switch (*n) + { + case 'P': + case 'K': + case 'G': + /* 'PN', 'KN', 'GN' becomes 'N' */ + if (*(n + 1) == 'N') + *n++ = 0; + break; + case 'A': + /* 'AE' becomes 'E' */ + if (*(n + 1) == 'E') + *n++ = 0; + break; + case 'W': + /* 'WR' becomes 'R', and 'WH' to 'W' */ + if (*(n + 1) == 'R') + *n++ = 0; + else if (*(n + 1) == 'H') { + *(n + 1) = *n; + *n++ = 0; + } + break; + case 'X': + /* 'X' becomes 'S' */ + *n = 'S'; + break; + } + + /* + * Now step through the string, stopping at end of string or when + * the computed 'metaph' is MAXPHONEMELEN characters long + */ + + for (; *n && key.length() < MAXPHONEMELEN; n++) + { + /* Drop duplicates except for CC */ + if (*(n - 1) == *n && *n != 'C') + continue; + /* Check for F J L M N R or first letter vowel */ + if (same(*n) || *(n - 1) == '\0' && vowel(*n)) + key << *n; + else + { + switch (*n) + { + case 'B': + /* + * B unless in -MB + */ + if (*(n + 1) || *(n - 1) != 'M') + key << *n; + break; + case 'C': + /* + * X if in -CIA-, -CH- else S if in + * -CI-, -CE-, -CY- else dropped 
if + * in -SCI-, -SCE-, -SCY- else K + */ + if (*(n - 1) != 'S' || !frontv(*(n + 1))) + { + if (*(n + 1) == 'I' && *(n + 2) == 'A') + key << 'X'; + else if (frontv(*(n + 1))) + key << 'S'; + else if (*(n + 1) == 'H') + key << (((*(n - 1) == '\0' && !vowel(*(n + 2))) + || *(n - 1) == 'S') + ? 'K' : 'X'); + else + key << 'K'; + } + break; + case 'D': + /* + * J if in DGE or DGI or DGY else T + */ + key << ((*(n + 1) == 'G' && frontv(*(n + 2))) + ? (char) 'J' : (char) 'T'); + break; + case 'G': + /* + * F if in -GH and not B--GH, D--GH, + * -H--GH, -H---GH else dropped if + * -GNED, -GN, -DGE-, -DGI-, -DGY- + * else J if in -GE-, -GI-, -GY- and + * not GG else K + * + * NOTE(review): the trailing "else if (... == 'H' ...)" branch + * looks unreachable -- whenever *(n + 1) is 'H' all three tests + * of the outer condition hold -- so -GH never yields F here. + * Behaviour deliberately left unchanged; confirm before fixing, + * since generated keys would change. + */ + if ((*(n + 1) != 'G' || vowel(*(n + 2))) && + (*(n + 1) != 'N' || (*(n + 1) && + (*(n + 2) != 'E' || + *(n + 3) != 'D'))) && + (*(n - 1) != 'D' || !frontv(*(n + 1)))) + if (frontv(*(n + 1)) && *(n + 2) != 'G') + key << 'J'; + else + key << 'K'; + else if (*(n + 1) == 'H' && !noghf(*(n - 3)) && + *(n - 4) != 'H') + key << 'F'; + break; + case 'H': + /* + * H if before a vowel and not after + * C, G, P, S, T else dropped + */ + if (!varson(*(n - 1)) && (!vowel(*(n - 1 + )) || + vowel(*(n + 1)))) + key << 'H'; + break; + case 'K': + /* + * dropped if after C else K + */ + if (*(n - 1) != 'C') + key << 'K'; + break; + case 'P': + /* + * F if before H, else P + */ + key << (*(n + 1) == 'H' ? + (char) 'F' : (char) 'P'); + break; + case 'Q': + /* + * K + */ + key << 'K'; + break; + case 'S': + /* + * X in -SH-, -SIO- or -SIA- else S + */ + key << ((*(n + 1) == 'H' || + (*(n + 1) == 'I' && (*(n + 2) == 'O' || + *(n + 2) == 'A'))) + ? 
(char) 'X' : (char) 'S'); + break; + case 'T': + /* + * X in -TIA- or -TIO- else 0 (zero) + * before H else dropped if in -TCH- + * else T + */ + if (*(n + 1) == 'I' && (*(n + 2) == 'O' || + *(n + 2) == 'A')) + key << 'X'; + else if (*(n + 1) == 'H') + key << '0'; + else if (*(n + 1) != 'C' || *(n + 2) != 'H') + key << 'T'; + break; + case 'V': + /* + * F + */ + key << 'F'; + break; + case 'W': + /* + * W if followed by a vowel, else dropped + * (falls through to share the test below) + */ + case 'Y': + /* + * Y if followed by a vowel, else dropped + */ + if (vowel(*(n + 1))) + key << *n; + break; + case 'X': + /* + * KS + */ + if (*(n - 1) == '\0') + key << 'S'; + else + key << "KS"; /* Insert K, then S */ + break; + case 'Z': + /* + * S + */ + key << 'S'; + break; + } + } + } +} + + +//***************************************************************************** +// void Metaphone::addWord(char *word) +// +// Files `word` under its Metaphone key in `dict`, creating the dictionary on +// first use. Words sharing a key are kept in one space-separated String; a +// word that produces an empty key is ignored. +// +void +Metaphone::addWord(char *word) +{ + if (!dict) + { + dict = new Dictionary; + } + + String key; + generateKey(word, key); + + if (key.length() == 0) + return; + String *s = (String *) dict->Find(key); + if (s) + { + // if (mystrcasestr(s->get(), word) != 0) + (*s) << ' ' << word; + } + else + { + dict->Add(key, new String(word)); + } +} diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Metaphone.h b/debian/htdig/htdig-3.2.0b6/htfuzzy/Metaphone.h new file mode 100644 index 00000000..6e9d0803 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Metaphone.h @@ -0,0 +1,40 @@ +// +// Metaphone.h +// +// Metaphone: A fuzzy matching algorithm used to match words that +// sound alike in the English language. Probably not so +// good for foreign languages. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Metaphone.h,v 1.7 2004/05/28 13:15:20 lha Exp $ +// + +#ifndef _Metaphone_h_ +#define _Metaphone_h_ + +#include "Fuzzy.h" + +// +// Fuzzy algorithm keyed on the Metaphone code of each word. +// +class Metaphone : public Fuzzy +{ +public: + // + // Construction/Destruction + // + Metaphone(const HtConfiguration& config_arg); + virtual ~Metaphone(); + + // + // Append the Metaphone code of word to key + // + virtual void generateKey(char *word, String &key); + + // + // Record word under its Metaphone key in the in-memory dictionary + // + virtual void addWord(char *word); + +private: +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Prefix.cc b/debian/htdig/htdig-3.2.0b6/htfuzzy/Prefix.cc new file mode 100644 index 00000000..1e254d80 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Prefix.cc @@ -0,0 +1,150 @@ +// +// Prefix.cc +// +// Prefix: The prefix fuzzy algorithm. Performs an O(log n) search for words +// matching the *prefix* specified--thus significantly faster than a full +// substring search. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Prefix.cc,v 1.17 2004/05/28 13:15:20 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <fcntl.h> + +#include "Prefix.h" +#include "htString.h" +#include "List.h" +#include "StringMatch.h" +#include "HtConfiguration.h" + + +//***************************************************************************** +// Prefix::Prefix(const HtConfiguration& config_arg) +// +Prefix::Prefix(const HtConfiguration& config_arg) : + Fuzzy(config_arg) +{ + name = "prefix"; +} + + +//***************************************************************************** +// Prefix::~Prefix() +// +Prefix::~Prefix() +{ +} + + +//***************************************************************************** +// +// Prefix search: appends to `words` a newly allocated String for each word of +// the word database ("word_db") that starts with `w`, skipping consecutive +// duplicates and stopping after "max_prefix_matches" entries (default 1000). +// `w` is first passed through HtStripPunctuation(); it must end in +// "prefix_match_character" when that setting is non-empty, and words shorter +// than "minimum_prefix_length" plus that suffix are ignored. Nothing is added +// when the database cannot be opened or returns no list. +// +void +Prefix::getWords(char *w, List &words) +{ + if (w == NULL || w[0] == '\0') + return; + + String stripped = w; + HtStripPunctuation(stripped); + w = stripped.get(); + + const String prefix_suffix = config["prefix_match_character"]; + int prefix_suffix_length = prefix_suffix.length(); + int minimum_prefix_length = config.Value("minimum_prefix_length"); + + if (debug) + cerr << " word=" << w << " prefix_suffix=" << prefix_suffix + << " prefix_suffix_length=" << prefix_suffix_length + << " minimum_prefix_length=" << minimum_prefix_length << "\n"; + + if ((int)strlen(w) < minimum_prefix_length + prefix_suffix_length) + return; + + // A null prefix character means that prefix matching should be + // applied to every search word; otherwise return if the word does + // not end in the prefix character(s). 
+ // + if (prefix_suffix_length > 0 + && strcmp(prefix_suffix, w+strlen(w)-prefix_suffix_length)) + return; + + HtWordList wordDB(config); + if (wordDB.Open(config["word_db"], O_RDONLY) == NOTOK) + return; + + int wordCount = 0; + int maximumWords = config.Value("max_prefix_matches", 1000); + String s; + int len = strlen(w) - prefix_suffix_length; + + // Strip the prefix character(s) + char w2[8192]; + strncpy(w2, w, sizeof(w2) - 1); + w2[sizeof(w2) - 1] = '\0'; + w2[strlen(w2) - prefix_suffix_length] = '\0'; + String w3(w2); + w3.lowercase(); + List *wordList = wordDB.Prefix(w3.get()); + if (!wordList) + { + // The list used to be dereferenced before the null test further + // down; without a list there is nothing to report. + wordDB.Close(); + return; + } + WordReference *word_ref; + String last_word; + + wordList->Start_Get(); + while (wordCount < maximumWords && (word_ref = (WordReference *) wordList->Get_Next() )) + { + s = word_ref->Key().GetWord(); + + // If we're somehow past the original word, we're done + if (mystrncasecmp(s.get(), w, len)) + break; + + // If this is a duplicate word, ignore it + if (last_word.length() != 0 && last_word == s) + continue; + + last_word = s; + words.Add(new String(s)); + wordCount++; + } + wordList->Destroy(); + delete wordList; + wordDB.Close(); +} + + +//***************************************************************************** +int +Prefix::openIndex() +{ + return 0; +} + + +//***************************************************************************** +void +Prefix::generateKey(char *, String &) +{ +} + + +//***************************************************************************** +void +Prefix::addWord(char *) +{ +} + + + + diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Prefix.h b/debian/htdig/htdig-3.2.0b6/htfuzzy/Prefix.h new file mode 100644 index 00000000..f477a5a5 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Prefix.h @@ -0,0 +1,48 @@ +// +// Prefix.h +// +// Prefix: The prefix fuzzy algorithm. Performs an O(log n) search for words +// matching the *prefix* specified--thus significantly faster than a full +// substring search. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Prefix.h,v 1.7 2004/05/28 13:15:20 lha Exp $ +// + +#ifndef _Prefix_h_ +#define _Prefix_h_ + +#include "Fuzzy.h" +#include "htfuzzy.h" + +class Dictionary; +class String; +class List; + + +// +// Fuzzy algorithm that expands a search word into the words of the word +// database starting with it. +// +class Prefix : public Fuzzy +{ +public: + // + // Construction/Destruction + // + Prefix(const HtConfiguration& config_arg); + virtual ~Prefix(); + + // + // Look the prefix up directly in the word database + // + virtual void getWords(char *word, List &words); + virtual int openIndex(); + + // + // Implemented as empty stubs in Prefix.cc + // + virtual void generateKey(char *, String &); + virtual void addWord(char *); + +private: +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Regexp.cc b/debian/htdig/htdig-3.2.0b6/htfuzzy/Regexp.cc new file mode 100644 index 00000000..d3ef4b71 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Regexp.cc @@ -0,0 +1,116 @@ +// +// Regexp.cc +// +// Regexp: A fuzzy to match input regex against the word database. 
+// Based on the substring fuzzy +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Regexp.cc,v 1.5 2004/05/28 13:15:20 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <fcntl.h> + +#include "Regexp.h" +#include "htString.h" +#include "List.h" +#include "StringMatch.h" +#include "HtConfiguration.h" + +//***************************************************************************** +// Regexp::Regexp(const HtConfiguration& config_arg) +// +Regexp::Regexp(const HtConfiguration& config_arg) : + Fuzzy(config_arg) +{ + name = "regex"; +} + + +//***************************************************************************** +// Regexp::~Regexp() +// +Regexp::~Regexp() +{ +} + + +//***************************************************************************** +// A very simplistic and inefficient regex search. For every word +// that is looked for we do a complete linear search through the word +// database. +// Maybe a better method of doing this would be to mmap a list of words +// to memory and then run the regex on it. It would still be a +// linear search, but with much less overhead. +// +void +Regexp::getWords(char *pattern, List &words) +{ + HtRegex regexMatch; + String stripped (pattern); + + // First we have to strip the necessary punctuation +// Why?? 
lha +// stripped.remove("^.[]$()|*+?{},-\\"); + + // Anchor the string to be matched + regexMatch.set(String("^") + stripped); + + HtWordList wordDB(config); + List *wordList; + String *key; + wordDB.Open(config["word_db"], O_RDONLY); + wordList = wordDB.Words(); + + int wordCount = 0; + int maximumWords = config.Value("regex_max_words", 25); + + wordList->Start_Get(); + while (wordCount < maximumWords && (key = (String *) wordList->Get_Next())) + { + if (regexMatch.match(*key, 0, 0) != 0) + { + words.Add(new String(*key)); + wordCount++; + } + } + if (wordList) { + wordList->Destroy(); + delete wordList; + } + wordDB.Close(); +} + + +//***************************************************************************** +int +Regexp::openIndex() +{ + return 0; +} + + +//***************************************************************************** +void +Regexp::generateKey(char *, String &) +{ +} + + +//***************************************************************************** +void +Regexp::addWord(char *) +{ +} + + + + diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Regexp.h b/debian/htdig/htdig-3.2.0b6/htfuzzy/Regexp.h new file mode 100644 index 00000000..c6c977a8 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Regexp.h @@ -0,0 +1,47 @@ +// +// Regexp.h +// +// Regexp: A fuzzy to match input regex against the word database. 
+// Based on the substring fuzzy +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Regexp.h,v 1.4 2004/05/28 13:15:20 lha Exp $ +// + +#ifndef _Regexp_h_ +#define _Regexp_h_ + +#include "Fuzzy.h" +#include "HtRegex.h" + +class Dictionary; +class String; +class List; + + +class Regexp : public Fuzzy +{ +public: + // + // Construction/Destruction + // + Regexp(const HtConfiguration& config_arg); + virtual ~Regexp(); + + virtual void getWords(char *word, List &words); + virtual int openIndex(); + + virtual void generateKey(char *, String &); + virtual void addWord(char *); + +private: +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Soundex.cc b/debian/htdig/htdig-3.2.0b6/htfuzzy/Soundex.cc new file mode 100644 index 00000000..ed903e9d --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Soundex.cc @@ -0,0 +1,167 @@ +// +// Soundex.cc +// +// Soundex: A fuzzy matching algorithm on the principal of the +// Soundex method for last names used by the U.S. INS +// and described by Knuth and others. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Soundex.cc,v 1.11 2004/05/28 13:15:20 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <fcntl.h> + +#include "Soundex.h" +#include "Dictionary.h" + +#include <ctype.h> + +//***************************************************************************** +// Soundex::Soundex(const HtConfiguration& config_arg) +// +Soundex::Soundex(const HtConfiguration& config_arg) : + Fuzzy(config_arg) +{ + name = "soundex"; +} + + +//***************************************************************************** +// Soundex::~Soundex() +// +Soundex::~Soundex() +{ +} + + +//***************************************************************************** +// void Soundex::generateKey(char *word, String &key) +// +void +Soundex::generateKey(char *word, String &key) +{ + int code = 0; + int lastcode = 0; + + key = 0; + if (!word) + { + key = '0'; + return; + } + + while (*word && !isalpha(*word)) + word++; + + if (*word) + { + key << *word++; + } + else + { + key = '0'; + return; + } + + + while (key.length() < 6) + { + switch (*word) + { + case 'b': + case 'p': + case 'f': + case 'v': + code = 1; + break; + + case 'c': + case 's': + case 'k': + case 'g': + case 'j': + case 'q': + case 'x': + case 'z': + code = 2; + break; + + case 'd': + case 't': + code = 3; + break; + + case 'l': + code = 4; + break; + + case 'm': + case 'n': + code = 5; + break; + + case 'r': + code = 6; + break; + + case 'a': + case 'e': + case 'i': + case 'o': + case 'u': + case 'y': + case 'w': + case 'h': + code = 0; + break; + + default: + break; + } + if (code && code != lastcode) + { + key << code; + lastcode = code; + } + if (*word) + word++; + else + break; + } 
+} + + +//***************************************************************************** +// void Soundex::addWord(char *word) +// +void +Soundex::addWord(char *word) +{ + if (!dict) + { + dict = new Dictionary; + } + + String key; + generateKey(word, key); + + String *s = (String *) dict->Find(key); + if (s) + { + // if (mystrcasestr(s->get(), word) != 0) + (*s) << ' ' << word; + } + else + { + dict->Add(key, new String(word)); + } +} diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Soundex.h b/debian/htdig/htdig-3.2.0b6/htfuzzy/Soundex.h new file mode 100644 index 00000000..331e27d9 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Soundex.h @@ -0,0 +1,40 @@ +// +// Soundex.h +// +// Soundex: A fuzzy matching algorithm on the principal of the +// Soundex method for last names used by the U.S. INS +// and described by Knuth and others. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Soundex.h,v 1.7 2004/05/28 13:15:20 lha Exp $ +// + +#ifndef _Soundex_h_ +#define _Soundex_h_ + +#include "Fuzzy.h" + +class Soundex : public Fuzzy +{ +public: + // + // Construction/Destruction + // + Soundex(const HtConfiguration& config_arg); + virtual ~Soundex(); + + virtual void generateKey(char *word, String &key); + + virtual void addWord(char *word); + +private: +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Speling.cc b/debian/htdig/htdig-3.2.0b6/htfuzzy/Speling.cc new file mode 100644 index 00000000..e9f365e3 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Speling.cc @@ -0,0 +1,138 @@ +// +// Speling.h +// +// Speling: (sic) Performs elementary (one-off) spelling correction for ht://Dig +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group 
+// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Speling.cc,v 1.12 2004/05/28 13:15:20 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <fcntl.h> + +#include "Speling.h" +#include "htString.h" +#include "List.h" +#include "StringMatch.h" +#include "HtConfiguration.h" + +#ifdef HAVE_STD +#include <fstream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <fstream.h> +#endif /* HAVE_STD */ + +#include <stdio.h> + +//***************************************************************************** +// Speling::Speling(const HtConfiguration& config_arg) +// +Speling::Speling(const HtConfiguration& config_arg) : + Fuzzy(config_arg) +{ + name = "speling"; +} + + +//***************************************************************************** +// Speling::~Speling() +// +Speling::~Speling() +{ +} + + +//***************************************************************************** +// A fairly efficient one-off spelling checker +// This generates the small list of possibilities and +// checks to see if they exist... 
+// +void +Speling::getWords(char *w, List &words) +{ + if ((int)strlen(w) < config.Value("minimum_speling_length",5)) + return; + + HtWordList wordDB(config); + // last arg=1 -> open to compare only "word" part of of word keys + if (wordDB.Open(config["word_db"], O_RDONLY, 1) == NOTOK) + return; + + String initial = w; + String stripped = initial; + HtStripPunctuation(stripped); + String tail; + int max_length = stripped.length() - 1; + + for (int pos = 0; pos < max_length; pos++) + { + // First transposes + // (these are really common) + initial = stripped; + char temp = initial[pos]; + initial[pos] = initial[pos+1]; + initial[pos+1] = temp; + if (!wordDB.Exists(initial)) // Seems weird, but this is correct + words.Add(new String(initial)); + + // Now let's do deletions + initial = stripped; + tail = initial.sub(pos+1); + if (pos > 0) + { + initial = initial.sub(0, pos); + initial += tail; + } + else + initial = tail; + + if (!wordDB.Exists(initial)) // Seems weird, but this is correct + words.Add(new String(initial)); + } + + // One last deletion -- check the last character! 
+ initial = stripped; + initial = initial.sub(0, initial.length() - 1); + + if (!wordDB.Exists(initial)) // Seems weird, but this is correct + words.Add(new String(initial)); + + wordDB.Close(); +} + + +//***************************************************************************** +int +Speling::openIndex() +{ + return 0; +} + + +//***************************************************************************** +void +Speling::generateKey(char *, String &) +{ +} + + +//***************************************************************************** +void +Speling::addWord(char *) +{ +} + + + + diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Speling.h b/debian/htdig/htdig-3.2.0b6/htfuzzy/Speling.h new file mode 100644 index 00000000..59b3466d --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Speling.h @@ -0,0 +1,45 @@ +// +// Speling.h +// +// Speling: (sic) Performs elementary (one-off) spelling correction for ht://Dig +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Speling.h,v 1.7 2004/05/28 13:15:20 lha Exp $ +// + +#ifndef _Speling_h_ +#define _Speling_h_ + +#include "Fuzzy.h" + +class Dictionary; +class String; +class List; + + +class Speling : public Fuzzy +{ +public: + // + // Construction/Destruction + // + Speling(const HtConfiguration& config_arg); + virtual ~Speling(); + + virtual void getWords(char *word, List &words); + virtual int openIndex(); + + virtual void generateKey(char *, String &); + virtual void addWord(char *); + +private: +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Substring.cc b/debian/htdig/htdig-3.2.0b6/htfuzzy/Substring.cc new file mode 100644 index 00000000..2b5a7c36 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Substring.cc @@ -0,0 +1,116 @@ +// +// 
Substring.cc +// +// Substring: The substring fuzzy algorithm. Currently a rather slow, naive approach +// that checks the substring against every word in the word db. +// It does not generate a separate database. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Substring.cc,v 1.15 2004/05/28 13:15:20 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <fcntl.h> + +#include "Substring.h" +#include "htString.h" +#include "List.h" +#include "StringMatch.h" +#include "HtConfiguration.h" + +//***************************************************************************** +// Substring::Substring(const HtConfiguration& config_arg) +// +Substring::Substring(const HtConfiguration& config_arg) : + Fuzzy(config_arg) +{ + name = "substring"; +} + + +//***************************************************************************** +// Substring::~Substring() +// +Substring::~Substring() +{ +} + + +//***************************************************************************** +// A very simplistic and inefficient substring search. For every word +// that is looked for we do a complete linear search through the word +// database. +// Maybe a better method of doing this would be to mmap a list of words +// to memory and then run the StringMatch on it. It would still be a +// linear search, but with much less overhead. 
+// +void +Substring::getWords(char *w, List &words) +{ + // First strip the punctuation + String stripped = w; + HtStripPunctuation(stripped); + + // Now set up the StringMatch object + StringMatch match; + match.Pattern(stripped); + + // And get the list of all possible words + HtWordList wordDB(config); + List *wordList; + String *key; + wordDB.Open(config["word_db"], O_RDONLY); + wordList = wordDB.Words(); + + int wordCount = 0; + int maximumWords = config.Value("substring_max_words", 25); + + wordList->Start_Get(); + while (wordCount < maximumWords && (key = (String *) wordList->Get_Next())) + { + if (match.FindFirst((char*)*key) >= 0) + { + words.Add(new String(*key)); + wordCount++; + } + } + if (wordList) { + wordList->Destroy(); + delete wordList; + } + wordDB.Close(); +} + + +//***************************************************************************** +int +Substring::openIndex() +{ + return 0; +} + + +//***************************************************************************** +void +Substring::generateKey(char *, String &) +{ +} + + +//***************************************************************************** +void +Substring::addWord(char *) +{ +} + + + + diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Substring.h b/debian/htdig/htdig-3.2.0b6/htfuzzy/Substring.h new file mode 100644 index 00000000..2960a153 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Substring.h @@ -0,0 +1,47 @@ +// +// Substring.h +// +// Substring: The substring fuzzy algorithm. Currently a rather slow, naive approach +// that checks the substring against every word in the word db. +// It does not generate a separate database. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Substring.h,v 1.7 2004/05/28 13:15:20 lha Exp $ +// + +#ifndef _Substring_h_ +#define _Substring_h_ + +#include "Fuzzy.h" + +class Dictionary; +class String; +class List; + + +class Substring : public Fuzzy +{ +public: + // + // Construction/Destruction + // + Substring(const HtConfiguration& config_arg); + virtual ~Substring(); + + virtual void getWords(char *word, List &words); + virtual int openIndex(); + + virtual void generateKey(char *, String &); + virtual void addWord(char *); + +private: +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/SuffixEntry.cc b/debian/htdig/htdig-3.2.0b6/htfuzzy/SuffixEntry.cc new file mode 100644 index 00000000..9db1903c --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/SuffixEntry.cc @@ -0,0 +1,77 @@ +// +// SuffixEntry.cc +// +// SuffixEntry: Decode the suffix rules used in the ispell dictionary files +// for the endings fuzzy DB. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: SuffixEntry.cc,v 1.5 2004/05/28 13:15:20 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <fcntl.h> + +#include "SuffixEntry.h" +#include "Endings.h" + + +//***************************************************************************** +// SuffixEntry::SuffixEntry() +// +SuffixEntry::SuffixEntry(char *str) +{ + parse(str); +} + + +//***************************************************************************** +// SuffixEntry::~SuffixEntry() +// +SuffixEntry::~SuffixEntry() +{ +} + + +//***************************************************************************** +// void SuffixEntry::parse(char *str) +// Parse a string in the format <expr> '>' <rule> into ourselves. +// +void +SuffixEntry::parse(char *str) +{ + String temp = 0; + + while (*str == ' ' || *str == '\t') + str++; + + temp = "^.*"; + while (*str != '>') + { + if (*str != ' ' && *str != '\t') + temp << *str; + str++; + } + temp << "$"; + while (*str == ' ' || *str == '\t' || *str == '>') + str++; + + Endings::mungeWord(temp, expression); + + temp = 0; + while (*str != ' ' && *str != '\t' && *str != '\n' && *str != '\r' && *str) + { + temp << *str; + str++; + } + Endings::mungeWord(temp, rule); +} + + diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/SuffixEntry.h b/debian/htdig/htdig-3.2.0b6/htfuzzy/SuffixEntry.h new file mode 100644 index 00000000..812b7841 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/SuffixEntry.h @@ -0,0 +1,43 @@ +// +// SuffixEntry.h +// +// SuffixEntry: Decode the suffix rules used in the ispell dictionary files +// for the endings fuzzy DB. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: SuffixEntry.h,v 1.6 2004/05/28 13:15:20 lha Exp $ +// + + +#ifndef _SuffixEntry_h_ +#define _SuffixEntry_h_ + +#include "Object.h" +#include "htString.h" + + +class SuffixEntry : public Object +{ +public: + // + // Construction/Destruction + // + SuffixEntry(char *); + ~SuffixEntry(); + + String expression; + String rule; + + void parse(char *str); + +private: +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Synonym.cc b/debian/htdig/htdig-3.2.0b6/htfuzzy/Synonym.cc new file mode 100644 index 00000000..234312a4 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Synonym.cc @@ -0,0 +1,225 @@ +// +// Synonym.cc +// +// Synonym: A fuzzy matching algorithm to create a database of related words +// (or misspellings) that should be searched together. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Synonym.cc,v 1.16 2004/05/28 13:15:20 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <fcntl.h> + +#include "Synonym.h" +#include "htfuzzy.h" +#include "List.h" +#include "StringList.h" +#include "HtConfiguration.h" + +#include "filecopy.h" + +#include <stdio.h> +#include <stdlib.h> +#include <sys/stat.h> + +#ifdef HAVE_STD +#include <fstream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <fstream.h> +#endif /* HAVE_STD */ + +//***************************************************************************** +Synonym::Synonym(const HtConfiguration& config_arg) : + Fuzzy(config_arg) +{ + name = "synonyms"; + db = 0; +} + + +//***************************************************************************** +Synonym::~Synonym() +{ + if (db) + { + db->Close(); + delete db; + db = 0; + } +} + + +//***************************************************************************** +int +Synonym::createDB(const HtConfiguration &config) +{ + String tmpdir = getenv("TMPDIR"); + String dbFile; + +#if defined(LIBHTDIG) || defined(LIBHTDIGPHP) || defined(_MSC_VER) //WIN32 + int ret = -1; + char * source = NULL; + char * dest = NULL; +#endif + + if (tmpdir.length()) + dbFile = tmpdir; + else + dbFile = "/tmp"; + + dbFile << "/synonyms.db"; + + char input[1000]; + FILE *fl; + + const String sourceFile = config["synonym_dictionary"]; + + fl = fopen(sourceFile, "r"); + if (fl == NULL) + { + cout << "htfuzzy/synonyms: unable to open " << sourceFile << endl; + cout << "htfuzzy/synonyms: Use the 'synonym_dictionary' attribute\n"; + cout << "htfuzzy/synonyms: to specify the file that contains the synonyms\n"; + return 
NOTOK; + } + + Database *db = Database::getDatabaseInstance(DB_HASH); + + if (db->OpenReadWrite(dbFile.get(), 0664) == NOTOK) + { + delete db; + db = 0; + return NOTOK; + } + + String data; + String word; + int count = 0; + while (fgets(input, sizeof(input), fl)) + { + StringList sl(input, " \t\r\n"); + if (sl.Count() < 2) + { // Avoid segfault caused by calling Database::Put() + if (debug) // with negative length for data field + { + cout<<"htfuzzy/synonyms: Rejected line with less than 2 words: " + << input << endl; + cout.flush(); + } + continue; + } + for (int i = 0; i < sl.Count(); i++) + { + data = 0; + for (int j = 0; j < sl.Count(); j++) + { + if (i != j) + data << sl[j] << ' '; + } + word = sl[i]; + word.lowercase(); + data.lowercase(); + db->Put(word, String(data.get(), data.length() - 1)); + if (debug && (count % 10) == 0) + { + cout << "htfuzzy/synonyms: " << count << ' ' << word << "\n"; + cout.flush(); + } + count++; + } + } + fclose(fl); + db->Close(); + delete db; + +#if defined(LIBHTDIG) || defined(LIBHTDIGPHP) || defined(_MSC_VER) //WIN32 + + //Uses file_copy function - works on Unix/Linux & WinNT + source = dbFile.get(); + dest = (char *)config["synonym_db"].get(); + + //Attempt rename, if fail attempt copy & delete. 
+ ret = rename(source, dest); + if (ret < 0) + { + ret = file_copy(source, dest, FILECOPY_OVERWRITE_ON); + if (ret == TRUE) + unlink(source); + else + return NOTOK; + } + + if (debug) + { + cout << "htfuzzy/synonyms: " << count << ' ' << word << "\n"; + cout << "htfuzzy/synonyms: Done.\n"; + } + +#else //This code uses a system call - Phase this out + + struct stat stat_buf; + String mv("mv"); // assume it's in the PATH if predefined setting fails + if ((stat(MV, &stat_buf) != -1) && S_ISREG(stat_buf.st_mode)) + mv = MV; + system(form("%s %s %s", + mv.get(), dbFile.get(), config["synonym_db"].get())); + +#endif + + return OK; +} + + +//***************************************************************************** +int +Synonym::openIndex() +{ + const String dbFile = config["synonym_db"]; + + if (db) + { + db->Close(); + delete db; + db = 0; + } + db = Database::getDatabaseInstance(DB_HASH); + if (db->OpenRead(dbFile) == NOTOK) + { + delete db; + db = 0; + return NOTOK; + } + return OK; +} + + +//***************************************************************************** +void +Synonym::getWords(char *originalWord, List &words) +{ + String data; + String stripped = originalWord; + HtStripPunctuation(stripped); + + if (db && db->Get(stripped, data) == OK) + { + char *token = strtok(data.get(), " "); + while (token) + { + words.Add(new String(token)); + token = strtok(0, " "); + } + } +} diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Synonym.h b/debian/htdig/htdig-3.2.0b6/htfuzzy/Synonym.h new file mode 100644 index 00000000..c07681d0 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Synonym.h @@ -0,0 +1,51 @@ +// +// Synonym.h +// +// Synonym: A fuzzy matching algorithm to create a database of related words +// (or misspellings) that should be searched together. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Synonym.h,v 1.7 2004/05/28 13:15:20 lha Exp $ +// +// + +#ifndef _Synonym_h_ +#define _Synonym_h_ + +#include "Fuzzy.h" + +class List; + +class Synonym : public Fuzzy +{ +public: + // + // Construction/Destruction + // + Synonym(const HtConfiguration& config_arg); + ~Synonym(); + + // + // Lookup routines + // + virtual void getWords(char *word, List &words); + virtual int openIndex(); + + // + // Creation + // + virtual int createDB(const HtConfiguration &config); + +protected: + + Database *db; +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/htfuzzy.cc b/debian/htdig/htdig-3.2.0b6/htfuzzy/htfuzzy.cc new file mode 100644 index 00000000..5a3789db --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/htfuzzy.cc @@ -0,0 +1,265 @@ +// +// htfuzzy.cc +// +// htfuzzy: Create one or more ``fuzzy'' indexes into the main word database. +// These indexes can be used by htsearch to perform a search that uses +// other algorithms than exact word match. +// +// This program is meant to be run after htmerge has created the word +// database. +// +// For each fuzzy algorithm, there will be a separate database. Each +// database is simply a mapping from the fuzzy key to a list of words +// in the main word database. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: htfuzzy.cc,v 1.20 2004/05/28 13:15:20 lha Exp $ +// +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "htfuzzy.h" +#include "Fuzzy.h" +#include "Accents.h" +#include "Soundex.h" +#include "Endings.h" +#include "Metaphone.h" +#include "Synonym.h" +#include "htString.h" +#include "List.h" +#include "Dictionary.h" +#include "defaults.h" +#include "HtWordList.h" +#include "WordContext.h" + +// If we have this, we probably want it. +#ifdef HAVE_GETOPT_H +#include <getopt.h> +#elif HAVE_GETOPT_LOCAL +#include <getopt_local.h> +#endif + +int debug = 0; + +void usage(); + + +//***************************************************************************** +// int main(int ac, char **av) +// +int +main(int ac, char **av) +{ + int c, i; + extern char *optarg; + extern int optind; + String configFile = DEFAULT_CONFIG_FILE; + + // + // Parse command line arguments + // + while ((c = getopt(ac, av, "c:v")) != -1) + { + switch (c) + { + case 'c': + configFile = optarg; + break; + + case 'v': + debug++; + break; + + default: + usage(); + } + } + + HtConfiguration* config= HtConfiguration::config(); + // + // Determine what algorithms to use + // + List wordAlgorithms; + List noWordAlgorithms; + for (i = optind; i < ac; i++) + { + if (mystrcasecmp(av[i], "soundex") == 0) + { + wordAlgorithms.Add(new Soundex(*config)); + } + else if (mystrcasecmp(av[i], "metaphone") == 0) + { + wordAlgorithms.Add(new Metaphone(*config)); + } + else if (mystrcasecmp(av[i], "accents") == 0) + { + wordAlgorithms.Add(new Accents(*config)); + } + else if (mystrcasecmp(av[i], "endings") == 0) + { + noWordAlgorithms.Add(new Endings(*config)); + } + else if 
(mystrcasecmp(av[i], "synonyms") == 0) + { + noWordAlgorithms.Add(new Synonym(*config)); + } + else + { + reportError(form("'%s' is not a supported algorithm", + av[i])); + } + } + if (wordAlgorithms.Count() == 0 && noWordAlgorithms.Count() == 0) + { + cout << "htfuzzy: No algorithms specified\n"; + usage(); + } + + // + // Find and parse the configuration file. + // + config->Defaults(&defaults[0]); + if (access((char*)configFile, R_OK) < 0) + { + reportError(form("Unable to find configuration file '%s'", + configFile.get())); + } + config->Read(configFile); + + // Initialize htword library (key description + wordtype...) + WordContext::Initialize(*config); + + Fuzzy *fuzzy; + if (wordAlgorithms.Count() > 0) + { + // + // Open the word database so that we can grab the words from it. + // + HtWordList worddb(*config); + if (worddb.Open(config->Find("word_db"), O_RDONLY) == OK) + { + // + // Go through all the words in the database + // + List *words = worddb.Words(); + String *key; + Fuzzy *fuzzy = 0; + String word, fuzzyKey; + int count = 0; + + words->Start_Get(); + while ((key = (String *) words->Get_Next())) + { + word = *key; + wordAlgorithms.Start_Get(); + while ((fuzzy = (Fuzzy *) wordAlgorithms.Get_Next())) + { + fuzzy->addWord(word); + } + count++; + if ((count % 100) == 0 && debug) + { + cout << "htfuzzy: words: " << count << '\n'; + cout.flush(); + } + } + if (debug) + { + cout << "htfuzzy: total words: " << count << "\n"; + cout << "htfuzzy: Writing index files...\n"; + } + + // + // All the information is now in memory. 
+ // Write all of it out to the individual databases + // + wordAlgorithms.Start_Get(); + while ((fuzzy = (Fuzzy *) wordAlgorithms.Get_Next())) + { + fuzzy->writeDB(); + } + worddb.Close(); + words->Destroy(); + delete words; + if (fuzzy) + delete fuzzy; + } + else + { + reportError(form("Unable to open word database %s", config->Find("word_db").get())); + } + } + if (noWordAlgorithms.Count() > 0) + { + noWordAlgorithms.Start_Get(); + while ((fuzzy = (Fuzzy *) noWordAlgorithms.Get_Next())) + { + if (debug) + { + cout << "htfuzzy: Selected algorithm: " << fuzzy->getName() + << endl; + } + if (fuzzy->createDB(*config) == NOTOK) + { + cout << "htfuzzy: Could not create database for algorithm: " + << fuzzy->getName() << endl; + } + } + } + + if (debug) + { + cout << "htfuzzy: Done.\n"; + } + + return 0; +} + + +//***************************************************************************** +// void usage() +// +void +usage() +{ + cout << "usage: htfuzzy [-c configfile][-v] algorithm ...\n"; + cout << "This program is part of ht://Dig " << VERSION << "\n\n"; + cout << "Supported algorithms:\n"; + cout << "\tsoundex\n"; + cout << "\tmetaphone\n"; + cout << "\taccents\n"; + cout << "\tendings\n"; + cout << "\tsynonyms\n"; + cout << "\n"; + + cout << "Options:\n"; + + cout << "\t-c configfile\n"; + cout << "\t\tUse the specified configuration file instead of the\n"; + cout << "\t\tdefault.\n\n"; + + cout << "\t-v\tVerbose mode. This increases the verbosity of the\n"; + cout << "\t\tprogram. 
Using more than 2 is probably only useful\n"; + cout << "\t\tfor debugging purposes.\n\n"; + + exit(0); +} + + +//***************************************************************************** +// void reportError(char *msg) +// +void +reportError(char *msg) +{ + cout << "htfuzzy: " << msg << "\n\n"; + exit(1); +} + + diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/htfuzzy.h b/debian/htdig/htdig-3.2.0b6/htfuzzy/htfuzzy.h new file mode 100644 index 00000000..b512a672 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/htfuzzy.h @@ -0,0 +1,54 @@ +// +// htfuzzy.h +// +// htfuzzy: Create one or more ``fuzzy'' indexes into the main word database. +// These indexes can be used by htsearch to perform a search that uses +// other algorithms than exact word match. +// +// This program is meant to be run after htmerge has created the word +// database. +// +// For each fuzzy algorithm, there will be a separate database. Each +// database is simply a mapping from the fuzzy key to a list of words +// in the main word database. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: htfuzzy.h,v 1.12 2004/05/28 13:15:20 lha Exp $ +// + +#ifndef _htfuzzy_h_ +#define _htfuzzy_h_ + +#include "htconfig.h" +#include "HtConfiguration.h" +#include "HtWordList.h" + +#include <stdlib.h> + +#ifndef _MSC_VER /* _WIN32 */ +#include <unistd.h> +#endif + +#ifdef HAVE_STD +#include <fstream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <fstream.h> +#endif /* HAVE_STD */ + +#include <stdio.h> + +extern int debug; + +extern void reportError(char *msg); + +#endif + + |