diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htlib/HtWordCodec.cc')
-rw-r--r-- | debian/htdig/htdig-3.2.0b6/htlib/HtWordCodec.cc | 437 |
1 files changed, 437 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htlib/HtWordCodec.cc b/debian/htdig/htdig-3.2.0b6/htlib/HtWordCodec.cc new file mode 100644 index 00000000..5a4474d4 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htlib/HtWordCodec.cc @@ -0,0 +1,437 @@ +// +// HtWordCodec.cc +// +// HtWordCodec: Given two lists of pair of "words" 'from' and 'to'; +// simple one-to-one translations, use those lists to translate. +// Only restriction are that no null (0) characters must be +// used in "words", and that there is a character "joiner" that +// does not appear in any word. One-to-one consistency may be +// checked at construction. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: HtWordCodec.cc,v 1.9 2004/05/28 13:15:21 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "HtWordCodec.h" + +// Do not use 0, so we can use "normal" string routines. +// Values 1..4 are used to describe how many bytes are used to +// keep the number. Do not use other than control-characters, +// as the first character for internal encodings, so the user +// can use "international" characters (128 .. 255) for cute +// encodings to use across different configuration files and +// databases. +#define JOIN_CHAR 5 +#define QUOTE_CHAR 6 +#define FIRST_INTERNAL_SINGLECHAR 7 +#define LAST_INTERNAL_SINGLECHAR 31 + + +HtWordCodec::HtWordCodec() +{ + myFrom = 0; + myTo = 0; + myFromMatch = 0; + myToMatch = 0; +} + + +HtWordCodec::~HtWordCodec() +{ + if (myFrom) + delete myFrom; + + if (myTo) + delete myTo; + + if (myFromMatch) + delete myFromMatch; + + if (myToMatch) + delete myToMatch; +} + + +// Straightforward filling of the encoding-lists. +HtWordCodec::HtWordCodec(StringList *from, StringList *to, char joiner) +{ + myFromMatch = new StringMatch; + myToMatch = new StringMatch; + + myTo = to; + myFrom = from; + + String to_pattern(myTo->Join(joiner)); + + // After being initialized with Join, the strings are not + // null-terminated, but that is done through "operator char*". + myToMatch->Pattern(to_pattern, joiner); + + String from_pattern(myFrom->Join(joiner)); + myFromMatch->Pattern(from_pattern, joiner); + +} + + +// This constructor is the most complicated function in this class. +// It handles consistency checking for the supplied code-lists. + +// Cleanups for anything except myTo, myFrom, myToMatch is +// necessary. The member myFromMatch is used as a sanity check +// for member functions to see that the constructor was +// successful in case the programmer forgets to check errmsg. +HtWordCodec::HtWordCodec(StringList &requested_encodings, + StringList &frequent_substrings, + String &errmsg) +{ + if ((requested_encodings.Count() % 2) != 0) + { + errmsg = + "Expected pairs, got odd number of strings"; + + return; + } + + myFrom = new StringList; + myTo = new StringList; + + // Go through requested_encodings and fill myTo and myFrom. + // Check that the "to" strings look remotely sane regarding + // reserved characters. + + // Iteration temporaries. + String *from; + String *to; + + int n_of_pairs = requested_encodings.Count() / 2; + + requested_encodings.Start_Get(); + while ((from = (String *) requested_encodings.Get_Next()) != NULL) + { + // Sanity check: Reserve empty strings as we cannot do + // anything sane with them. + + int templen = from->length(); + if (templen == 0) + { + errmsg = "Empty strings are not allowed"; + return; + } + + myFrom->Add(new String(*from)); + + // This must be non-null since we checked "oddness" above. + to = (String *) requested_encodings.Get_Next(); + + templen = to->length(); + if (templen == 0) + { + errmsg = "Empty strings are not allowed"; + return; + } + + // We just have to check that there's no JOIN_CHAR in the + // string. Since no "to" is allowed to be part of any other + // "to", there will be no ambiguity, even if one would + // contain a QUOTE_CHAR (which is documented as invalid anyway). + if (strchr(from->get(), JOIN_CHAR) != NULL) + { + errmsg = + form("(\"%s\" =>) \"%s\" contains a reserved character (number %d)", + from->get(), to->get(), int(JOIN_CHAR)); + return; + } + + // Loop over the other "to"-strings and check that this + // string is not a substring of any other "to", or vice versa. + // Return in error if it is so. + int i; + int count = myTo->Count(); + for (i = 0; i < count; i++) + { + String *ith = (String *) myTo->Nth(i); + + // Just check if the shorter string is part of the + // longer string. + if (to->length() < ith->length() + ? ith->indexOf(to->get()) != -1 + : to->indexOf(ith->get()) != -1) + { + errmsg = + form("\"%s\" => \"%s\" collides with (\"%s\" => \"%s\")", + from, to, (*myFrom)[i], ith->get()); + + return; + } + } + + // All ok, just add this one. + myTo->Add(new String(*to)); + } + + // Check that none of the "to"-strings is a substring of any + // of the "from" strings, since that's hard to support and + // most probably is a user mistake anyway. + + StringMatch req_tos; + String req_to_pattern(myTo->Join(JOIN_CHAR)); + int which, length; + + // The StringMatch functions want the strings + // zero-terminated, which is done through "operator char*". + req_tos.Pattern(req_to_pattern, JOIN_CHAR); + + // Check the requested encodings. + if (n_of_pairs != 0) + { + int i; + for (i = 0; i < n_of_pairs; i++) + { + from = (String *) myFrom->Nth(i); + if (req_tos.FindFirst(from->get(), which, length) != -1) + { + if (i != which) + { + errmsg = + form("(\"%s\" => \"%s\") overlaps (\"%s\" => \"%s\")", + (*myFrom)[which], (*myTo)[which], + from->get(), (*myTo)[i]); + } + else + { + errmsg = + form("Overlap in (\"%s\" => \"%s\")", + from->get(), (*myTo)[i]); + } + + return; + } + } + } + + if (frequent_substrings.Count() != 0) + { + // Make a temporary search-pattern of the requested + // from-strings. + + StringMatch req_froms; + String req_from_pattern(myFrom->Join(JOIN_CHAR)); + + req_froms.Pattern(req_from_pattern, JOIN_CHAR); + + // Continue filling "to" and "from" from frequent_substrings and + // internal encodings. If a frequent_substring is found in the + // requested from-strings, it is ignored, but the internal + // encoding is still ticked up, so that changes in + // requested_encodings (e.g. url_part_aliases) do not change + // an existing database (e.g. containing common_url_parts). + + int internal_encoding_no = 0; + + String *common_part; + frequent_substrings.Start_Get(); + String to; + + for (; + (common_part = (String *) frequent_substrings.Get_Next()) != NULL; + internal_encoding_no++) + { + int templen = common_part->length(); + if (templen == 0) + { + errmsg = "Empty strings are not allowed"; + return; + } + + // Is a "From" string in it, or is a "To" string in it? + // Note that checking if there are *any* requested + // encodings (n_of_pairs) is not just an "optimization"; + // it is necessary since StringMatch will return 0 (not + // -1) if the pattern is empty (FIXME: changing that + // breaks something else in another part of ht://Dig). + + if (n_of_pairs + && (req_froms.FindFirst(common_part->get()) != -1 + || req_tos.FindFirst(common_part->get()) != -1)) + continue; + + to = 0; // Clear previous run. + + // Dream up an encoding without zeroes. + // Use FIRST_INTERNAL_SINGLECHAR .. LAST_INTERNAL_SINGLECHAR + // for the first encodings, as much as possible. + + long int number_to_store = + internal_encoding_no + FIRST_INTERNAL_SINGLECHAR; + + if (number_to_store <= LAST_INTERNAL_SINGLECHAR) + { + to << char(number_to_store); + } + else + { + // Use <number-of-bytes-in-length> + // <number-as-nonzero-bytes> to code the rest. + // Note that we assume eight-bit chars here, which + // should be ok for all systems you run htdig on. + // At least it helps clarity here. + + number_to_store -= LAST_INTERNAL_SINGLECHAR; + + // Make sure highest bit in every byte is "1" by + // inserting one there. + char to_store[sizeof(number_to_store)+1]; + int j = 1; + + while (number_to_store > 0x7f) + { + number_to_store = ((number_to_store & ~0x7f) << 1) + | 0x80 | (number_to_store & 0x7f); + + to_store[j++] = char(number_to_store); + number_to_store >>= 8; + } + + // Finally, store the highest byte. It too shall have + // the highest bit set. This is the easiest way to + // adjust it not to be QUOTE_CHAR. + to_store[0] = j; + to_store[j] = char(number_to_store | 0x80); + + to.append(to_store, j+1); + } + + // Add to replacement pairs. + myFrom->Add(new String(*common_part)); + myTo->Add(new String(to)); + } + } + + // Now, add the quoted "to":s to the "to"-list, with the unquoted + // "to":s to the "from"-list. This way we do not have to + // check for quoting separately. Like this: + // From To + // foo : ! + // bar : > + // baz : $ + // ! : \! + // > : \> + // $ : \$ + // + // Since we checked that none of the "To":s are in a "From" we + // can do this. + + myTo->Start_Get(); + int to_count = myTo->Count(); + String *current; + String temp; + + int i; + for (i = 0; i < to_count; i++) + { + // It works to append *and* iterate through a + // StringList, despite not having an iterator class. + current = (String *) myTo->Nth(i); + + myFrom->Add(new String(*current)); + + temp = 0; // Reset any previous round. + temp.append(char(QUOTE_CHAR)); + temp.append(*current); + + myTo->Add(new String(temp)); + } + + myFromMatch = new StringMatch; + myToMatch = new StringMatch; + + String to_pattern(myTo->Join(JOIN_CHAR)); + String from_pattern(myFrom->Join(JOIN_CHAR)); + + // StringMatch class has unchecked limits, better check them. + // The length of each string in the pattern an the upper limit + // of the needs. + if (to_pattern.length() - (myTo->Count() - 1) > 0xffff + || from_pattern.length() - (myFrom->Count() - 1) > 0xffff) + { + errmsg = "Limit reached; use fewer encodings"; + return; + } + + myToMatch->Pattern(to_pattern, JOIN_CHAR); + myFromMatch->Pattern(from_pattern, JOIN_CHAR); + + errmsg = 0; +} + + +// We only need one "coding" function, since quoting and unquoting is +// handled through the to- and from-lists. +String +HtWordCodec::code(const String &orig_string, StringMatch &match, + StringList &replacements) const +{ + String retval; + String tempinput; + int offset, which, length; + const char *orig; + + // Get a null-terminated string, usable for FindFirst to look at. + orig = orig_string.get(); + + // Sanity check. If bad use, just return empty strings. + if (myFromMatch == NULL) + { + return retval; + } + + // Need to check if "replacements" is empty; that is, if no + // transformations should be done. FindFirst() does not return + // -1 in this case, it returns 0. + if (replacements.Count() == 0) + return orig_string; + + // Find the encodings and replace them. + while ((offset = match.FindFirst(orig, which, length)) != -1) + { + // Append the previous part that was not part of a code. + retval.append(orig, offset); + + // Replace with the original string. + retval.append(replacements[which]); + + orig += offset + length; + } + + // Add the final non-matched part. + retval.append(orig); + + return retval; +} + + +// The assymetry is caused by swapping both the matching and +// replacement lists. +String +HtWordCodec::decode(const String &orig) const +{ + return code(orig, *myToMatch, *myFrom); +} + + +String +HtWordCodec::encode(const String &orig) const +{ + return code(orig, *myFromMatch, *myTo); +} + +// End of HtWordCodec.cc |