// WordKey.h // // NAME // inverted index key. // // SYNOPSIS // // #include // // #define DOCID 1 // #define LOCATION 1 // // WordKey key("word 1 2"); // key.Set(DOCID, 100); // key.SetWord("other"); // // DESCRIPTION // // Describes the key used to store a entry in the inverted index. // The structure of a key is described by the WordKeyInfo // Each field in the key has a bit in the set // member that says if it is set or not. This bit allows to // say that a particular field is undefined regardless of // the actual value stored. The methods // IsDefined, SetDefined and Undefined are used to manipulate // the defined status of a field. The Pack and Unpack // methods are used to convert to and from the disk storage representation // of the key. // // Generic functions to manipulate the key should use the WordKeyInfo // information to work regardless of the actual structure of the key. // // Suffix definition: a word suffix is a kind of marker that says if // the word is a full word or only the beginning of a // word. If a word has a suffix then it's a full word. If it // has no suffix then it's only the beginning of a word. // This is mostly useful when specifying search keys. If a // search key word has no suffix, the search mechanism is // expected to return all words that begin with the word. If // the search key word has a suffix, only words that exactly // match the search key word will be returned. // // ASCII FORMAT // // The ASCII description is a string with fields separated by tabs or // white space. //
// Example: Foo  0 1 4 2
// Field 1: The word as a string or  if not defined
// Field 2:  if suffix defined,  if suffix undefined
// Field 3 to nfield + 1: numerical value of the field or  if
//                        not defined
//
// 
// // END // // Part of the ht://Dig package // Copyright (c) 1999-2004 The ht://Dig Group // For copyright details, see the file COPYING in your distribution // or the GNU Library General Public License (LGPL) version 2 or later // // // #ifndef _WordKey_h_ #define _WordKey_h_ #ifndef SWIG #include "db.h" #include "htString.h" #include "StringList.h" #endif /* SWIG */ // // WORDSUFFIX: // // field in set flag that says if a word is just a prefix (incomplete word) // WORD_KEY_WORDSUFFIX_DEFINED -> means that word IS complete (not a prefix) // #define WORD_KEY_WORDSUFFIX_DEFINED (1 << 30) #define WORD_KEY_WORD_DEFINED 1 #define WORD_KEY_WORDFULLY_DEFINED ( WORD_KEY_WORDSUFFIX_DEFINED | WORD_KEY_WORD_DEFINED ) // // Possible return values of Outbound/Overflow/Underflow methods // #define WORD_INBOUND 0 #define WORD_OVERFLOW 1 #define WORD_UNDERFLOW 2 // // Possible return values of SetToFollowing // #define WORD_FOLLOWING_ATEND 0x0001 // // Default value for position argument of SetToFollowing // meaning NFields() - 1 // #define WORD_FOLLOWING_MAX -1 // // Position of the first numerical field (just after the word) // #define WORD_FIRSTFIELD 1 // // Unknown field position // #define WORD_KEY_UNKNOWN_POSITION -1 #ifndef SWIG // C comparison function interface for Berkeley DB (bt_compare) // int word_db_cmp(const DBT *a, const DBT *b); int word_only_db_cmp(const DBT *a, const DBT *b); #endif /* SWIG */ #ifndef SWIG #include"WordKeyInfo.h" #endif /* SWIG */ // // Describe a word occurrence // // !!!!!!!DEBUGTMP #ifndef SWIG #define WORD_FATAL_ABORT fflush(stdout);fprintf(stderr,"FATAL ERROR at file:%s line:%d !!!\n",__FILE__,__LINE__);fflush(stderr);(*(int *)NULL)=1 #define word_errr(s) {fprintf(stderr,"FATAL ERROR:%s\n",s);WORD_FATAL_ABORT;} #endif /* SWIG */ class WordKey { public: // // Constructors, destructors, copy and clear // //- // Constructor. Build an empty key. // WordKey() { Initialize(); } #ifndef SWIG //- // Constructor. Initialize from an ASCII description of a key. // See ASCII FORMAT section. // WordKey(const String& word) { Initialize(); Set(word); } // // Copy constructor (needed because of the array pointer) // WordKey(const WordKey &other) { Initialize(); CopyFrom(other); } #endif /* SWIG */ ~WordKey() { delete [] numerical_fields; } #ifndef SWIG protected: // // Constructor helper, allocate members and set to empty key // void Initialize() { if(!Info()) { fprintf(stderr, "WordKey::WordKey used before word_key_info set\n"); word_errr("WordKey::initialize"); } numerical_fields = new WordKeyNum[NFields()-1]; Clear(); } public: // // Copy operator (needed because of the array pointer) // void operator =(const WordKey &other) { Clear(); CopyFrom(other); } #endif /* SWIG */ //- // Copy other into object. // void CopyFrom(const WordKey &other) { if(other.IsDefined(0)) { SetWord(other.GetWord()); } for(int i=1;iWordKeyInfo(3)). // static inline const WordKeyInfo *Info() { return WordKeyInfo::Instance(); } #endif /* SWIG */ //- // Convenience functions to access the total number of fields // in a key (see WordKeyInfo(3)). // static inline int NFields() { return Info()->nfields; } //- // Convenience functions to access the // maximum possible value for field at position. // in a key (see WordKeyInfo(3)). // static inline WordKeyNum MaxValue(int position) { return Info()->sort[position].MaxValue(); } // // Accessors // //- // Returns the word as a const. // #ifndef SWIG inline const String& GetWord() const { return kword; } #endif /* SWIG */ //- // Returns the word. // inline String& GetWord() { return kword; } //- // Set the word. // inline void SetWord(const String& arg) { kword = arg; setbits |= WORD_KEY_WORDFULLY_DEFINED; } protected: //- // Set the word. // inline void SetWord(const char* arg, int arg_length) { kword.set(arg, arg_length); setbits |= WORD_KEY_WORDFULLY_DEFINED; } public: //- // Change status of the word to undefined. Also undefines // its suffix. // inline void UndefinedWord() { kword.trunc(); setbits &= ~WORD_KEY_WORDFULLY_DEFINED; } //- // Set the status of the word suffix to undefined. // inline void UndefinedWordSuffix() {setbits &= ~WORD_KEY_WORDSUFFIX_DEFINED;} //- // Set the status of the word suffix to defined. // inline void SetDefinedWordSuffix() {setbits |= WORD_KEY_WORDSUFFIX_DEFINED;} //- // Returns true if word suffix is defined, false otherwise. // inline int IsDefinedWordSuffix() const {return( (setbits & WORD_KEY_WORDSUFFIX_DEFINED) == WORD_KEY_WORDSUFFIX_DEFINED);} // // Get/Set numerical fields // //- // Return value of numerical field at position as const. // inline WordKeyNum Get(int position) const { // if(position<1 || position>=NFields()){errr("Get: out of bounds");} return(numerical_fields[position-1]); } #ifndef SWIG //- // Return value of numerical field at position. // inline WordKeyNum& Get(int position) { return(numerical_fields[position-1]); } //- // Return value of numerical field at position as const. // inline const WordKeyNum & operator[] (int position) const { return(numerical_fields[position-1]); } //- // Return value of numerical field at position. // inline WordKeyNum & operator[] (int position) { return(numerical_fields[position-1]); } #endif /* SWIG */ //- // Set value of numerical field at position to val. // inline void Set(int position, WordKeyNum val) { // if(position<1 || position>=NFields()){errr("Set: out of bounds");} SetDefined(position); numerical_fields[position-1] = val; } // // Key field value existenz. Defined means the value of the field contains // a valid value. Undefined means the value of the field is not valid. // //- // Returns true if field at position is defined, false // otherwise. // int IsDefined(int position) const { return setbits & (1 << position); } //- // Value in field position becomes defined. // void SetDefined(int position) { setbits |= (1 << position); } //- // Value in field position becomes undefined. // void Undefined(int position) { setbits &= ~(1 << position); } #ifndef SWIG // // Set and Get the whole structure from/to ASCII description //- // Set the whole structure from ASCII string in bufferin. // See ASCII FORMAT section. // Return OK if successfull, NOTOK otherwise. // int Set(const String& bufferin); int SetList(StringList& fields); //- // Convert the whole structure to an ASCII string description // in bufferout. // See ASCII FORMAT section. // Return OK if successfull, NOTOK otherwise. // int Get(String& bufferout) const; //- // Convert the whole structure to an ASCII string description // and return it. // See ASCII FORMAT section. // String Get() const; #endif /* SWIG */ // // Storage format conversion // #ifndef SWIG //- // Set structure from disk storage format as found in // string buffer or length length. // Return OK if successfull, NOTOK otherwise. // int Unpack(const char* string, int length); // //- // Set structure from disk storage format as found in // data string. // Return OK if successfull, NOTOK otherwise. // inline int Unpack(const String& data) { return(Unpack(data,data.length())); } // //- // Convert object into disk storage format as found in // and place the result in data string. // Return OK if successfull, NOTOK otherwise. // int Pack(String& data) const; #endif /* SWIG */ // // Transformations // //- // Copy each defined field from other into the object, if // the corresponding field of the object is not defined. // Return OK if successfull, NOTOK otherwise. // int Merge(const WordKey& other); //- // Undefine all fields found after the first undefined field. The // resulting key has a set of defined fields followed by undefined fields. // Returns NOTOK if the word is not defined because the resulting key would // be empty and this is considered an error. Returns OK on success. // int PrefixOnly(); #ifndef SWIG //- // Implement ++ on a key. // // It behaves like arithmetic but follows these rules: //
  // . Increment starts at field 
  // . If a field value overflows, increment field position - 1
  // . Undefined fields are ignored and their value untouched
  // . Incrementing the word field is done by appending \001
  // . When a field is incremented all fields to the left are set to 0
  // 
// If position is not specified it is equivalent to NFields() - 1. // It returns OK if successfull, NOTOK if position out of range or // WORD_FOLLOWING_ATEND if the maximum possible value was reached. // int SetToFollowing(int position = WORD_FOLLOWING_MAX); #endif /* SWIG */ // // Predicates // //- // Return true if all the fields are defined, false otherwise. // int Filled() const { return setbits == (unsigned int) (((1 << NFields()) - 1) | WORD_KEY_WORDSUFFIX_DEFINED); } //- // Return true if no fields are defined, false otherwise. // int Empty() const { return setbits == 0; } //- // Return true if the object and other are equal. // Only fields defined in both keys are compared. // int Equal(const WordKey& other) const; //- // Return true if the object and other are equal. // All fields are compared. If a field is defined in object // and not defined in the object, the key are not considered // equal. // int ExactEqual(const WordKey& other) const {return(Equal(other) && other.setbits == setbits);} #ifndef SWIG //- // Return true if the object and other are equal. // The packed string are compared. An undefined numerical field // will be 0 and therefore undistinguishable from a defined field // whose value is 0. // int PackEqual(const WordKey& other) const; //- // Return true if adding increment in field at position makes // it overflow or underflow, false if it fits. // int Outbound(int position, int increment) { if(increment < 0) return Underflow(position, increment); else if(increment > 0) return Overflow(position, increment); else return WORD_INBOUND; } //- // Return true if adding positive increment to field at // position makes it overflow, false if it fits. // int Overflow(int position, int increment) { return MaxValue(position) - Get(position) < (WordKeyNum)increment ? WORD_OVERFLOW : WORD_INBOUND; } //- // Return true if subtracting positive increment to field // at position makes it underflow, false if it fits. // int Underflow(int position, int increment) { return Get(position) < (WordKeyNum)(-increment) ? WORD_UNDERFLOW : WORD_INBOUND; } #endif /* SWIG */ //- // Return OK if the key may be used as a prefix for search. // In other words return OK if the fields set in the key // are all contiguous, starting from the first field. // Otherwise returns NOTOK // int Prefix() const; #ifndef SWIG //- // Compare a and b in the Berkeley DB fashion. // a and b are packed keys. The semantics of the // returned int is as of strcmp and is driven by the key description // found in WordKeyInfo. // static int Compare(const String& a, const String& b); static int Compare_WordOnly(const String& a, const String& b); //- // Compare a and b in the Berkeley DB fashion. // a and b are packed keys. The semantics of the // returned int is as of strcmp and is driven by the key description // found in WordKeyInfo. // static int Compare(const char *a, int a_length, const char *b, int b_length); static int Compare_WordOnly(const char *a, int a_length, const char *b, int b_length); //- // Compare object defined fields with other key defined fields only, // ignore fields that are not defined in object or other. // Return 1 if different 0 if equal. // If different, position is set to the field number that differ, // lower is set to 1 if Get(position) is lower than // other.Get(position) otherwise lower is set to 0. // int Diff(const WordKey& other, int& position, int& lower); //- // Print object in ASCII form on f (uses Get method). // See ASCII FORMAT section. // int Write(FILE* f) const; #endif /* SWIG */ //- // Print object in ASCII form on stdout (uses Get method). // See ASCII FORMAT section. // void Print() const; #ifndef SWIG private: // // Convert a single number from and to disk storage representation // static int UnpackNumber(const unsigned char* from, const int from_size, WordKeyNum &res, const int lowbits, const int bits); static int PackNumber(WordKeyNum from, char* to, int to_size, int lowbits, int lastbits); // // Data members // // // Bit field for defined/undefined status of each key field // unsigned int setbits; // // Holds the numerical values of the key fields // WordKeyNum *numerical_fields; // // Holds the word key field // String kword; #endif /* SWIG */ }; #ifndef SWIG // // Set bit number to 0 and others to 1. may have a value from 0 to 8. If // 8 then all bits are 1. // #define WORD_BIT_MASK(b) ((b) == 0 ? 0xff : ((( 1 << (b)) - 1) & 0xff)) #define WORD_BIT_MASK2(b) ((1<<(b)) -1) // // Decode integer found in using bytes. The integer starts at bit // in the first byte and occupies a total of bits. The resulting integer is stored in * // inline int WordKey::UnpackNumber(const unsigned char* from, const int from_size, WordKeyNum& to, const int lowbits, const int bits) { to = 0; to = ((from[0] & 0xff) >> lowbits); if(lowbits) to &= WORD_BIT_MASK(8 - lowbits); if(from_size == 1) to &= WORD_BIT_MASK(bits); else { for(int i = 1; i < from_size; i++) { to |= (from[i] & 0xff) << ((i - 1) * 8 + (8 - lowbits)); } } if(bits < (int)(sizeof(WordKeyNum) * 8)) to &= ( 1 << bits ) - 1; return OK; } // // Encode integer , starting at bit in byte array . It will span // bytes and only the bits of the last byte (to[to_size - 1]) are // filled. See word_builder.pl for more information. // inline int WordKey::PackNumber(WordKeyNum from, char* to, int to_size, int lowbits, int lastbits) { // first byte if(lowbits) { to[0] |= ((from & WORD_BIT_MASK(8 - lowbits)) << lowbits) & 0xff; } else { to[0] = from & 0xff; } from >>= 8 - lowbits; // following bytes for(int i = 1; i < to_size; i++) { to[i] = from & 0xff; from >>= 8; } // clip the end off (clobbers anything left at the end of this byte) if(lastbits) to[to_size - 1] &= WORD_BIT_MASK(lastbits); return OK; } #undef WORD_BIT_MASK #endif /* SWIG */ #endif