diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htword')
53 files changed, 13551 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htword/.cvsignore b/debian/htdig/htdig-3.2.0b6/htword/.cvsignore new file mode 100644 index 00000000..09dc8ef2 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/.cvsignore @@ -0,0 +1,7 @@ +Makefile +*.lo +*.la +.purify +.pure +.deps +.libs diff --git a/debian/htdig/htdig-3.2.0b6/htword/Makefile.am b/debian/htdig/htdig-3.2.0b6/htword/Makefile.am new file mode 100644 index 00000000..16c6d7bc --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/Makefile.am @@ -0,0 +1,51 @@ +# +# Part of the ht://Dig package <http://www.htdig.org/> +# Copyright (c) 1999-2004 The ht://Dig Group +# For copyright details, see the file COPYING in your distribution +# or the GNU Library General Public License version 2 or later +# <http://www.gnu.org/copyleft/lgpl.html> +# +include $(top_srcdir)/Makefile.config + +LOCAL_DEFINES = + +pkglib_LTLIBRARIES = libhtword.la + +libhtword_la_SOURCES = \ + WordBitCompress.cc \ + WordContext.cc \ + WordCursor.cc \ + WordDB.cc \ + WordDBCompress.cc \ + WordDBInfo.cc \ + WordDBPage.cc \ + WordKey.cc \ + WordKeyInfo.cc \ + WordList.cc \ + WordMonitor.cc \ + WordRecord.cc \ + WordRecordInfo.cc \ + WordReference.cc \ + WordStat.cc \ + WordType.cc + +libhtword_la_LDFLAGS = -release $(HTDIG_MAJOR_VERSION).$(HTDIG_MINOR_VERSION).$(HTDIG_MICRO_VERSION) ${extra_ldflags} + +pkginclude_HEADERS = \ + WordBitCompress.h \ + WordCaseIsAStatements.h \ + WordContext.h \ + WordCursor.h \ + WordDB.h \ + WordDBCompress.h \ + WordDBInfo.h \ + WordDBPage.h \ + WordKey.h \ + WordKeyInfo.h \ + WordList.h \ + WordMonitor.h \ + WordRecord.h \ + WordRecordInfo.h \ + WordReference.h \ + WordStat.h \ + WordType.h diff --git a/debian/htdig/htdig-3.2.0b6/htword/Makefile.in b/debian/htdig/htdig-3.2.0b6/htword/Makefile.in new file mode 100644 index 00000000..f540671b --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/Makefile.in @@ -0,0 +1,544 @@ +# Makefile.in generated by automake 1.7.9 from Makefile.am. 
+# @configure_input@ + +# Copyright 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003 +# Free Software Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +# +# To compile with profiling do the following: +# +# make CFLAGS=-g CXXFLAGS=-g PROFILING=-p all +# + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ +pkgdatadir = $(datadir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +top_builddir = .. + +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +INSTALL = @INSTALL@ +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +host_triplet = @host@ +ACLOCAL = @ACLOCAL@ +ALLOCA = @ALLOCA@ +AMDEP_FALSE = @AMDEP_FALSE@ +AMDEP_TRUE = @AMDEP_TRUE@ +AMTAR = @AMTAR@ +APACHE = @APACHE@ +APACHE_MODULES = @APACHE_MODULES@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CGIBIN_DIR = @CGIBIN_DIR@ +COMMON_DIR = @COMMON_DIR@ +CONFIG_DIR = @CONFIG_DIR@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DATABASE_DIR = @DATABASE_DIR@ +DEFAULT_CONFIG_FILE = @DEFAULT_CONFIG_FILE@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +ECHO = @ECHO@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = 
@ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +F77 = @F77@ +FFLAGS = @FFLAGS@ +FIND = @FIND@ +GUNZIP = @GUNZIP@ +HAVE_SSL = @HAVE_SSL@ +HTDIG_MAJOR_VERSION = @HTDIG_MAJOR_VERSION@ +HTDIG_MICRO_VERSION = @HTDIG_MICRO_VERSION@ +HTDIG_MINOR_VERSION = @HTDIG_MINOR_VERSION@ +IMAGE_DIR = @IMAGE_DIR@ +IMAGE_URL_PREFIX = @IMAGE_URL_PREFIX@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +MAINT = @MAINT@ +MAINTAINER_MODE_FALSE = @MAINTAINER_MODE_FALSE@ +MAINTAINER_MODE_TRUE = @MAINTAINER_MODE_TRUE@ +MAKEINFO = @MAKEINFO@ +MV = @MV@ +OBJEXT = @OBJEXT@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PERL = @PERL@ +RANLIB = @RANLIB@ +RRDTOOL = @RRDTOOL@ +SEARCH_DIR = @SEARCH_DIR@ +SEARCH_FORM = @SEARCH_FORM@ +SED = @SED@ +SENDMAIL = @SENDMAIL@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +TAR = @TAR@ +TESTS_FALSE = @TESTS_FALSE@ +TESTS_TRUE = @TESTS_TRUE@ +TIME = @TIME@ +TIMEV = @TIMEV@ +USER = @USER@ +VERSION = @VERSION@ +YACC = @YACC@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_F77 = @ac_ct_F77@ +ac_ct_RANLIB = @ac_ct_RANLIB@ +ac_ct_STRIP = @ac_ct_STRIP@ +am__fastdepCC_FALSE = @am__fastdepCC_FALSE@ +am__fastdepCC_TRUE = @am__fastdepCC_TRUE@ +am__fastdepCXX_FALSE = @am__fastdepCXX_FALSE@ +am__fastdepCXX_TRUE = @am__fastdepCXX_TRUE@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = 
@build_vendor@ +datadir = @datadir@ +exec_prefix = @exec_prefix@ +extra_ldflags = @extra_ldflags@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +oldincludedir = @oldincludedir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +subdirs = @subdirs@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ + +AUTOMAKE_OPTIONS = foreign no-dependencies + +INCLUDES = -DDEFAULT_CONFIG_FILE=\"$(DEFAULT_CONFIG_FILE)\" \ + -I$(top_srcdir)/include -I$(top_srcdir)/htlib \ + -I$(top_srcdir)/htnet -I$(top_srcdir)/htcommon \ + -I$(top_srcdir)/htword \ + -I$(top_srcdir)/db -I$(top_builddir)/db \ + $(LOCAL_DEFINES) $(PROFILING) + + +HTLIBS = $(top_builddir)/htnet/libhtnet.la \ + $(top_builddir)/htcommon/libcommon.la \ + $(top_builddir)/htword/libhtword.la \ + $(top_builddir)/htlib/libht.la \ + $(top_builddir)/htcommon/libcommon.la \ + $(top_builddir)/htword/libhtword.la \ + $(top_builddir)/db/libhtdb.la \ + $(top_builddir)/htlib/libht.la + + + +# +# Part of the ht://Dig package <http://www.htdig.org/> +# Copyright (c) 1999-2004 The ht://Dig Group +# For copyright details, see the file COPYING in your distribution +# or the GNU Library General Public License version 2 or later +# <http://www.gnu.org/copyleft/lgpl.html> +# +LOCAL_DEFINES = + +pkglib_LTLIBRARIES = libhtword.la + +libhtword_la_SOURCES = \ + WordBitCompress.cc \ + WordContext.cc \ + WordCursor.cc \ + WordDB.cc \ + WordDBCompress.cc \ + WordDBInfo.cc \ + WordDBPage.cc \ + WordKey.cc \ + WordKeyInfo.cc \ + WordList.cc \ + WordMonitor.cc \ + WordRecord.cc \ + WordRecordInfo.cc \ + WordReference.cc \ + WordStat.cc \ + WordType.cc + + +libhtword_la_LDFLAGS = -release 
$(HTDIG_MAJOR_VERSION).$(HTDIG_MINOR_VERSION).$(HTDIG_MICRO_VERSION) ${extra_ldflags} + +pkginclude_HEADERS = \ + WordBitCompress.h \ + WordCaseIsAStatements.h \ + WordContext.h \ + WordCursor.h \ + WordDB.h \ + WordDBCompress.h \ + WordDBInfo.h \ + WordDBPage.h \ + WordKey.h \ + WordKeyInfo.h \ + WordList.h \ + WordMonitor.h \ + WordRecord.h \ + WordRecordInfo.h \ + WordReference.h \ + WordStat.h \ + WordType.h + +subdir = htword +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs +CONFIG_HEADER = $(top_builddir)/include/config.h +CONFIG_CLEAN_FILES = +LTLIBRARIES = $(pkglib_LTLIBRARIES) + +libhtword_la_LIBADD = +am_libhtword_la_OBJECTS = WordBitCompress.lo WordContext.lo \ + WordCursor.lo WordDB.lo WordDBCompress.lo WordDBInfo.lo \ + WordDBPage.lo WordKey.lo WordKeyInfo.lo WordList.lo \ + WordMonitor.lo WordRecord.lo WordRecordInfo.lo WordReference.lo \ + WordStat.lo WordType.lo +libhtword_la_OBJECTS = $(am_libhtword_la_OBJECTS) + +DEFAULT_INCLUDES = -I. 
-I$(srcdir) -I$(top_builddir)/include +depcomp = +am__depfiles_maybe = +CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) +LTCXXCOMPILE = $(LIBTOOL) --mode=compile $(CXX) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CXXFLAGS) $(CXXFLAGS) +CXXLD = $(CXX) +CXXLINK = $(LIBTOOL) --mode=link $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +DIST_SOURCES = $(libhtword_la_SOURCES) +HEADERS = $(pkginclude_HEADERS) + +DIST_COMMON = README $(pkginclude_HEADERS) $(srcdir)/Makefile.in \ + $(top_srcdir)/Makefile.config Makefile.am +SOURCES = $(libhtword_la_SOURCES) + +all: all-am + +.SUFFIXES: +.SUFFIXES: .cc .lo .o .obj +$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/Makefile.config $(top_srcdir)/configure.in $(ACLOCAL_M4) + cd $(top_srcdir) && \ + $(AUTOMAKE) --foreign htword/Makefile +Makefile: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe) +pkglibLTLIBRARIES_INSTALL = $(INSTALL) +install-pkglibLTLIBRARIES: $(pkglib_LTLIBRARIES) + @$(NORMAL_INSTALL) + $(mkinstalldirs) $(DESTDIR)$(pkglibdir) + @list='$(pkglib_LTLIBRARIES)'; for p in $$list; do \ + if test -f $$p; then \ + f="`echo $$p | sed -e 's|^.*/||'`"; \ + echo " $(LIBTOOL) --mode=install $(pkglibLTLIBRARIES_INSTALL) $(INSTALL_STRIP_FLAG) $$p $(DESTDIR)$(pkglibdir)/$$f"; \ + $(LIBTOOL) --mode=install $(pkglibLTLIBRARIES_INSTALL) $(INSTALL_STRIP_FLAG) $$p $(DESTDIR)$(pkglibdir)/$$f; \ + else :; fi; \ + done + +uninstall-pkglibLTLIBRARIES: + @$(NORMAL_UNINSTALL) + @list='$(pkglib_LTLIBRARIES)'; for p in $$list; do \ + p="`echo $$p | sed -e 's|^.*/||'`"; \ + echo " $(LIBTOOL) --mode=uninstall rm -f $(DESTDIR)$(pkglibdir)/$$p"; \ + $(LIBTOOL) --mode=uninstall rm -f $(DESTDIR)$(pkglibdir)/$$p; \ + done + +clean-pkglibLTLIBRARIES: + -test -z "$(pkglib_LTLIBRARIES)" || 
rm -f $(pkglib_LTLIBRARIES) + @list='$(pkglib_LTLIBRARIES)'; for p in $$list; do \ + dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \ + test "$$dir" = "$$p" && dir=.; \ + echo "rm -f \"$${dir}/so_locations\""; \ + rm -f "$${dir}/so_locations"; \ + done +libhtword.la: $(libhtword_la_OBJECTS) $(libhtword_la_DEPENDENCIES) + $(CXXLINK) -rpath $(pkglibdir) $(libhtword_la_LDFLAGS) $(libhtword_la_OBJECTS) $(libhtword_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) core *.core + +distclean-compile: + -rm -f *.tab.c + +.cc.o: + $(CXXCOMPILE) -c -o $@ `test -f '$<' || echo '$(srcdir)/'`$< + +.cc.obj: + $(CXXCOMPILE) -c -o $@ `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` + +.cc.lo: + $(LTCXXCOMPILE) -c -o $@ `test -f '$<' || echo '$(srcdir)/'`$< + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +distclean-libtool: + -rm -f libtool +uninstall-info-am: +pkgincludeHEADERS_INSTALL = $(INSTALL_HEADER) +install-pkgincludeHEADERS: $(pkginclude_HEADERS) + @$(NORMAL_INSTALL) + $(mkinstalldirs) $(DESTDIR)$(pkgincludedir) + @list='$(pkginclude_HEADERS)'; for p in $$list; do \ + if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ + f="`echo $$p | sed -e 's|^.*/||'`"; \ + echo " $(pkgincludeHEADERS_INSTALL) $$d$$p $(DESTDIR)$(pkgincludedir)/$$f"; \ + $(pkgincludeHEADERS_INSTALL) $$d$$p $(DESTDIR)$(pkgincludedir)/$$f; \ + done + +uninstall-pkgincludeHEADERS: + @$(NORMAL_UNINSTALL) + @list='$(pkginclude_HEADERS)'; for p in $$list; do \ + f="`echo $$p | sed -e 's|^.*/||'`"; \ + echo " rm -f $(DESTDIR)$(pkgincludedir)/$$f"; \ + rm -f $(DESTDIR)$(pkgincludedir)/$$f; \ + done + +ETAGS = etags +ETAGSFLAGS = + +CTAGS = ctags +CTAGSFLAGS = + +tags: TAGS + +ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) 
print i; }'`; \ + mkid -fID $$unique + +TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(ETAGS_ARGS)$$tags$$unique" \ + || $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$tags $$unique + +ctags: CTAGS +CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(CTAGS_ARGS)$$tags$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$tags $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && cd $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) $$here + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) + +top_distdir = .. +distdir = $(top_distdir)/$(PACKAGE)-$(VERSION) + +distdir: $(DISTFILES) + $(mkinstalldirs) $(distdir)/.. 
+ @srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's|.|.|g'`; \ + list='$(DISTFILES)'; for file in $$list; do \ + case $$file in \ + $(srcdir)/*) file=`echo "$$file" | sed "s|^$$srcdirstrip/||"`;; \ + $(top_srcdir)/*) file=`echo "$$file" | sed "s|^$$topsrcdirstrip/|$(top_builddir)/|"`;; \ + esac; \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + dir=`echo "$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test "$$dir" != "$$file" && test "$$dir" != "."; then \ + dir="/$$dir"; \ + $(mkinstalldirs) "$(distdir)$$dir"; \ + else \ + dir=''; \ + fi; \ + if test -d $$d/$$file; then \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \ + fi; \ + cp -pR $$d/$$file $(distdir)$$dir || exit 1; \ + else \ + test -f $(distdir)/$$file \ + || cp -p $$d/$$file $(distdir)/$$file \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) $(HEADERS) + +installdirs: + $(mkinstalldirs) $(DESTDIR)$(pkglibdir) $(DESTDIR)$(pkgincludedir) +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + `test -z '$(STRIP)' || \ + echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -rm -f $(CONFIG_CLEAN_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+clean: clean-am + +clean-am: clean-generic clean-libtool clean-pkglibLTLIBRARIES \ + mostlyclean-am + +distclean: distclean-am + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-libtool distclean-tags + +dvi: dvi-am + +dvi-am: + +info: info-am + +info-am: + +install-data-am: install-pkgincludeHEADERS + +install-exec-am: install-pkglibLTLIBRARIES + +install-info: install-info-am + +install-man: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: uninstall-info-am uninstall-pkgincludeHEADERS \ + uninstall-pkglibLTLIBRARIES + +.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ + clean-libtool clean-pkglibLTLIBRARIES ctags distclean \ + distclean-compile distclean-generic distclean-libtool \ + distclean-tags distdir dvi dvi-am info info-am install \ + install-am install-data install-data-am install-exec \ + install-exec-am install-info install-info-am install-man \ + install-pkgincludeHEADERS install-pkglibLTLIBRARIES \ + install-strip installcheck installcheck-am installdirs \ + maintainer-clean maintainer-clean-generic mostlyclean \ + mostlyclean-compile mostlyclean-generic mostlyclean-libtool pdf \ + pdf-am ps ps-am tags uninstall uninstall-am uninstall-info-am \ + uninstall-pkgincludeHEADERS uninstall-pkglibLTLIBRARIES + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. 
+.NOEXPORT: diff --git a/debian/htdig/htdig-3.2.0b6/htword/Makefile.win32 b/debian/htdig/htdig-3.2.0b6/htword/Makefile.win32 new file mode 100644 index 00000000..9f484eae --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/Makefile.win32 @@ -0,0 +1,22 @@ + +TARGET = $(LIBDIR)/libhtword$(LIBSFX) + +# ---------------------------------------------------------------------------- +# add new library members to this list + +# ---------------------------------------------------------------------------- + +include ../Makedefs.win32 + +CXXSRC = WordBitCompress.cc WordContext.cc WordCursor.cc WordDB.cc \ + WordDBCompress.cc WordDBInfo.cc WordDBPage.cc WordKey.cc \ + WordKeyInfo.cc WordList.cc WordMonitor.cc WordRecord.cc \ + WordRecordInfo.cc WordReference.cc WordStat.cc WordType.cc + +CPPFLAGS += -DHAVE_CONFIG_H -I../db -I../htcommon -I../htlib -I../htword + +$(TARGET): $(OBJDIRDEP) $(LIBDIRDEP) $(OBJS) + $(AR) $(ARFLAGS) $(OBJS) + +include ../Makerules.win32 + diff --git a/debian/htdig/htdig-3.2.0b6/htword/README b/debian/htdig/htdig-3.2.0b6/htword/README new file mode 100644 index 00000000..adb0e1af --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/README @@ -0,0 +1,11 @@ +Files: + +WordDB : Interface to berkeley DB +WordKey : key manipulation +WordRecord : record manipulation +WordReference : record and key manipulation +WordStat : derived from WordReference -> per unique word statistics +WordType : word normalisation and transformation (accents, lowercase, ...) 
+WordList : inverted index interface (word insert, word delete, list browsing) + + diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordBitCompress.cc b/debian/htdig/htdig-3.2.0b6/htword/WordBitCompress.cc new file mode 100644 index 00000000..ce4bdb54 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordBitCompress.cc @@ -0,0 +1,927 @@ +// +// WordBitCompress.cc +// +// BitStream: put and get bits into a buffer +// *tagging: add tags to keep track of the position of data +// inside the bitstream for debuging purposes. +// *freezing: saves current position. further inserts in the BitStream +// aren't really done. This way you can try different +// compression algorithms and chose the best. +// +// Compressor: BitStream with extended compression fuctionalities +// +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordBitCompress.cc,v 1.5 2004/05/28 13:15:26 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> + +#include"WordBitCompress.h" + +// ******** HtVector_byte (implementation) +#define GType byte +#define HtVectorGType HtVector_byte +#include "HtVectorGenericCode.h" + +// ******** HtVector_charptr (implementation) +#define GType charptr +#define HtVectorGType HtVector_charptr +#include "HtVectorGenericCode.h" + + + +// ************************************************** +// *************** misc functions ******************* +// ************************************************** + +// return a temporary string that merges a name and a number +char * +label_str(const char *s,int n) +{ + static char buff[1000]; + sprintf(buff,"%s%d",s,n); + return buff; +} + +// display n bits of value v +void +show_bits(int v,int n/*=16*/) +{ + int i; + if(n>0) + { + 
for(i=0;i<n;i++) + { + printf("%c",( v&(1<<(n-i-1)) ? '1':'0' ) ); + } + } + else + { + n=-n; + for(i=0;i<n;i++) + { + printf("%c",( v&(1<<(i)) ? '1':'0' ) ); + } + } +} + + + +// duplicate an array of unsigned int's +unsigned int * +duplicate(unsigned int *v,int n) +{ + unsigned int *res=new unsigned int[n]; + CHECK_MEM(res); + memcpy((void *)res,(void *)v,n*sizeof(unsigned int)); + return(res); +} + +// quick sort compare function (for unsigned int's) +int +qsort_uint_cmp(const void *a,const void *b) +{ +// printf("%12u %12u",*((unsigned int *)a),*((unsigned int *)b)); + if((*((unsigned int *)a)) > (*((unsigned int *)b))) return 1; + else + if((*((unsigned int *)a)) < (*((unsigned int *)b))) return -1; + else + return 0; +// return +// (*((unsigned int *)a)) - +// (*((unsigned int *)b)) ; +} +// quick sort an array of unsigned int's +void +qsort_uint(unsigned int *v,int n) +{ + qsort((void *)v,(unsigned int)n,sizeof(unsigned int),&qsort_uint_cmp); +} + +// log in base 2 of v +// log2(0) -> -1 +// log2(1) -> 0 +// log2(2) -> 1 +// log2(4) -> 2 +// ... +// log2(8) -> 3 +// log2(7) -> 2 +int +log2(unsigned int v) +{ + int res; + for(res=-1;v;res++){v>>=1;} + return(res); +} + + + + +// ************************************************** +// *************** VlengthCoder ******************* +// ************************************************** +// +// Compress values into a bitstream based on their probability distribution +// The probability distribution is reduced to a number of intervals. 
+// Each interval (generally) has the same probability of occuring +// values are then coded by: interval_number position_inside_interval +// this can be seen as modified version of shanon-fanno encoding +// +// Here are some aproximate calculation for estimating final coded size: +// +// n number of entries to code +// nbits maximum size in bits of entries to code +// +// SUM_interval_bit_sizes -> depends on probability dist +// total_size = table_size + coded_size +// table_size = 2^nlev * NBITS_NBITS_VAL +// coded_size = n * (nlev + SUM_interval_bit_sizes / 2^nlev ) +// +// example1: flat probability distribution : +// SUM_interval_bit_sizes = 2^nlev * log2( 2^nbits / 2^nlev) = 2^nlev * ( nbits - nlev ) +// => coded_size = n * ( nlev + nbits - nlev ) = n*nbits !! +// => coded_size is the same as if we used no compression +// this is normal, because it is not possible to compress random data +// +// example2: probability all focused in first interval except for one entry +// SUM_interval_bit_sizes = 1 + nbits +// the computations above are not valid because of integer roundofs +// => coded_size would actually be = n * 1 + nbits +// (but the code needs a few cleanups to obtain this value) +// +class VlengthCoder +{ + int nbits;// min number of bits to code all entries + int nlev;// split proba into 2^nlev parts + int nintervals;// number of intervals + + int *intervals; + unsigned int *intervalsizes; // speedup + unsigned int *lboundaries; // speedup + BitStream &bs; + +// inline unsigned int intervalsize(int i) +// { +// unsigned int res=((intervals[i] > 0 ? pow2(intervals[i]-1) : 0)); +// if(intervalsizes[i]!=res){errr("intervalsizes");} +// return res; +// } + inline unsigned int intervalsize0(int i){return((intervals[i] > 0 ? 
pow2(intervals[i]-1) : 0));} + +public: + int verbose; + + // find interval where value v resides + // fast version, this one recursively splits initial interval + inline int find_interval2(const unsigned int v,unsigned int &lboundary) + { + int i0=0; + int i1=nintervals; + int i; + for(;;) + { + if(i1==i0+1){break;} + i=(i0+i1)>>1; + lboundary=lboundaries[i]; +// if(verbose)printf("considering i0:%3d i1:%3d : i:%3d v:%12u lboundary:%12u (%12u - %12u)\n",i0,i1,i,v,lboundary,lboundaries[i0],lboundaries[i1]); + if(v<lboundary){i1=i;continue;} + else {i0=i;continue;} + + } + + lboundary=lboundaries[i0]; +// i=i0; +// unsigned int sboundary=lboundary+intervalsizes[i]; +// if(!( (lboundary!=sboundary && v>=lboundary && v<sboundary) || +// (lboundary==sboundary && v==lboundary) )) +// { +// printf("interval fd:i0:%3d i1:%3d : i:%3d v:%12u lboundary:%12u (%12u - %12u)\n",i0,i1,i,v,lboundary,lboundaries[i0],lboundaries[i1]); +// errr("bad interval"); +// } + return i0; + } + + // find interval where value v resides + // slow version, this tries every interval + inline int find_interval(const unsigned int v,unsigned int &lboundary) + { + // SPEED CRITICAL SECTION + register int i; + register unsigned int sboundary=0; + lboundary=0; + for(i=0;i<nintervals-1;i++) + { +// if(i>=nintervals){errr("code argh!");} + sboundary=lboundary+intervalsizes[i]; +// printf("nintervals:%3d i:%3d : %12u ... %12u : %12u\n",nintervals,i,lboundary,sboundary,v); + if( (lboundary!=sboundary && v>=lboundary && v<sboundary) || + (lboundary==sboundary && v==lboundary) ){break;} + lboundary=sboundary; + } + + return i; + } + + // compress and insert a value into the bitstream + inline void code(unsigned int v) + { + unsigned int lboundary=0; + // SPEED CRITICAL SECTION + int i; +// i=find_interval(v,lboundary); + i=find_interval2(v,lboundary); + // were in the i'th interval; + bs.put_uint(i,nlev,"int");// store interval + const int bitsremaining=(intervals[i]>0 ? 
intervals[i]-1 : 0); +// if(verbose>1)printf("v:%6d interval:%2d (%5d - %5d) bitsremaining:%2d ",v,i,lboundary,sboundary,bitsremaining); + v-=lboundary; +// if(verbose>1)printf("remain:%6d totalbits:%2d\n",v,bitsremaining+nlev); + bs.put_uint(v,bitsremaining,"rem"); + } + // get and uncompress a value from the bitstream + inline unsigned int get() + { + // SPEED CRITICAL SECTION + int i=bs.get_uint(nlev,"int");// get interval +// if(verbose>1)printf("get:interval:%2d ",i); + const int bitsremaining=(intervals[i]>0 ? intervals[i]-1 : 0); +// if(verbose>1)printf("bitsremain:%2d ",bitsremaining); + unsigned int v=bs.get_uint(bitsremaining,"rem"); +// if(verbose>1)printf("v0:%3d ",v); +// unsigned int lboundary=0; + v+=lboundaries[i]; +// for(int j=0;j<i;j++){lboundary+=intervalsizes[j];} +// v+=lboundary; +// if(verbose>1)printf("lboundary:%5d v:%5d \n",lboundaries[i],v); + return(v); + } + + + // insert the packed probability distrbution into the bitstream + void code_begin(); + // get the packed probability distrbution from the bitstream + void get_begin(); + + void make_lboundaries(); + + VlengthCoder(BitStream &nbs,int nverbose=0); + + ~VlengthCoder() + { + delete [] lboundaries; + delete [] intervals; + delete [] intervalsizes; + } + + // create VlengthCoder and its probability distrbution from an array of values + VlengthCoder(unsigned int *vals,int n,BitStream &nbs,int nverbose=0); +}; + +void +VlengthCoder::code_begin() +{ + int i; + bs.add_tag("VlengthCoder:Header"); + bs.put_uint(nbits,NBITS_NBITS_VAL,"nbits"); + bs.put_uint(nlev,5,"nlev"); + for(i=0;i<nintervals;i++) + { + bs.put_uint(intervals[i],NBITS_NBITS_VAL,label_str("interval",i)); + } +} +void +VlengthCoder::get_begin() +{ + int i; + nbits=bs.get_uint(NBITS_NBITS_VAL,"nbits"); + if(verbose>1)printf("get_begin nbits:%d\n",nbits); + nlev=bs.get_uint(5,"nlev"); + if(verbose>1)printf("get_begin nlev:%d\n",nlev); + nintervals=pow2(nlev); + + intervals=new int [nintervals]; + CHECK_MEM(intervals); + 
intervalsizes=new unsigned int [nintervals]; + CHECK_MEM(intervalsizes); + lboundaries=new unsigned int [nintervals+1]; + CHECK_MEM(lboundaries); + + for(i=0;i<nintervals;i++) + { + intervals[i]=bs.get_uint(NBITS_NBITS_VAL,label_str("interval",i)); + intervalsizes[i]=intervalsize0(i); + if(verbose>1)printf("get_begin intervals:%2d:%2d\n",i,intervals[i]); + } + make_lboundaries(); +} +void +VlengthCoder::make_lboundaries() +{ + unsigned int lboundary=0; + for(int j=0;j<=nintervals;j++) + { + lboundaries[j]=lboundary; + if(j<nintervals){lboundary+=intervalsizes[j];} + } +} + +VlengthCoder::VlengthCoder(BitStream &nbs,int nverbose/*=0*/):bs(nbs) +{ + verbose=nverbose; + nbits=0; + nlev=0; + nintervals=0; + intervals=NULL; +} + +int debug_test_nlev=-1; + +VlengthCoder::VlengthCoder(unsigned int *vals,int n,BitStream &nbs,int nverbose/*=0*/):bs(nbs) +{ + verbose=nverbose; + unsigned int *sorted=duplicate(vals,n); + qsort_uint(sorted,n); + + nbits=num_bits(HtMaxMin::max_v(vals,n)); + + // **** heuristics to determine best nlev + // force table size to be less than 1/10 of the maximum coded size + nlev=num_bits((n*nbits)/(10*NBITS_NBITS_VAL)); + // sanity + if(nlev>=nbits){nlev=nbits-1;} + // nlev at least 1 + if(nlev<1){nlev=1;} + + if(debug_test_nlev>=0){nlev=debug_test_nlev;} + nintervals=pow2(nlev); + int i; + + intervals=new int [nintervals]; + CHECK_MEM(intervals); + intervalsizes=new unsigned int [nintervals]; + CHECK_MEM(intervalsizes); + lboundaries=new unsigned int [nintervals+1]; + CHECK_MEM(lboundaries); + + if(verbose>1)printf("nbits:%d nlev:%d nintervals:%d \n",nbits,nlev,nintervals); + + if(verbose>10) + { + printf("vals;\n"); + for(i=0;i<n;i++) + { + printf("%12u ",vals[i]); + } + printf("\nsorted:\n"); + for(i=0;i<n;i++) + { + printf("%12u ",sorted[i]); + } + printf("\n"); + } + + // find split boundaires + unsigned int lboundary=0; + unsigned int boundary; + for(i=0;i<nintervals-1;i++) + { + boundary=sorted[(n*(i+1))/nintervals]; + 
intervals[i]=1+log2(boundary-lboundary); + intervalsizes[i]=intervalsize0(i); + if(0 || verbose>1)printf("intnum%02d begin:%5u end:%5u len:%5u (code:%2d) real upper boundary: real:%5u\n",i,lboundary,intervalsizes[i]+lboundary,intervalsizes[i],intervals[i],boundary); + lboundary+=intervalsizes[i]; + } + boundary=sorted[n-1]; + intervals[i]=1+log2(boundary-lboundary)+1; + intervalsizes[i]=intervalsize0(i); + if(0 || verbose>1)printf("intnum%02d begin:%5u end:%5u len:%5u (code:%2d) real upper boundary: real:%5u\n",i,lboundary,intervalsizes[i]+lboundary,intervalsizes[i],intervals[i],boundary); + if(0 || verbose>1)printf("\n"); + + make_lboundaries(); + + int SUM_interval_bit_sizes=0; + for(i=0;i<nintervals;i++) + { + SUM_interval_bit_sizes+=intervals[i]; + } + if(verbose)printf("SUM_interval_bit_sizes:%d\n",SUM_interval_bit_sizes); + delete [] sorted; +} + + +// ************************************************** +// *************** BitStream *********************** +// ************************************************** + +void +BitStream::put_zone(byte *vals,int n,const char *tag) +{ + add_tag(tag); + for(int i=0;i<(n+7)/8;i++){put_uint(vals[i],TMin(8,n-8*i),NULL);} +} +void +BitStream::get_zone(byte *vals,int n,const char *tag) +{ + check_tag(tag); + for(int i=0;i<(n+7)/8;i++){vals[i]=get_uint(TMin(8,n-8*i));} +} + +void +BitStream::put_uint(unsigned int v,int n,const char *tag/*="NOTAG"*/) +{ + // SPEED CRITICAL SECTION + if(freezeon){bitpos+=n;return;} + add_tag(tag); + + if(!n){return;} + + // 1) + int bpos0= bitpos & 0x07; +// printf("bpos0:%3d bitpos:%5d:%5d n:%4d val:%x\n",bpos0,bitpos,buff.size()*8,n,v); + if(bpos0 + n <8) + { +// printf("simple case:"); +// ::show_bits(v,n); +// printf("\n"); + // simplest case it all fits + buff.back()|=v<<bpos0; + bitpos+=n; + if(! 
(bitpos & 0x07) ) + {buff.push_back(0);}// new byte + return; + } + else + { + const int ncentral=((bpos0 + n)>>3)-1; + // put first + buff.back()|=((v & 0xff)<<bpos0) & 0xff; + const int nbitsinfirstbyte=8-bpos0; + +// printf("normal case :(%x:%x)",((v & 0xff)<<bpos0) & 0xff,buff.back()); +// ::show_bits(((v & 0xff)<<bpos0) & 0xff,-8); +// printf(" "); + + + v>>=nbitsinfirstbyte; +// printf(" (v:%x)",v); + // put central + for(int i=ncentral;i;i--) + { + buff.push_back(0); + buff.back()= v & 0xff ; +// ::show_bits(v & 0xff,-8); +// printf(" "); + v>>=8; + } + // put last + const int nbitsremaining=n-( (ncentral<<3)+nbitsinfirstbyte ); + if(nbitsremaining) + { + buff.push_back(0); + buff.back()=v & (pow2(nbitsremaining+1)-1); + +// printf(" (v:%x:%x)",v & (pow2(nbitsremaining+1)-1),buff.back()); +// ::show_bits(v & (pow2(nbitsremaining+1)-1),-nbitsremaining); +// printf("\n"); + } + if(!(nbitsremaining & 0x07)){buff.push_back(0);} + bitpos+=n; +// printf("nbitsinfirstbyte:%d ncentral:%d nbitsremaining:%d\n",nbitsinfirstbyte,ncentral,nbitsremaining); + + } +// printf("cuurent put order:"); +// for(i=0;i<n;i++) +// { +// printf("%c",((v0& pow2(i) ? 
'1':'0'))); +// } +// printf("\n"); +} + + + + +unsigned int +BitStream::get_uint(int n,const char *tag/*=NULL*/) +{ + // SPEED CRITICAL SECTION + if(check_tag(tag)==NOTOK){errr("BitStream::get(int) check_tag failed");} + if(!n){return 0;} + + unsigned int res=0; + + // 1) + int bpos0= bitpos & 0x07; + +// printf("bpos0:%3d bitpos:%5d n:%4d %s\n",bpos0,bitpos,n,tag); +// printf("input:\n"); +// for(int j=0;j<(bpos0+n+7)/8;j++){printf("%x",buff[bitpos/8+j]);} +// printf("\n"); + + if(bpos0 + n <8) + { + // simplest case it all fits + res=(buff[bitpos>>3]>>bpos0) & (pow2(n)-1); + bitpos+=n; +// printf("simple case:res:%x\n",res); + return res; + } + else + { + int bytepos=bitpos>>3; + const int ncentral=((bpos0 + n)>>3)-1; + // put first + res=(buff[bytepos]>>bpos0) & 0xff; +// printf("normal case:res0:%x\n",res); + + const int nbitsinfirstbyte=8-bpos0; + + bytepos++; + // put central + if(ncentral) + { + unsigned int v=0; + for(int i=ncentral-1;i>=0;i--) + { + v|=buff[bytepos+i]&0xff; + if(i)v<<=8; +// printf(" resC%d:v:%x\n",i,v); + } + bytepos+=ncentral; + res|=v<<nbitsinfirstbyte; +// printf(" :resC:%x\n",res); + } + // put last + const int nbitsremaining=n-( (ncentral<<3)+nbitsinfirstbyte ); + if(nbitsremaining) + { + res|=((unsigned int)(buff[bytepos] & (pow2(nbitsremaining)-1) )) << (nbitsinfirstbyte +((bytepos-(bitpos>>3)-1)<<3)); +// printf(" :resR:%x buff[%d]:%x %d\n",res,bytepos,buff[bytepos], +// (nbitsinfirstbyte +((bytepos-(bitpos>>3)-1)<<3))); + } + + bitpos+=n; +// printf("nbitsinfirstbyte:%d ncentral:%d nbitsremaining:%d\n",nbitsinfirstbyte,ncentral,nbitsremaining); + return res; + } +} +#ifdef NOTDEF +unsigned int +BitStream::get(int n,const char *tag/*=NULL*/) +{ + if(check_tag(tag)==NOTOK){errr("BitStream::get(int) check_tag failed");} + unsigned int res=0; + for(int i=0;i<n;i++) + { + if(get()){res|=pow2(i);} + } + return(res); +} +#endif +void +BitStream::freeze() +{ + freeze_stack.push_back(bitpos); + freezeon=1; +} + +int 
+BitStream::unfreeze()
+{
+    // Pop the most recent freeze() position.  Returns the number of bits
+    // that were (virtually) appended while the stream was frozen, and
+    // rewinds bitpos so that those bits are discarded.
+    int size0=bitpos;
+    bitpos=freeze_stack.back();
+    freeze_stack.pop_back();
+    size0-=bitpos;
+    // Writing becomes real again only once every nested freeze is popped.
+    if(freeze_stack.size()==0){freezeon=0;}
+    return(size0);
+}
+void
+BitStream::add_tag1(const char *tag)
+{
+    // Record a debugging tag at the current bit position.  No-op when
+    // tagging is disabled, when the stream is frozen, or for a NULL tag.
+    // The tag string is strdup()ed; ~BitStream free()s it.
+    if(!use_tags){return;}
+    if(freezeon){return;}
+    if(!tag){return;}
+    tags.push_back(strdup(tag));
+    tagpos.push_back(bitpos);
+}
+
+int
+BitStream::check_tag1(const char *tag,int pos/*=-1*/)
+{
+    // Verify that 'tag' was recorded at bit position 'pos' (current
+    // position when pos==-1).  Returns OK/NOTOK; on mismatch, dumps the
+    // stream and reports whether the tag exists at another position.
+    if(!use_tags){return OK;}
+    if(!tag){return OK;}
+    int found=-1;
+    int ok=0;
+    if(pos==-1){pos=bitpos;}
+    for(int i=0;i<tags.size();i++)
+    {
+	if(!strcmp(tags[i],tag))
+	{
+	    // Remember the last position this tag was seen at, for the
+	    // error report below.
+	    found=tagpos[i];
+	    if(tagpos[i]==pos){ok=1;break;}
+	}
+    }
+    if(!ok)
+    {
+	show();
+	if(found>=0)
+	{
+	    printf("ERROR:BitStream:bitpos:%4d:check_tag: found tag %s at %d expected it at %d\n",bitpos,tag,found,pos);
+	}
+	else
+	{
+	    printf("ERROR:BitStream:bitpos:%4d:check_tag: tag %s not found, expected it at %d\n",bitpos,tag,pos);
+	}
+	return(NOTOK);
+    }
+    return(OK);
+}
+
+int
+BitStream::find_tag(const char *tag)
+{
+    // Index of the first tag matching 'tag', or -1 when absent.
+    int i;
+    for(i=0;i<tags.size() && strcmp(tag,tags[i]);i++);
+    if(i==tags.size()){return -1;}
+    else{return i;}
+}
+int
+BitStream::find_tag(int pos,int posaftertag/*=1*/)
+{
+    // Find the tag nearest to bit position 'pos'.  With posaftertag set,
+    // back up to the last tag at or before 'pos'; otherwise return the
+    // first tag at or after it.  Returns -1 when no tag qualifies.
+    int i;
+    for(i=0;i<tags.size() && tagpos[i]<pos;i++);
+    if(i==tags.size()){return -1;}
+    if(!posaftertag){return i;}
+    // BUGFIX: test i>=0 BEFORE indexing tagpos[i]; the original order
+    // ("tagpos[i]>pos && i>=0") read tagpos[-1] out of bounds whenever
+    // every recorded tag lies strictly after 'pos'.
+    for(;i>=0 && tagpos[i]>pos;i--);
+    return(i);
+}
+
+void
+BitStream::show_bits(int a,int n)
+{
+    // Print bits [a, a+n) of the buffer as '0'/'1' characters
+    // (least-significant bit of each byte first).
+    for(int b=a;b<a+n;b++)
+    {
+	printf("%c",(buff[b/8] & (1<<(b%8)) ? '1' : '0'));
+    }
+}
+void
+BitStream::show(int a/*=0*/,int n/*=-1*/)
+{
+    int all=(n<0 ?
1 : 0); + if(n<0){n=bitpos-a;} + int i; + + if(all) + { + printf("BitStream::Show: ntags:%d size:%4d buffsize:%6d ::: ",tags.size(),size(),buffsize()); +// for(i=0;i<tags.size();i++){printf("tag:%d:%s:pos:%d\n",i,tags[i],tagpos[i]);} + } + + int t=find_tag(a,0); + if(t<0){show_bits(a,n);return;} + for(i=a;i<a+n;i++) + { + for(;t<tags.size() && tagpos[t]<i+1;t++) + { + printf("# %s:%03d:%03d #",tags[t],tagpos[t],n); + } + show_bits(i,1); + } + if(all){printf("\n");} + +} +byte * +BitStream::get_data() +{ + byte *res=(byte *)malloc(buff.size()); + CHECK_MEM(res); + for(int i=0;i<buff.size();i++){res[i]=buff[i];} + return(res); +} +void +BitStream::set_data(const byte *nbuff,int nbits) +{ + if(buff.size()!=1 || bitpos!=0) + { + printf("BitStream:set_data: size:%d bitpos:%d\n",buff.size(),bitpos); + errr("BitStream::set_data: valid only if BitStream is empty"); + } + buff[0] = nbuff[0]; + for(int i=1;i<(nbits+7)/8;i++){buff.push_back(nbuff[i]);} + bitpos=nbits; +} + + + +// ************************************************** +// *************** Compressor *********************** +// ************************************************** + + +void +Compressor::put_uint_vl(unsigned int v,int maxn,const char *tag/*="NOTAG"*/) +{ + int nbits=num_bits(v); + put_uint(nbits,num_bits(maxn),tag); + if(nbits){put_uint(v,nbits,(char *)NULL);} +} +unsigned int +Compressor::get_uint_vl(int maxn,const char *tag/*=NULL*/) +{ + int nbits=get_uint(num_bits(maxn),tag); + if(!nbits){return 0;} + else{return(get_uint(nbits,(char *)NULL));} +} + +int +Compressor::put_vals(unsigned int *vals,int n,const char *tag) +{ + int cpos=bitpos; + add_tag(tag); + if(n>=pow2(NBITS_NVALS)){errr("Compressor::put(uint *,nvals) : overflow: nvals>2^16");} + put_uint_vl(n,NBITS_NVALS,"size"); + if(n==0){return NBITS_NVALS;} + + int sdecr=2; + int sfixed=1; + + int nbits=num_bits(HtMaxMin::max_v(vals,n)); + if(verbose)printf("*********************put_vals:n:%3d nbits:%3d\n",n,nbits); + + int i; + if(verbose) + { 
+ printf("TTT:n:%3d nbits:%3d\n",n,nbits); + for(i=1;i<7;i++) + { + debug_test_nlev=i; + printf("trying nlev:%3d\n",debug_test_nlev); + freeze(); + put_decr(vals,n); + int fndsz=unfreeze(); + printf("TTT:nlev:%2d try size:%4d\n",i,fndsz); + } + debug_test_nlev=-1; + } + + if(n>15 && nbits>3) + { + freeze(); + put_decr(vals,n); + sdecr=unfreeze(); + + freeze(); + put_fixedbitl(vals,n); + sfixed=unfreeze(); + } + + if(verbose)printf("put_vals:n:%3d sdecr:%6d sfixed:%6d rap:%f\n",n,sdecr,sfixed,sdecr/(float)sfixed); + if(sdecr<sfixed) + { + if(verbose)printf("put_vals: comptyp:0\n"); + put_uint(0,2,"put_valsCompType"); + put_decr(vals,n); + } + else + { + if(verbose)printf("put_vals: comptyp:1\n"); + put_uint(1,2,"put_valsCompType"); + put_fixedbitl(vals,n); + } + + if(verbose)printf("------------------------------put_vals over\n"); + + return(bitpos-cpos); +} + +int +Compressor::get_vals(unsigned int **pres,const char *tag/*="BADTAG!"*/) +{ + if(check_tag(tag)==NOTOK){errr("Compressor::get_vals(unsigned int): check_tag failed");} + int n=get_uint_vl(NBITS_NVALS); + if(verbose>1)printf("get_vals n:%d\n",n); + if(!n){*pres=NULL;return 0;} + + if(verbose)printf("get_vals: n:%3d\n",n); + unsigned int *res=new unsigned int[n]; + CHECK_MEM(res); + + + int comptype=get_uint(2,"put_valsCompType"); + if(verbose)printf("get_vals:comptype:%d\n",comptype); + switch(comptype) + { + case 0: get_decr(res,n); + break; + case 1: get_fixedbitl(res,n); + break; + default: errr("Compressor::get_vals invalid comptype");break; + } +// get_fixedbitl(res,n); +// get_decr(res,n); + + *pres=res; + return(n); +} + + +int +Compressor::put_fixedbitl(byte *vals,int n,const char *tag) +{ + int cpos=bitpos; + int i,j; + add_tag(tag); + + put_uint_vl(n,NBITS_NVALS,"size"); + if(n==0){return 0;} + + byte maxv=vals[0]; + for(i=1;i<n;i++) + { + byte v=vals[i]; + if(v>maxv){maxv=v;} + } + int nbits=num_bits(maxv); + if(n>=pow2(NBITS_NVALS)){errr("Compressor::put_fixedbitl(byte *) : overflow: 
nvals>2^16");}
+    put_uint(nbits,NBITS_NBITS_CHARVAL,"nbits");
+    add_tag("data");
+    // Fixed-width emission: each byte is written LSB-first using exactly
+    // 'nbits' bits, where nbits covers the largest value in the array.
+    for(i=0;i<n;i++)
+    {
+	byte v=vals[i];
+	for(j=0;j<nbits;j++) {put(v&pow2(j));}
+    }
+    return(bitpos-cpos);
+}
+void
+Compressor::put_fixedbitl(unsigned int *vals,int n)
+{
+    // Fixed-bit-length coder for unsigned ints: store the common bit
+    // width once, then each value with that width.  Counterpart of
+    // get_fixedbitl(unsigned int *,int) below; the two must stay in sync.
+    int nbits=num_bits(HtMaxMin::max_v(vals,n));
+
+    put_uint_vl(nbits,NBITS_NBITS_VAL,"nbits");
+    add_tag("data");
+    if(verbose)printf("put_fixedbitl:nbits:%4d nvals:%6d\n",nbits,n);
+    for(int i=0;i<n;i++)
+    {
+	put_uint(vals[i],nbits,NULL);
+    }
+}
+
+void
+Compressor::get_fixedbitl(unsigned int *res,int n)
+{
+    // Inverse of put_fixedbitl(unsigned int *,int): read the stored bit
+    // width, then n values of that width.  Caller supplies n (it was
+    // encoded separately by put_vals).
+    int nbits=get_uint_vl(NBITS_NBITS_VAL);
+    if(verbose)printf("get_fixedbitl(uint):n%3d nbits:%2d\n",n,nbits);
+    int i;
+    for(i=0;i<n;i++)
+    {
+	res[i]=get_uint(nbits);
+    }
+}
+int
+Compressor::get_fixedbitl(byte **pres,const char *tag/*="BADTAG!"*/)
+{
+    // Inverse of put_fixedbitl(byte *,int,const char *): reads the count,
+    // the bit width, then the values.  Allocates *pres with new[] (caller
+    // owns it); leaves *pres NULL and returns 0 for an empty array.
+    if(check_tag(tag)==NOTOK){errr("Compressor::get_fixedbitl(byte *): check_tag failed");}
+    int n=get_uint_vl(NBITS_NVALS);
+    if(!n){*pres=NULL;return 0;}
+    int nbits=get_uint(NBITS_NBITS_CHARVAL);
+    if(verbose)printf("get_fixedbitl(byte):n%3d nbits:%2d\n",n,nbits);
+    int i;
+    byte *res=new byte[n];
+    CHECK_MEM(res);
+    for(i=0;i<n;i++)
+    {
+	res[i]=get_uint(nbits);
+    }
+    *pres=res;
+    return(n);
+}
+
+void
+Compressor::put_decr(unsigned int *vals,int n)
+{
+    // Sophisticated coder: delegate to VlengthCoder, which picks
+    // per-interval bit lengths from the value distribution (see
+    // the VlengthCoder constructor above for the heuristics).
+    VlengthCoder coder(vals,n,*this,verbose);
+    coder.code_begin();
+    int i;
+    for(i=0;i<n;i++){coder.code(vals[i]);}
+}
+void
+Compressor::get_decr(unsigned int *res,int n)
+{
+    // Inverse of put_decr: rebuild the coder state from the stream
+    // (get_begin), then decode n values in order.
+    VlengthCoder coder(*this,verbose);
+    coder.get_begin();
+    int i;
+    for(i=0;i<n;i++)
+    {
+	res[i]=coder.get();
+	if(verbose>1){printf("get_decr:got:%8d\n",res[i]);}
+    }
+}
diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordBitCompress.h b/debian/htdig/htdig-3.2.0b6/htword/WordBitCompress.h
new file mode 100644
index 00000000..19f2c336
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htword/WordBitCompress.h
@@ -0,0 +1,267 @@
+//
+// WordBitCompress.h
+//
+// BitStream: put and get bits into a buffer
+// *tagging: add tags to keep track of
the position of data +// inside the bitstream for debuging purposes. +// *freezing: saves current position. further inserts in the BitStream +// aren't really done. This way you can try different +// compression algorithms and chose the best. +// +// Compressor: BitStream with extended fuctionalities +// +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordBitCompress.h,v 1.7 2004/05/28 13:15:26 lha Exp $ +// + +#ifndef _WordBitCompress_h +#define _WordBitCompress_h + +#include<stdio.h> +#include<stdlib.h> +#include"HtVector_int.h" +#include"HtMaxMin.h" + +typedef unsigned char byte; +// ******** HtVector_byte (header) +#define GType byte +#define HtVectorGType HtVector_byte +#include "HtVectorGeneric.h" + +typedef char * charptr; +// ******** HtVector_charptr (header) +#define GType charptr +#define HtVectorGType HtVector_charptr +#include "HtVectorGeneric.h" + + +// ******** Utility inline functions and macros + +// error checking +#define FATAL_ABORT fflush(stdout);fprintf(stderr,"FATAL ERROR at file:%s line:%d !!!\n",__FILE__,__LINE__);fflush(stderr);(*(int *)NULL)=1 +#define errr(s) {fprintf(stderr,"FATAL ERROR:%s\n",s);FATAL_ABORT;} +#define CHECK_MEM(p) if(!p) errr("mifluz: Out of memory!"); +// max/min of 2 values +#define TMax(a,b) (((a)>(b)) ? (a) : (b)) +#define TMin(a,b) (((a)<(b)) ? 
(a) : (b)) + +// compute integer log2 +// == minimum number of bits needed to code value +inline int +num_bits(unsigned int maxval ) +{ + unsigned int mv=maxval; + int nbits; + for(nbits=0;mv;nbits++){mv>>=1;} + return(nbits); +} +// compute 2^x +#define pow2(x) (1<<(x)) + + +// function declarations +char *label_str(const char *s,int n); +void show_bits(int v,int n=16); + +// unsigned short max_v(unsigned short *vals,int n); +// unsigned int max_v(unsigned int *vals,int n); +// unsigned short min_v(unsigned short *vals,int n); +// unsigned int min_v(unsigned int *vals,int n); + + + + + +// ************************************************** +// *************** BitStream *********************** +// ************************************************** +// compression is done in Compressor not in BitStream +class BitStream +{ +protected: + + // the buffer were the bitstream is stored + HtVector_byte buff; + + // current bit position within the buffer + int bitpos; + + // tags for debuging + HtVector_int tagpos; + HtVector_charptr tags; + int use_tags; + + // freezing the bitstream + HtVector_int freeze_stack; + int freezeon; +public: + void freeze(); + int unfreeze(); + + // puts a bit into the bitstream + inline void put(unsigned int v) + { + // SPEED CRITICAL SECTION + if(freezeon){bitpos++;return;} + if(v){buff.back()|=pow2(bitpos & 0x07);} + bitpos++; + if(!(bitpos & 0x07))// new byte + { + buff.push_back(0); + } + } + inline void put(unsigned int v,const char *tag) + { + if(!freezeon){add_tag(tag);} + put(v); + } + + // gets a bit from the bitstream + inline byte get(const char *tag=(char*)NULL) + { + // SPEED CRITICAL SECTION + if(check_tag(tag)==NOTOK){errr("BitStream::get() check_tag failed");} + if(bitpos>=(buff.size()<<3)){errr("BitStream::get reading past end of BitStream!");} + byte res=buff[bitpos>>3] & pow2(bitpos & 0x07); +// printf("get:res:%d bitpos:%5d/%d buff[%3d]=%x\n",res,bitpos,bitpos%8,bitpos/8,buff[bitpos/8]); + bitpos++; + return(res); + } + + 
// get/put an integer using n bits + void put_uint(unsigned int v,int n,const char *tag=(char*)"NOTAG"); + unsigned int get_uint( int n,const char *tag=(char*)NULL); + + // get/put n bits of data stored in vals + void put_zone(byte *vals,int n,const char *tag); + void get_zone(byte *vals,int n,const char *tag); + + // + inline void add_tag(const char *tag) + { + if(!use_tags || !tag || freezeon){return;} + add_tag1(tag); + } + void add_tag1(const char *tag); + inline int check_tag(const char *tag,int pos=-1) + { + if(!use_tags || !tag){return OK;} + return(check_tag1(tag,pos)); + } + int check_tag1(const char *tag,int pos); + void set_use_tags(){use_tags=1;} + int find_tag(const char *tag); + int find_tag(int pos,int posaftertag=1); + + void show_bits(int a,int n); + void show(int a=0,int n=-1); + + // position accesors + int size(){return(bitpos);} + int buffsize(){return(buff.size());} + + // get a copy of the buffer + byte *get_data(); + // set the buffer from outside data (current buffer must be empty) + void set_data(const byte *nbuff,int nbits); + + // use this for reading a BitStream after you have written in it + // (generally for debuging) + void rewind(){bitpos=0;} + + ~BitStream() + { + int i; + for(i=0;i<tags.size();i++){free(tags[i]);} + } + BitStream(int size0) + { + buff.reserve((size0+7)/8); + init(); + } + BitStream() + { + init(); + } + private: + void init() + { + bitpos=0; + buff.push_back(0); + freezeon=0; + use_tags=0; + } +}; + + +// ************************************************** +// *************** Compressor *********************** +// ************************************************** + +// Constants used by Compressor +// number of bits to code the number of values in an array +#define NBITS_NVALS 16 +// number of bits to code the values in an unsigned int array (=sizeof(unsigned int)) +#define NBITS_VAL 32 +// number of bits to code he number of bits used by an unsigned int value +#define NBITS_NBITS_VAL 5 +// number of bits to code 
the number of bits used by a byte value +#define NBITS_NBITS_CHARVAL 4 + +class Compressor : public BitStream +{ +public: + int verbose; + // get/put an integer using a variable number of bits + void put_uint_vl(unsigned int v,int maxn,const char *tag=(char*)"NOTAG"); + unsigned int get_uint_vl( int maxn,const char *tag=(char*)NULL); + + // get/put an integer checking for an expected value + void put_uint_ex(unsigned int v,unsigned int ex,int maxn,const char *tag=(char*)"NOTAG") + { + if(v==ex){put(1,tag);} + else{put(0,tag);put_uint(v,maxn,(char*)NULL);} + } + unsigned int get_uint_ex( unsigned int ex,int maxn,const char *tag=(char*)NULL) + { + if(get(tag)){return ex;} + else{return get_uint(maxn,(char*)NULL);} + } + + + // compress/decompress an array of unsigned ints (choosing best method) + int put_vals(unsigned int *vals,int n,const char *tag); + int get_vals(unsigned int **pres,const char *tag=(char*)"BADTAG!"); + + // compress/decompress an array of bytes (very simple) + int put_fixedbitl(byte *vals,int n,const char *tag); + int get_fixedbitl(byte **pres,const char *tag=(char*)"BADTAG!"); + + // compress/decompress an array of unsigned ints (very simple) + void get_fixedbitl(unsigned int *res,int n); + void put_fixedbitl(unsigned int *vals,int n); + + // compress/decompress an array of unsigned ints (sophisticated) + void get_decr(unsigned int *res,int n); + void put_decr(unsigned int *vals,int n); + + Compressor():BitStream() + { + verbose=0; + } + Compressor(int size0):BitStream(size0) + { + verbose=0; + } + +}; + + + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordCaseIsAStatements.h b/debian/htdig/htdig-3.2.0b6/htword/WordCaseIsAStatements.h new file mode 100644 index 00000000..2046ee2f --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordCaseIsAStatements.h @@ -0,0 +1,26 @@ +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your 
distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// switch between unknown numerical types +// example usage: +// -------------------------- +// switch(word_key_info.sort[position].type) +// { +//#define STATEMENT(type) case WORD_ISA_##type:pool_##type[word_key_info.sort[position].index]=val;break +//#include"WordCaseIsAStatements.h" +// } +// -------------------------- +#ifdef WORD_HAVE_TypeA + STATEMENT(TypeA); +#endif /* WORD_HAVE_TypeA */ +#ifdef WORD_HAVE_TypeB + STATEMENT(TypeB); +#endif /* WORD_HAVE_TypeB */ +#ifdef WORD_HAVE_TypeC + STATEMENT(TypeC); +#endif /* WORD_HAVE_TypeC */ +#undef STATEMENT diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordContext.cc b/debian/htdig/htdig-3.2.0b6/htword/WordContext.cc new file mode 100644 index 00000000..490c9361 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordContext.cc @@ -0,0 +1,107 @@ +// +// WordContext.cc +// +// WordContext: call Initialize for all classes that need to. +// This will enable the Instance() static member +// of each to return a properly allocated and configured +// object. 
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1999-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: WordContext.cc,v 1.5 2004/05/28 13:15:26 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <errno.h>
+
+#include "WordContext.h"
+#include "WordType.h"
+#include "WordKeyInfo.h"
+#include "WordDBInfo.h"
+#include "WordRecord.h"
+#include "WordMonitor.h"
+
+// Initialize every singleton the mifluz library depends on, in
+// dependency order.  Must run before any WordList/WordCursor is built.
+void WordContext::Initialize(Configuration &config)
+{
+// Without zlib there is nothing to compress with: force the option off
+// so the rest of the library never attempts it.
+#if !defined(HAVE_LIBZ) || !defined(HAVE_ZLIB_H)
+  config.Add("wordlist_compress", "false");
+#endif
+
+  WordType::Initialize(config);
+  WordKeyInfo::Initialize(config);
+  WordRecordInfo::Initialize(config);
+  WordDBInfo::Initialize(config);
+  // The monitor singleton is optional; only create it on request.
+  if(config.Boolean("wordlist_monitor"))
+    WordMonitor::Initialize(config);
+}
+
+// Build a Configuration from $MIFLUZ_CONFIG or ~/.mifluz, then call
+// Initialize(Configuration&) with it.  Returns the new Configuration
+// (caller owns it), or 0 when no file was found and no defaults given.
+Configuration *WordContext::Initialize(const ConfigDefaults* config_defaults /* = 0 */)
+{
+  Configuration *config = new Configuration();
+
+  if(config_defaults)
+    config->Defaults(config_defaults);
+
+  String filename;
+  //
+  // Check file pointed by MIFLUZ_CONFIG environment variable
+  //
+  if(getenv("MIFLUZ_CONFIG")) {
+    filename << getenv("MIFLUZ_CONFIG");
+    struct stat statbuf;
+    if(stat((char*)filename, &statbuf) < 0) {
+      // ENOENT is an expected outcome (no config file); only complain
+      // about other stat() failures, then fall through to ~/.mifluz.
+      if(errno != ENOENT) {
+	fprintf(stderr, "WordContext::Initialize: MIFLUZ_CONFIG could not stat %s\n", (char*)filename);
+	perror("");
+      }
+      filename.trunc();
+    }
+  }
+  //
+  // Check for ~/.mifluz
+  //
+  if(filename.empty()) {
+    const char* home = getenv("HOME");
+    if(home) {
+      filename << home << "/.mifluz";
+      struct stat statbuf;
+      if(stat((char*)filename, &statbuf) < 0) {
+	if(errno != ENOENT) {
+	  fprintf(stderr, "WordContext::Initialize: could not stat %s\n", (char*)filename);
+	  perror("");
+	}
+	filename.trunc();
+      }
} + } + + if(!filename.empty()) + config->Read(filename); + + Initialize(*config); + + if(filename.empty() && !config_defaults) { + delete config; + config = 0; + } + + return config; +} + +void WordContext::Finish() +{ + delete WordType::Instance(); + delete WordKeyInfo::Instance(); + delete WordRecordInfo::Instance(); + delete WordDBInfo::Instance(); + if(WordMonitor::Instance()) delete WordMonitor::Instance(); +} diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordContext.h b/debian/htdig/htdig-3.2.0b6/htword/WordContext.h new file mode 100644 index 00000000..9081175c --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordContext.h @@ -0,0 +1,101 @@ +// +// WordContext.h +// +// NAME +// +// read configuration description and setup mifluz context. +// +// SYNOPSIS +// +// #include <mifluz.h> +// +// Configuration* config = WordContext::Initialize(); +// ... +// WordContext::Finish(); +// +// DESCRIPTION +// +// The WordContext::Initialize() method initialize the global context +// for the mifluz library. All other classes depend on it. It must +// therefore be called before any other <i>mifluz</i> classes are used. +// +// CONFIGURATION +// +// wordlist_monitor {true|false} (default false) +// If true create a <i>WordMonitor</i> instance to gather statistics and +// build reports. +// +// +// ENVIRONMENT +// +// <b>MIFLUZ_CONFIG</b> file name of configuration file read by +// WordContext(3). 
Defaults to <b>~/.mifluz.</b> +// +// END +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordContext.h,v 1.5 2004/05/28 13:15:26 lha Exp $ +// +#ifndef _WordContext_h_ +#define _WordContext_h_ + +#ifndef SWIG +#include "Configuration.h" +#endif /* SWIG */ + +// +// Short hand for calling Initialize for all classes +// Word* that have a single instance (WordType, WordKeyInfo, WordRecordInfo). +// +class WordContext +{ + public: + //- + // Create environment. Must be called before any other class are used. + // + // When calling <b>Initialize</b> a second time, one must ensure + // that all WordList and WordCursor objects have been + // destroyed. WordList and WordCursor internal state depends on the + // current WordContext that will be lost by a second call. + // <br> + // For those interested by the internals, the <b>Initialize</b> function + // maintains a Berkeley DB environment (DB_ENV) in the following way: + // + // First invocation: + // <pre> + // Initialize -> new DB_ENV (thru WordDBInfo) + // </pre> + // + // Second invocation: + // <pre> + // Initialize -> delete DB_ENV -> new DB_ENV (thru WordDBInfo) + // </pre> + // + static void Initialize(Configuration &config); +#ifndef SWIG + //- + // Build a <i>Configuration</i> object from the file pointed to by the + // MIFLUZ_CONFIG environment variable or ~/.mifluz. + // The <b>config_defaults</b> argument, if provided, is passed to + // the <i>Configuration</i> object using the <b>Defaults</b> method. + // The <b>Initialize(const Configuration &)</b> method is then called + // with the <i>Configuration</i> object. + // + // Refer to the <i>Configuration</i> description for more information. 
+ // + // + static Configuration *Initialize(const ConfigDefaults* config_defaults = 0); +#endif /* SWIG */ + //- + // Destroy environment. Must be called after all other <i>mifluz</i> + // objects are destroyed. + // + static void Finish(); +}; + +#endif // _WordContext_h_ diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordCursor.cc b/debian/htdig/htdig-3.2.0b6/htword/WordCursor.cc new file mode 100644 index 00000000..d0980e04 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordCursor.cc @@ -0,0 +1,582 @@ +// +// WordCursor.cc +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordCursor.cc,v 1.4 2004/05/28 13:15:26 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> + +#include "WordCursor.h" +#include "WordStat.h" +#include "WordList.h" + +#include <stdio.h> + +// +// WordCursor implementation +// + +// ***************************************************************************** +// +int WordCursor::Initialize(WordList *nwords, const WordKey &nsearchKey, wordlist_walk_callback_t ncallback, Object *ncallback_data, int naction) +{ + action = naction; + searchKey = nsearchKey; + callback = ncallback; + callback_data = ncallback_data; + words = nwords; + return OK; +} + +// ***************************************************************************** +// +void +WordCursor::Clear() +{ + searchKey.Clear(); + action = 0; + callback = 0; + callback_data = 0; + ClearResult(); + ClearInternal(); + words = 0; + + // + // Debugging section. 
+  //
+  traceRes = 0;
+}
+
+// *****************************************************************************
+//
+// Reset the walk-internal state: close the Berkeley DB cursor, drop the
+// packed key/data buffers and the prefix key, and re-arm the next Get()
+// as a DB_SET_RANGE positioning call.
+void
+WordCursor::ClearInternal()
+{
+  cursor.Close();
+  key.trunc();
+  data.trunc();
+  prefixKey.Clear();
+  cursor_get_flags = DB_SET_RANGE;
+  searchKeyIsSameAsPrefix = 0;
+}
+
+// *****************************************************************************
+//
+// Reset the result state of a previous walk.  NOTE(review): collectRes is
+// only nulled here, not deleted — presumably ownership of the collected
+// List is transferred to the caller elsewhere; verify before changing.
+void
+WordCursor::ClearResult()
+{
+  collectRes = 0;
+  found.Clear();
+  status = OK;
+}
+
+// Restore a walk position previously saved in 'buffer' (a packed WordKey).
+// Seeks to that key, then advances once so the following WalkNext starts
+// strictly after the restored position.  Empty buffer is a no-op (OK).
+int
+WordCursor::ContextRestore(const String& buffer)
+{
+  int ret = OK;
+  if(!buffer.empty()) {
+    WordKey key(buffer);
+    if((ret = Seek(key)) != OK)
+      return ret;
+    //
+    // Move to restored position so that next call to
+    // WalkNext will go above the restored position.
+    //
+    if((ret = WalkNext()) != OK)
+      return ret;
+  }
+  return ret;
+}
+
+// *****************************************************************************
+//
+// Walk and collect data from the word database.
+//
+// If action bit HTDIG_WORDLIST_COLLECTOR is set WordReferences are
+// stored in a list and the list is returned.
+// If action bit HTDIG_WORDLIST_WALKER is set the <callback> function
+// is called for each WordReference found. No list is built and the
+// function returns a null pointer.
+//
+// The <searchKey> argument may be a fully qualified key, containing precise values for each
+// field of the key. It may also contain only some fields of the key. In both cases
+// all the word occurrences matching the fields set in the key are retrieved. It may
+// be fast if key is a prefix (see WordKey::Prefix for a definition). It may
+// be *slow* if key is not a prefix because it forces a complete walk of the
+// index.
+//
+// Convenience driver: WalkInit + WalkNext-until-exhausted + WalkFinish.
+// WORD_WALK_ATEND from WalkNext is the normal termination condition.
+int
+WordCursor::Walk()
+{
+  int ret;
+  if((ret = WalkInit()) != OK) return ret;
+  while((ret = WalkNext()) == OK)
+    ;
+  int ret1;
+  // A failure to close the cursor trumps the walk status.
+  if((ret1 = WalkFinish()) != OK) return ret1;
+
+  return ret == WORD_WALK_ATEND ?
OK : NOTOK; +} + +int +WordCursor::WalkInit() +{ + int ret = OK; + + ClearResult(); + ClearInternal(); + + WordReference wordRef; + + if((ret = cursor.Open(words->db.db)) != 0) + return ret; + + if(words->verbose) fprintf(stderr, "WordCursor::WalkInit: action = %d, SearchKey = %s\n", action, (char*)searchKey.Get()); + + if(action & HTDIG_WORDLIST_COLLECTOR) { + collectRes = new List; + } + + const WordReference& last = WordStat::Last(); + + WordKey first_key; + // + // Move the cursor to start walking and do some sanity checks. + // + if(searchKey.Empty()) { + // + // Move past the stat data + // + if(words->verbose) fprintf(stderr, "WordCursor::WalkInit: at start of keys because search key is empty\n"); + first_key = last.Key(); + + } else { + prefixKey = searchKey; + // + // If the key is a prefix, the start key is + // the longest possible prefix contained in the key. If the + // key does not contain any prefix, start from the beginning + // of the file. + // + if(prefixKey.PrefixOnly() == NOTOK) { + if(words->verbose) fprintf(stderr, "WordCursor::WalkInit: at start of keys because search key is not a prefix\n"); + prefixKey.Clear(); + // + // Move past the stat data + // + first_key = last.Key(); + } else { + if(words->verbose) fprintf(stderr, "WordCursor::WalkInit: go to %s \n", (char*)prefixKey.Get()); + first_key = prefixKey; + } + } + + first_key.Pack(key); + // + // Allow Seek immediately after Init + // + found.Key().CopyFrom(first_key); + + status = OK; + searchKeyIsSameAsPrefix = searchKey.ExactEqual(prefixKey); + cursor_get_flags = DB_SET_RANGE; + + return OK; +} + +int +WordCursor::WalkRewind() +{ + const WordReference& last = WordStat::Last(); + + WordKey first_key; + // + // Move the cursor to start walking and do some sanity checks. + // + if(searchKey.Empty()) { + first_key = last.Key(); + } else { + prefixKey = searchKey; + // + // If the key is a prefix, the start key is + // the longest possible prefix contained in the key. 
If the + // key does not contain any prefix, start from the beginning + // of the file. + // + if(prefixKey.PrefixOnly() == NOTOK) { + prefixKey.Clear(); + // + // Move past the stat data + // + first_key = last.Key(); + } else { + first_key = prefixKey; + } + } + + first_key.Pack(key); + // + // Allow Seek immediately after Rewind + // + found.Key().CopyFrom(first_key); + + status = OK; + searchKeyIsSameAsPrefix = searchKey.ExactEqual(prefixKey); + cursor_get_flags = DB_SET_RANGE; + + return OK; +} + +int +WordCursor::WalkNext() +{ + int ret; + while((ret = WalkNextStep()) == WORD_WALK_NOMATCH_FAILED) + if(words->verbose > 1) fprintf(stderr, "WordCursor::WalkNext: got false match, retry\n"); + + return ret; +} + +int +WordCursor::WalkNextStep() +{ + status = OK; + + { + int error; + if((error = cursor.Get(key, data, cursor_get_flags)) != 0) { + if(error == DB_NOTFOUND) { + if(words->verbose) fprintf(stderr, "WordCursor::WalkNextStep: looking for %s, no more matches\n", (char*)searchKey.Get()); + return (status = WORD_WALK_ATEND); + } else { + return WORD_WALK_GET_FAILED; + } + } + } + + // + // Next step operation is always sequential walk + // + cursor_get_flags = DB_NEXT; + + found.Unpack(key, data); + + if(traceRes) traceRes->Add(new WordReference(found)); + + if(words->verbose > 1) fprintf(stderr, "WordCursor::WalkNextStep: looking for %s, candidate is %s\n", (char*)searchKey.Get(), (char*)found.Get()); + + // + // Don't bother to compare keys if we want to walk all the entries + // + if(!(searchKey.Empty())) { + // examples + // searchKey: aabc 1 ? ? ? + // prefixKey: aabc 1 ? ? ? + + // + // Stop loop if we reach a record whose key does not + // match prefix key requirement, provided we have a valid + // prefix key. + // (ie. stop loop if we're past last possible match...) 
+ // + if(!prefixKey.Empty() && + !prefixKey.Equal(found.Key())) { + if(words->verbose) fprintf(stderr, "WordCursor::WalkNextStep: looking for %s, no more matches because found a key that is greater than searchKey\n", (char*)searchKey.Get()); + return (status = WORD_WALK_ATEND); + } + + // + // Skip entries that do not exactly match the specified key. + // + if(!searchKeyIsSameAsPrefix && + !searchKey.Equal(found.Key())) { + int ret; + switch((ret = SkipUselessSequentialWalking())) { + case OK: + if(words->verbose > 1) fprintf(stderr, "WordCursor::WalkNextStep: looking for %s, false match jump to %s\n", (char*)searchKey.Get(), (char*)found.Get()); + return WORD_WALK_NOMATCH_FAILED; + break; + case WORD_WALK_ATEND: + if(words->verbose) fprintf(stderr, "WordCursor::WalkNextStep: looking for %s, no more matches according to SkipUselessSequentialWalking\n", (char*)searchKey.Get()); + return (status = WORD_WALK_ATEND); + break; + default: + fprintf(stderr, "WordCursor::WalkNextStep: SkipUselessSequentialWalking failed %d\n", ret); + return NOTOK; + break; + } + } + } + + if(words->verbose) fprintf(stderr, "WordCursor::WalkNextStep: looking for %s, found %s\n", (char*)searchKey.Get(), (char*)found.Get()); + + if(collectRes) { + if(words->verbose > 2) fprintf(stderr, "WordCursor::WalkNextStep: collect\n"); + collectRes->Add(new WordReference(found)); + } else if(callback) { + if(words->verbose > 2) fprintf(stderr, "WordCursor::WalkNextStep: calling callback\n"); + int ret = (*callback)(words, cursor, &found, *(callback_data) ); + // + // The callback function tells us that something went wrong, might + // as well stop walking. + // + if(ret != OK) { + if(words->verbose) fprintf(stderr, "WordCursor::WalkNextStep: callback returned NOTOK"); + return WORD_WALK_CALLBACK_FAILED|(status = WORD_WALK_ATEND); + } + } + + return OK; +} + +int +WordCursor::WalkFinish() +{ + if(words->verbose) fprintf(stderr, "WordCursor::WalkFinish\n"); + + return cursor.Close() == 0 ? 
OK : NOTOK; +} + +// ***************************************************************************** +// +// Helper for SkipUselessSequentialWalking. +// Undefine in foundKey all fields defined in searchKey +// so that they are not considered by SetToFollowing. +// It could become a method of WordKey but lacks generalisation and +// from what I see it is a rather specific operation. +// +static inline void complement(WordKey& key, const WordKey& mask) +{ + int nfields = WordKey::NFields(); + int i; + // + // Undefine in 'key' all fields defined in 'mask' + // + for(i = 0; i < nfields; i++) { + if(mask.IsDefined(i)) + key.Undefined(i); + else + key.SetDefined(i); + } + // + // If searching for a prefix, we must allow the word in + // key to increment. + // + if(mask.IsDefinedWordSuffix()) { + key.UndefinedWordSuffix(); + } else { + key.SetDefinedWordSuffix(); + key.SetDefined(0); + } +} + +// ***************************************************************************** +// +// Find out if we should better jump to the next possible key (DB_SET_RANGE) instead of +// sequential iterating (DB_NEXT). +// If it is decided that jump is a better move : +// cursor_set_flags = DB_SET_RANGE +// key = calculated next possible key +// Else +// do nothing +// Return values +// OK: skipping successfull. +// WORD_WALK_ATEND : no more possible match, reached the maximum +// WORD_WALK_FAILED: general failure, occurs if called and no skipping +// necessary. 
+// +// Sequential searching can waste time by searching all keys, for example: +// If searching for Key: argh <DEF> <UNDEF> 10 +// Under normal circonstances we would do the following +// +// DATA STATUS ACTION +// 1: argh 1 10 match DB_NEXT +// 2: argh 2 11 nomatch DB_NEXT +// 3: argh 2 15 nomatch DB_NEXT +// 4: argh 2 20 nomatch DB_NEXT +// 5: argh 2 30 nomatch DB_NEXT +// 6: argh 5 1 nomatch DB_NEXT +// 7: argh 5 8 nomatch DB_NEXT +// 8: argh 8 6 nomatch DB_NEXT +// +// But the optimal would be +// +// DATA STATUS ACTION +// 1: argh 1 10 match DB_NEXT +// 2: argh 2 11 nomatch DB_SET_RANGE argh 3 10 +// 3: argh 2 15 +// 4: argh 2 20 +// 5: argh 2 30 +// 6: argh 5 1 nomatch DB_SET_RANGE argh 5 10 +// 7: argh 5 8 +// 8: argh 8 6 nomatch DB_SET_RANGE argh 8 10 +// +// That saves a lot of unecessary hit. The underlying logic is a bit +// more complex but you have the idea. +// +int +WordCursor::SkipUselessSequentialWalking() +{ + WordKey& foundKey = found.Key(); + + int nfields = WordKey::NFields(); + int i; + + // + // Find out how the searchKey and the foundKey differ. + // + int diff_field = 0; + int lower = 0; + if(!foundKey.Diff(searchKey, diff_field, lower)) { + // + // foundKey matches searchKey (no difference), don't + // skip, everything is fine. The caller of SkipUselessSequentialWalking + // is expected to avoid this case for efficiency. + // + return WORD_WALK_FAILED; + } + + if(words->verbose > 2) fprintf(stderr, "WordCursor::SkipUselessSequentialWalking: looking for %s, candidate is %s\n", (char*)searchKey.Get(), (char*)foundKey.Get()); + + // + // Undefine in foundKey all fields defined in searchKey + // so that they are not considered by SetToFollowing. + // + complement(foundKey, searchKey); + + // + // If the key found is lower than the searched key when + // considering only the fields defined in the search key, + // we only need to enforce the key to get the match. + // Otherwise we need to increment the found key to jump + // properly. 
+ // + if(lower) { + if(words->verbose > 1) fprintf(stderr, "WordCursor::SkipUselessSequentialWalking: enforcing the search constraint is enough to jump forward\n"); + for(i = diff_field + 1; i < nfields; i++) + if(foundKey.IsDefined(i)) foundKey.Set(i, 0); + } else { + if(words->verbose > 1) fprintf(stderr, "WordCursor::SkipUselessSequentialWalking: increment the key to jump forward\n"); + // + // diff_field - 1 is not really necessary because diff_field is undefined + // in foundKey and would therefore be ignored by SetToFollowing. We write + // diff_field - 1 to clearly state that incrementing begins just before the + // field for which a difference was found. + // + int ret; + if((ret = foundKey.SetToFollowing(diff_field - 1)) != OK) + return ret; + } + + // + // Copy all fields defined in searchKey into foundKey. This will copy + // searchKey in foundKey because all these fields have been + // previously undefined in foundKey. + // + foundKey.Merge(searchKey); + + if(words->verbose > 2) fprintf(stderr, "WordCursor::SkipUselessSequentialWalking: looking for %s, jump to %s\n", (char*)searchKey.Get(), (char*)foundKey.Get()); + + // + // Instruct Next function to jump to the calculated key + // + if(foundKey.Pack(key) == NOTOK) { + return WORD_WALK_FAILED; + } + cursor_get_flags = DB_SET_RANGE; + + return OK; +} + +// ***************************************************************************** +// +// Copy defined fields in patch into foundKey and +// initialize internal state so that WalkNext jumps to +// this key next time it's called. +// +// Technically this means : Override latest key found (found data member) +// with patch fields values, starting from the first field set in +// patch up to the last. Pack the result in the key field and set +// cursor_get_flags to DB_SET_RANGE. 
+// +int +WordCursor::Seek(const WordKey& patch) +{ + int nfields = WordKey::NFields(); + WordKey pos = searchKey; + + if(patch.Empty()) { + fprintf(stderr, "WordCursor::Seek: empty patch is useless\n"); + return NOTOK; + } + + int i; + // + // Leave the most significant fields untouched + // + for(i = WORD_FIRSTFIELD; i < nfields; i++) + if(patch.IsDefined(i)) + break; + // + // From the first value set in the patch to the end + // override. + // + for(; i < nfields; i++) { + if(patch.IsDefined(i)) + pos.Set(i, patch.Get(i)); + else + pos.Set(i, 0); + } + + if(!pos.Filled()) { + fprintf(stderr, "WordCursor::Seek: only make sense if the resulting key is fully defined\n"); + return NOTOK; + } + + if(words->verbose > 2) fprintf(stderr, "WordCursor::Seek: seek to %s\n", (char*)pos.Get()); + + // + // Next move will jump to the patched key + // + pos.Pack(key); + cursor_get_flags = DB_SET_RANGE; + + return OK; +} + +int WordCursor::Noccurrence(unsigned int& noccurrence) const +{ + if(!words) { + fprintf(stderr, "WordCursor::Noccurrence: words not set (call Prepare first)\n"); + return NOTOK; + } + return words->Noccurrence(searchKey, noccurrence); +} + +// +// Convert the whole structure to an ascii string description +// +int WordCursor::Get(String& bufferout) const +{ + String tmp; + bufferout.trunc(); + + searchKey.Get(tmp); + bufferout << "Input: searchKey = " << tmp << ", action = " << action << "; Output: collectRes " << (collectRes ? 
"set" : "not set"); + found.Get(tmp); + bufferout << ", found = " << tmp << ", status = " << status; + prefixKey.Get(tmp); + bufferout << "; Internal State: prefixKey = " << tmp << ", cursor_get_flags = " << cursor_get_flags; + + return OK; +} diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordCursor.h b/debian/htdig/htdig-3.2.0b6/htword/WordCursor.h new file mode 100644 index 00000000..ba6e9732 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordCursor.h @@ -0,0 +1,445 @@ +// +// WordList.h +// +// NAME +// +// search specification and results for WordList. +// +// SYNOPSIS +// +// #include <WordList.h> +// +// int callback(WordList *, WordDBCursor& , const WordReference *, Object &) +// { +// ... +// } +// +// Object* data = ... +// +// WordList *words = ...; +// +// WordCursor *search = words->Cursor(callback, data); +// WordCursor *search = words->Cursor(WordKey("word <DEF> <UNDEF> <UNDEF>")); +// WordCursor *search = words->Cursor(WordKey("word <DEF> <UNDEF> <UNDEF>"), callback, data); +// +// ... +// +// if(search->Walk() == NOTOK) bark; +// List* results = search->GetResults(); +// +// if(search->WalkNext() == OK) +// dosomething(search->GetFound()); +// +// DESCRIPTION +// +// WordCursor is an iterator on an inverted index. It is created by +// asking a <i>WordList</i> object with the <i>Cursor.</i> There is +// no other way to create a WordCursor object. +// When the <i>Walk*</i> methods return, +// the WordCursor object contains the result of the search and +// status information that indicates if it reached the end of +// the list (IsAtEnd() method). +// +// The <b>callback</b> function that is called each time a match is +// found takes the following arguments: +// <pre> +// WordList* words pointer to the inverted index handle. +// WordDBCursor& cursor to call Del() and delete the current match +// WordReference* wordRef is the match +// Object& data is the user data provided by the caller when +// search began. 
// </pre>
//
// The <i>WordKey</i> object that specifies the search criterion
// may be used as follows (assuming word is followed by DOCID and
// LOCATION):
//
// Ex1: <b>WordKey("word <DEF> <UNDEF> <UNDEF>")</b> find all occurrences
// of <i>word</i>.
//
// Ex2: <b>WordKey("meet <UNDEF> <UNDEF> <UNDEF>")</b> find all occurrences
// starting with <i>meet</i>, including <i>meeting</i> etc.
//
// Ex3: <b>WordKey("meet <DEF> <UNDEF> 1")</b> find all occurrences of
// <i>meet</i> that occur at LOCATION 1 in any DOCID. This can
// be inefficient since the search has to scan all occurrences
// of <i>meet</i> to find the ones that occur at LOCATION 1.
//
// Ex4: <b>WordKey("meet <DEF> 2 <UNDEF>")</b> find all occurrences of
// <i>meet</i> that occur in DOCID 2, at any location.
//
// Interface functions are virtual so that a derivation of the
// class is possible. Some functions are meant to be used by derived
// classes such as the <b>Initialize</b> function. All data members
// should be accessed using the corresponding accessor if possible.
//
// END
//
// Part of the ht://Dig package <http://www.htdig.org/>
// Copyright (c) 1999-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: WordCursor.h,v 1.4 2004/05/28 13:15:26 lha Exp $
//

#ifndef _WordCursor_h_
#define _WordCursor_h_

#ifndef SWIG
#include "htString.h"
#include "WordKey.h"
#include "WordDB.h"

class WordList;
class WordDBCursor;
#endif /* SWIG */
//
// Possible values of the action argument of WordList::Walk
// check walk function in WordList.cc for info on these:
//
#define HTDIG_WORDLIST_COLLECTOR 0x0001
#define HTDIG_WORDLIST_WALKER 0x0002

#ifndef SWIG
//
// Type of the callback argument in WordCursor
//
typedef int (*wordlist_walk_callback_t)(WordList *, WordDBCursor& , const WordReference *, Object &);
#endif /* SWIG */

//
// Possible values of the status member
//
//
// WalkNext reached the end of the matches
//
#define WORD_WALK_ATEND 0x0001
//
// Failed to acquire Berkeley DB cursor
//
#define WORD_WALK_CURSOR_FAILED 0x0002
//
// Berkeley DB Get operation failed
//
#define WORD_WALK_GET_FAILED 0x0004
//
// Callback function returned NOTOK
//
#define WORD_WALK_CALLBACK_FAILED 0x0008
//
// WalkNextStep hit an entry that does not match the
// searched key.
//
#define WORD_WALK_NOMATCH_FAILED 0x0010
//
// WordCursor contains undefined data
//
#define WORD_WALK_FAILED 0xffffffff

//
// Possible return values of the IsA() method
//
#define WORD_CURSOR 1
#define WORD_CURSORS 2

//
// Wordlist::Walk uses WordCursor for :
// state information : cursor
// search term description
// debug/trace/benchmarking
// search result format description
//
class WordCursor
{
 public:
#ifndef SWIG
  //
  // Private constructor. Creator of the object must then call Initialize()
  // prior to using any other methods.
  //
  WordCursor() { Clear(); }
  //-
  // Private constructor. See WordList::Cursor method with same prototype for
  // description.
  //
  WordCursor(WordList *words, wordlist_walk_callback_t callback, Object * callback_data) { Clear(); Initialize(words, WordKey(), callback, callback_data, HTDIG_WORDLIST_WALKER); }
  //-
  // Private constructor. See WordList::Cursor method with same prototype for
  // description.
  //
  WordCursor(WordList *words, const WordKey &searchKey, int action = HTDIG_WORDLIST_WALKER) { Clear(); Initialize(words, searchKey, 0, 0, action); }
  //-
  // Private constructor. See WordList::Cursor method with same prototype for
  // description.
  //
  WordCursor(WordList *words, const WordKey &searchKey, wordlist_walk_callback_t callback, Object * callback_data) { Clear(); Initialize(words, searchKey, callback, callback_data, HTDIG_WORDLIST_WALKER); }
#endif /* SWIG */
  virtual ~WordCursor() {}
  //-
  // Clear all data in object, set <b>GetResult()</b> data to NULL but
  // do not delete it (the application is responsible for that).
  //
  virtual void Clear();
  virtual void ClearInternal();
  virtual void ClearResult();

  //-
  // Returns the type of the object. May be overloaded by
  // derived classes to differentiate them at runtime.
  // Returns WORD_CURSOR.
  //
  virtual int IsA() const { return WORD_CURSOR; }

  //-
  // Returns true if WalkNext() step entries in strictly increasing
  // order, false if it step entries in random order.
  //
  virtual int Ordered() const { return 1; }

  //-
  // Optimize the cursor before starting a Walk.
  // Returns OK on success, NOTOK otherwise.
  //
  virtual int Optimize() { return OK; }

  //-
  // Save in <b>buffer</b> all the information necessary to resume
  // the walk at the point it left. The ASCII representation of the
  // last key found (GetFound()) is written in <b>buffer</b> using the
  // WordKey::Get method.
  //
  virtual int ContextSave(String& buffer) const { found.Get(buffer); return OK; }
  //-
  // Restore from buffer all the information necessary to
  // resume the walk at the point it left. The <b>buffer</b> is expected
  // to contain an ASCII representation of a WordKey (see WordKey::Set
  // method). A <b>Seek</b> is done on the key and the object is prepared
  // to jump to the next occurrence when <b>WalkNext</b> is called (the
  // cursor_get_flags is set to <i>DB_NEXT.</i>
  //
  virtual int ContextRestore(const String& buffer);

#ifndef SWIG
  //-
  // Walk and collect data from the index.
  // Returns OK on success, NOTOK otherwise.
  //
  virtual int Walk();
#endif /* SWIG */
  //-
  // Must be called before other Walk methods are used.
  // Fill internal state according to input parameters
  // and move before the first matching entry.
  // Returns OK on success, NOTOK otherwise.
  //
  virtual int WalkInit();
  //-
  // Move before the first index matching entry.
  // Returns OK on success, NOTOK otherwise.
  //
  virtual int WalkRewind();
  //-
  // Move to the next matching entry.
  // At end of list, WORD_WALK_ATEND is returned.
  // Returns OK on success, NOTOK otherwise.
  //
  virtual int WalkNext();
#ifndef SWIG
  //-
  // Advance the cursor one step. The entry pointed to by the cursor may
  // or may not match the requirements. Returns OK if entry pointed
  // by cursor matches requirements. Returns NOTOK on
  // failure. Returns WORD_WALK_NOMATCH_FAILED if the current entry
  // does not match requirements, it's safe to call WalkNextStep again
  // until either OK or NOTOK is returned.
  //
  virtual int WalkNextStep();
#endif /* SWIG */
  //-
  // Terminate Walk, free allocated resources.
  // Returns OK on success, NOTOK otherwise.
  //
  virtual int WalkFinish();
  //
  // Find out if cursor should better jump to the next possible key
  // (DB_SET_RANGE) instead of sequential iterating (DB_NEXT). If it
  // is decided that jump is a better move : cursor_set_flags =
  // DB_SET_RANGE key = calculated next possible key Else do nothing
  // Return OK if skipping successful. Returns WORD_WALK_ATEND if no
  // more possible match, reached the maximum. Returns
  // WORD_WALK_FAILED on general failure, occurs if called and no
  // skipping necessary.
  //
  int SkipUselessSequentialWalking();

  //-
  // Move before the inverted index position specified in <b>patch.</b>
  // May only be called after a successful call to the <i>WalkNext</i>
  // or <i>WalkNextStep</i> method.
  // Copy defined fields from <b>patch</b> into a copy of the
  // <i>found</i> data member and
  // initialize internal state so that <i>WalkNext</i> jumps to
  // this key next time it's called (cursor_get_flag set to DB_SET_RANGE).
  // Returns OK if successful, NOTOK otherwise.
  //
  virtual int Seek(const WordKey& patch);

  //-
  // Returns true if cursor is positioned after the last possible
  // match, false otherwise.
  //
  virtual int IsAtEnd() const { return status == WORD_WALK_ATEND; }

  //
  // Accessors for input parameters
  //
  //-
  // Returns the search criterion.
  //
  WordKey& GetSearch() { return searchKey; }
#ifndef SWIG
  const WordKey& GetSearch() const { return searchKey; }
#endif /* SWIG */
  //-
  // Returns the type of action when a matching entry
  // is found.
  //
  int GetAction() const { return action; }
  //
  // Accessors for output parameters
  //
  //-
  // Returns the list of WordReference found. The application
  // is responsible for deallocation of the list.
  //
  List *GetResults() { return collectRes; }
  //-
  // For debugging purposes. Returns the list of WordReference hit
  // during the search
  // process. Some of them match the searched key, some don't.
  // The application is responsible for deallocation of the list.
  //
  List *GetTraces() { return traceRes; }
  //-
  // For debugging purposes. Set the list of WordReference hit
  // during the search process.
  //
  void SetTraces(List* traceRes_arg) { traceRes = traceRes_arg; }
  //-
  // Returns the last entry hit by the search. Only contains
  // a valid value if the last <i>WalkNext</i> or <i>WalkNextStep</i>
  // call was successful (i.e. returned OK).
  //
  const WordReference& GetFound() { return found; }
  //-
  // Returns the number of occurrences of the searched word
  // in the inverted index in the <b>noccurrence</b> parameter.
  // Returns OK on success, NOTOK on failure.
  //
  virtual int Noccurrence(unsigned int& noccurrence) const;

#ifndef SWIG
  //-
  // Convert the whole structure to an ASCII string description
  // Returns OK if successful, NOTOK otherwise.
  //
  virtual int Get(String& bufferout) const;
  String Get() const { String tmp; Get(tmp); return tmp; }

 protected:

  //-
  // Protected method. Derived classes should use this function to initialize
  // the object if they do not call a WordCursor constructor in their own
  // constructor. Initialization may occur after the object is created
  // and must occur before a <b>Walk*</b> method is called. See the
  // DESCRIPTION section for the semantics of the arguments.
  // Return OK on success, NOTOK on error.
  //
  int Initialize(WordList *nwords, const WordKey &nsearchKey, wordlist_walk_callback_t ncallback, Object * ncallback_data, int naction);

  //
  // Input parameters
  //
  //-
  // Input data. The key to be searched, see DESCRIPTION for more information.
  //
  WordKey searchKey;
  //
  // Input data. What do do when a WordReference is found.
  // Can either be
  // HTDIG_WORDLIST_COLLECTOR WordReference found stored in collectRes
  // HTDIG_WORDLIST_WALKER callback is called for each WordReference found
  //
  int action;

  //
  // Input data. Callback function called for each match found.
  //
  wordlist_walk_callback_t callback;
  //
  // Input data. Argument given to callback, contains arbitrary
  // caller defined data.
  //
  Object *callback_data;

  //
  // Output parameters
  //
  //
  // Output data. List of WordReference found in the search.
  //
  List *collectRes;

  //-
  // Output data. Last match found. Use GetFound() to retrieve it.
  //
  WordReference found;
  //-
  // Output data. WORD_WALK_ATEND if cursor is past last match,
  // OK otherwise. Use GetStatus() to retrieve it.
  //
  int status;

  //
  // Debugging section. Do not use unless you know exactly what you do.
  //
  //
  // Collect everything found while searching (not necessarily matching)
  //
  List *traceRes;

  //
  // Internal state
  //
  //
  // The actual Berkeley DB cursor.
  //
  WordDBCursor cursor;
  //
  // The latest retrieved key and data
  //
  String key;
  String data;
  //
  // The shortest prefix key computed from searchKey
  //
  WordKey prefixKey;
  //-
  // WalkNext leap is either DB_NEXT or DB_SET_RANGE.
  //
  int cursor_get_flags;
  //
  // True if search key is a prefix key
  //
  int searchKeyIsSameAsPrefix;
  //-
  // The inverted index used by this cursor.
  //
  WordList *words;
#endif /* SWIG */
};

#endif /* _WordCursor_h_ */
diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordCursorOne.cc b/debian/htdig/htdig-3.2.0b6/htword/WordCursorOne.cc
new file mode 100644
index 00000000..011cfc9e
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htword/WordCursorOne.cc
@@ -0,0 +1,590 @@
//
// WordCursorOne.cc
//
// Part of the ht://Dig package <http://www.htdig.org/>
// Copyright (c) 1999-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: WordCursorOne.cc,v 1.4 2004/05/28 13:15:26 lha Exp $
//

#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */

#include <stdlib.h>

#include "WordCursorOne.h"
#include "WordListOne.h"
#include "WordDead.h"

#include <stdio.h>

//
// WordCursorOne implementation
//

// *****************************************************************************
WordCursorOne::WordCursorOne(WordList *words) :
  WordCursor(words->GetContext()),
  prefixKey(words->GetContext())
{
  Clear();
}

// *****************************************************************************
WordCursorOne::WordCursorOne(WordList *words, wordlist_walk_callback_t callback, Object * callback_data) :
  WordCursor(words->GetContext()),
  prefixKey(words->GetContext())
{
  Clear();
  Initialize(words, WordKey(words->GetContext()), callback, callback_data, HTDIG_WORDLIST_WALKER);
}

// *****************************************************************************
// NOTE(review): repeating the default argument (= HTDIG_WORDLIST_WALKER)
// on an out-of-line definition is ill-formed if the declaration in
// WordCursorOne.h already supplies it -- confirm against the header.
WordCursorOne::WordCursorOne(WordList *words, const WordKey &searchKey, int action = HTDIG_WORDLIST_WALKER) :
  WordCursor(words->GetContext()),
  prefixKey(words->GetContext())
{
  Clear();
  Initialize(words, searchKey, 0, 0, action);
}

// *****************************************************************************
+WordCursorOne::WordCursorOne(WordList *words, const WordKey &searchKey, wordlist_walk_callback_t callback, Object * callback_data) : + WordCursor(words->GetContext()), + prefixKey(words->GetContext()) +{ + Clear(); + Initialize(words, searchKey, callback, callback_data, HTDIG_WORDLIST_WALKER); +} + +// ***************************************************************************** +// +int WordCursorOne::Initialize(WordList *nwords, const WordKey &nsearchKey, wordlist_walk_callback_t ncallback, Object *ncallback_data, int naction) +{ + action = naction; + searchKey = nsearchKey; + callback = ncallback; + callback_data = ncallback_data; + words = nwords; + cursor = ((WordListOne*)nwords)->db->Cursor(); + return OK; +} + +// ***************************************************************************** +// +void +WordCursorOne::Clear() +{ + searchKey.Clear(); + action = 0; + callback = 0; + callback_data = 0; + ClearResult(); + ClearInternal(); + words = 0; + + // + // Debugging section. + // + traceRes = 0; +} + +// ***************************************************************************** +// +void +WordCursorOne::ClearInternal() +{ + key.trunc(); + data.trunc(); + prefixKey.Clear(); + cursor_get_flags = DB_SET_RANGE; + searchKeyIsSameAsPrefix = 0; +} + +// ***************************************************************************** +// +void +WordCursorOne::ClearResult() +{ + collectRes = 0; + found.Clear(); + status = OK; +} + +int +WordCursorOne::ContextRestore(const String& buffer) +{ + int ret = OK; + if(!buffer.empty()) { + WordKey key(words->GetContext(), buffer); + if((ret = Seek(key)) != OK) + return ret; + // + // Move to restored position so that next call to + // WalkNext will go above the restored position. + // + if((ret = WalkNext()) != OK) + return ret; + } + return ret; +} + +// ***************************************************************************** +// +// Walk and collect data from the word database. 
+// +// If action bit HTDIG_WORDLIST_COLLECTOR is set WordReferences are +// stored in a list and the list is returned. +// If action bit HTDIG_WORDLIST_WALKER is set the <callback> function +// is called for each WordReference found. No list is built and the +// function returns a null pointer. +// +// The <searchKey> argument may be a fully qualified key, containing precise values for each +// field of the key. It may also contain only some fields of the key. In both cases +// all the word occurrences matching the fields set in the key are retrieved. It may +// be fast if key is a prefix (see WordKey::Prefix for a definition). It may +// be *slow* if key is not a prefix because it forces a complete walk of the +// index. +// +int +WordCursorOne::Walk() +{ + int ret; + if((ret = WalkInit()) != OK) return ret; + while((ret = WalkNext()) == OK) + ; + int ret1; + if((ret1 = WalkFinish()) != OK) return ret1; + + return ret == WORD_WALK_ATEND ? OK : NOTOK; +} + +int +WordCursorOne::WalkInit() +{ + ClearResult(); + ClearInternal(); + + WordReference wordRef(words->GetContext()); + + { + int ret; + if((ret = cursor->Open()) != 0) + return ret; + } + + if(words->verbose) fprintf(stderr, "WordCursorOne::WalkInit: action = %d, SearchKey = %s\n", action, (char*)searchKey.Get()); + + if(action & HTDIG_WORDLIST_COLLECTOR) { + collectRes = new List; + } + + WordKey first_key(words->GetContext()); + // + // Move the cursor to start walking and do some sanity checks. + // + if(searchKey.Empty()) { + // + // Move past the stat data + // + if(words->verbose) fprintf(stderr, "WordCursorOne::WalkInit: at start of keys because search key is empty\n"); + + } else { + prefixKey = searchKey; + // + // If the key is a prefix, the start key is + // the longest possible prefix contained in the key. If the + // key does not contain any prefix, start from the beginning + // of the file. 
+ // + if(prefixKey.PrefixOnly() == NOTOK) { + if(words->verbose) fprintf(stderr, "WordCursorOne::WalkInit: at start of keys because search key is not a prefix\n"); + prefixKey.Clear(); + } else { + if(words->verbose) fprintf(stderr, "WordCursorOne::WalkInit: go to %s \n", (char*)prefixKey.Get()); + first_key = prefixKey; + } + } + + first_key.Pack(key); + // + // Allow Seek immediately after Init + // + found.Key() = first_key; + + status = OK; + searchKeyIsSameAsPrefix = searchKey.ExactEqual(prefixKey); + cursor_get_flags = DB_SET_RANGE; + + return OK; +} + +int +WordCursorOne::WalkRewind() +{ + WordKey first_key(words->GetContext()); + // + // Move the cursor to start walking and do some sanity checks. + // + if(searchKey.Empty()) { + first_key.Clear(); + } else { + prefixKey = searchKey; + // + // If the key is a prefix, the start key is + // the longest possible prefix contained in the key. If the + // key does not contain any prefix, start from the beginning + // of the file. + // + if(prefixKey.PrefixOnly() == NOTOK) { + prefixKey.Clear(); + first_key.Clear(); + } else { + first_key = prefixKey; + } + } + + first_key.Pack(key); + // + // Allow Seek immediately after Rewind + // + found.Key() = first_key; + + status = OK; + searchKeyIsSameAsPrefix = searchKey.ExactEqual(prefixKey); + cursor_get_flags = DB_SET_RANGE; + + return OK; +} + +int +WordCursorOne::WalkNext() +{ + int ret; + while((ret = WalkNextStep()) == WORD_WALK_NOMATCH_FAILED) + if(words->verbose > 1) fprintf(stderr, "WordCursorOne::WalkNext: got false match, retry\n"); + + return ret; +} + +int +WordCursorOne::WalkNextStep() +{ + status = OK; + + { + int error; + if((error = cursor->Get(key, data, cursor_get_flags)) != 0) { + if(error == DB_NOTFOUND) { + if(words->verbose) fprintf(stderr, "WordCursorOne::WalkNextStep: looking for %s, no more matches\n", (char*)searchKey.Get()); + return (status = WORD_WALK_ATEND); + } else { + return WORD_WALK_GET_FAILED; + } + } + } + + // + // Next step 
operation is always sequential walk + // + cursor_get_flags = DB_NEXT; + + found.Unpack(key, data); + + if(words->Dead()->Exists(found.Key())) + return WORD_WALK_NOMATCH_FAILED; + + if(traceRes) traceRes->Add(new WordReference(found)); + + if(words->verbose > 1) fprintf(stderr, "WordCursorOne::WalkNextStep: looking for %s, candidate is %s\n", (char*)searchKey.Get(), (char*)found.Get()); + + // + // Don't bother to compare keys if we want to walk all the entries + // + if(!(searchKey.Empty())) { + // examples + // searchKey: aabc 1 ? ? ? + // prefixKey: aabc 1 ? ? ? + + // + // Stop loop if we reach a record whose key does not + // match prefix key requirement, provided we have a valid + // prefix key. + // (ie. stop loop if we're past last possible match...) + // + if(!prefixKey.Empty() && + !prefixKey.Equal(found.Key())) { + if(words->verbose) fprintf(stderr, "WordCursorOne::WalkNextStep: looking for %s, no more matches because found a key that is greater than searchKey\n", (char*)searchKey.Get()); + return (status = WORD_WALK_ATEND); + } + + // + // Skip entries that do not exactly match the specified key. 
+ // + if(!searchKeyIsSameAsPrefix && + !searchKey.Equal(found.Key())) { + int ret; + switch((ret = SkipUselessSequentialWalking())) { + case OK: + if(words->verbose > 1) fprintf(stderr, "WordCursorOne::WalkNextStep: looking for %s, false match jump to %s\n", (char*)searchKey.Get(), (char*)found.Get()); + return WORD_WALK_NOMATCH_FAILED; + break; + case WORD_WALK_ATEND: + if(words->verbose) fprintf(stderr, "WordCursorOne::WalkNextStep: looking for %s, no more matches according to SkipUselessSequentialWalking\n", (char*)searchKey.Get()); + return (status = WORD_WALK_ATEND); + break; + default: + fprintf(stderr, "WordCursorOne::WalkNextStep: SkipUselessSequentialWalking failed %d\n", ret); + return NOTOK; + break; + } + } + } + + if(words->verbose) fprintf(stderr, "WordCursorOne::WalkNextStep: looking for %s, found %s\n", (char*)searchKey.Get(), (char*)found.Get()); + + if(collectRes) { + if(words->verbose > 2) fprintf(stderr, "WordCursorOne::WalkNextStep: collect\n"); + collectRes->Add(new WordReference(found)); + } else if(callback) { + if(words->verbose > 2) fprintf(stderr, "WordCursorOne::WalkNextStep: calling callback\n"); + int ret = (*callback)(words, *cursor, &found, *(callback_data) ); + // + // The callback function tells us that something went wrong, might + // as well stop walking. + // + if(ret != OK) { + if(words->verbose) fprintf(stderr, "WordCursorOne::WalkNextStep: callback returned NOTOK"); + return WORD_WALK_CALLBACK_FAILED|(status = WORD_WALK_ATEND); + } + } + + return OK; +} + +int +WordCursorOne::WalkFinish() +{ + if(words->verbose) fprintf(stderr, "WordCursorOne::WalkFinish\n"); + + return cursor->Close() == 0 ? OK : NOTOK; +} + +// ***************************************************************************** +// +// Helper for SkipUselessSequentialWalking. +// Undefine in foundKey all fields defined in searchKey +// so that they are not considered by SetToFollowing. 
+// It could become a method of WordKey but lacks generalisation and
+// from what I see it is a rather specific operation.
+//
+// 'key' has every field that is defined in 'mask' undefined, and every
+// field that is undefined in 'mask' defined (a field-wise complement).
+//
+static inline void complement(WordContext* context, WordKey& key, const WordKey& mask)
+{
+  int nfields = context->GetKeyInfo().nfields;
+  int i;
+  //
+  // Undefine in 'key' all fields defined in 'mask'
+  //
+  for(i = 0; i < nfields; i++) {
+    if(mask.IsDefined(i))
+      key.Undefined(i);
+    else
+      key.SetDefined(i);
+  }
+}
+
+// *****************************************************************************
+//
+// Find out if we should better jump to the next possible key (DB_SET_RANGE) instead of
+// sequential iterating (DB_NEXT).
+// If it is decided that jump is a better move :
+//    cursor_set_flags = DB_SET_RANGE
+//    key = calculated next possible key
+// Else
+//    do nothing
+// Return values
+// OK: skipping successful.
+// WORD_WALK_ATEND : no more possible match, reached the maximum
+// WORD_WALK_FAILED: general failure, occurs if called and no skipping
+//                   necessary.
+//
+// Sequential searching can waste time by searching all keys, for example:
+// If searching for Key: argh <DEF> <UNDEF> 10
+// Under normal circumstances we would do the following
+//
+// DATA            STATUS   ACTION
+// 1: argh 1 10    match    DB_NEXT
+// 2: argh 2 11    nomatch  DB_NEXT
+// 3: argh 2 15    nomatch  DB_NEXT
+// 4: argh 2 20    nomatch  DB_NEXT
+// 5: argh 2 30    nomatch  DB_NEXT
+// 6: argh 5 1     nomatch  DB_NEXT
+// 7: argh 5 8     nomatch  DB_NEXT
+// 8: argh 8 6     nomatch  DB_NEXT
+//
+// But the optimal would be
+//
+// DATA            STATUS   ACTION
+// 1: argh 1 10    match    DB_NEXT
+// 2: argh 2 11    nomatch  DB_SET_RANGE argh 3 10
+// 3: argh 2 15
+// 4: argh 2 20
+// 5: argh 2 30
+// 6: argh 5 1     nomatch  DB_SET_RANGE argh 5 10
+// 7: argh 5 8
+// 8: argh 8 6     nomatch  DB_SET_RANGE argh 8 10
+//
+// That saves a lot of unnecessary hits. The underlying logic is a bit
+// more complex but you have the idea.
+// +int +WordCursorOne::SkipUselessSequentialWalking() +{ + WordKey& foundKey = found.Key(); + + int nfields = words->GetContext()->GetKeyInfo().nfields; + int i; + + // + // Find out how the searchKey and the foundKey differ. + // + int diff_field = 0; + int lower = 0; + if(!foundKey.Diff(searchKey, diff_field, lower)) { + // + // foundKey matches searchKey (no difference), don't + // skip, everything is fine. The caller of SkipUselessSequentialWalking + // is expected to avoid this case for efficiency. + // + return WORD_WALK_FAILED; + } + + if(words->verbose > 2) fprintf(stderr, "WordCursorOne::SkipUselessSequentialWalking: looking for %s, candidate is %s\n", (char*)searchKey.Get(), (char*)foundKey.Get()); + + // + // Undefine in foundKey all fields defined in searchKey + // so that they are not considered by SetToFollowing. + // + complement(words->GetContext(), foundKey, searchKey); + + // + // If the key found is lower than the searched key when + // considering only the fields defined in the search key, + // we only need to enforce the key to get the match. + // Otherwise we need to increment the found key to jump + // properly. + // + if(lower) { + if(words->verbose > 1) fprintf(stderr, "WordCursorOne::SkipUselessSequentialWalking: enforcing the search constraint is enough to jump forward\n"); + for(i = diff_field + 1; i < nfields; i++) + if(foundKey.IsDefined(i)) foundKey.Set(i, 0); + } else { + if(words->verbose > 1) fprintf(stderr, "WordCursorOne::SkipUselessSequentialWalking: increment the key to jump forward\n"); + // + // diff_field - 1 is not really necessary because diff_field is undefined + // in foundKey and would therefore be ignored by SetToFollowing. We write + // diff_field - 1 to clearly state that incrementing begins just before the + // field for which a difference was found. + // + int ret; + if((ret = foundKey.SetToFollowing(diff_field - 1)) != OK) + return ret; + } + + // + // Copy all fields defined in searchKey into foundKey. 
This will copy + // searchKey in foundKey because all these fields have been + // previously undefined in foundKey. + // + foundKey.Merge(searchKey); + + if(words->verbose > 2) fprintf(stderr, "WordCursorOne::SkipUselessSequentialWalking: looking for %s, jump to %s\n", (char*)searchKey.Get(), (char*)foundKey.Get()); + + // + // Instruct Next function to jump to the calculated key + // + if(foundKey.Pack(key) == NOTOK) { + return WORD_WALK_FAILED; + } + cursor_get_flags = DB_SET_RANGE; + + return OK; +} + +// ***************************************************************************** +// +// Copy defined fields in patch into foundKey and +// initialize internal state so that WalkNext jumps to +// this key next time it's called. +// +// Technically this means : Override latest key found (found data member) +// with patch fields values, starting from the first field set in +// patch up to the last. Pack the result in the key field and set +// cursor_get_flags to DB_SET_RANGE. +// +int +WordCursorOne::Seek(const WordKey& patch) +{ + int nfields = words->GetContext()->GetKeyInfo().nfields; + WordKey pos = searchKey; + + if(patch.Empty()) { + fprintf(stderr, "WordCursorOne::Seek: empty patch is useless\n"); + return NOTOK; + } + + int i; + // + // Leave the most significant fields untouched + // + for(i = WORD_KEY_WORD + 1; i < nfields; i++) + if(patch.IsDefined(i)) + break; + // + // From the first value set in the patch to the end + // override. 
+ // + for(; i < nfields; i++) { + if(patch.IsDefined(i)) + pos.Set(i, patch.Get(i)); + else + pos.Set(i, 0); + } + + if(!pos.Filled()) { + fprintf(stderr, "WordCursorOne::Seek: only make sense if the resulting key is fully defined\n"); + return NOTOK; + } + + if(words->verbose > 2) fprintf(stderr, "WordCursorOne::Seek: seek to %s\n", (char*)pos.Get()); + + // + // Next move will jump to the patched key + // + pos.Pack(key); + cursor_get_flags = DB_SET_RANGE; + + return OK; +} + +// +// Convert the whole structure to an ascii string description +// +int WordCursorOne::Get(String& bufferout) const +{ + String tmp; + bufferout.trunc(); + + searchKey.Get(tmp); + bufferout << "Input: searchKey = " << tmp << ", action = " << action << "; Output: collectRes " << (collectRes ? "set" : "not set"); + found.Get(tmp); + bufferout << ", found = " << tmp << ", status = " << status; + prefixKey.Get(tmp); + bufferout << "; Internal State: prefixKey = " << tmp << ", cursor_get_flags = " << cursor_get_flags; + + return OK; +} diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordCursorOne.h b/debian/htdig/htdig-3.2.0b6/htword/WordCursorOne.h new file mode 100644 index 00000000..133ef59c --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordCursorOne.h @@ -0,0 +1,163 @@ +// +// WordCursorOne.h +// +// NAME +// +// search and retrieve entries in a WordListOne object. +// +// SYNOPSIS +// +// #include <WordList.h> +// +// int callback(WordList *, WordDBCursor& , const WordReference *, Object &) +// { +// ... +// } +// +// Object* data = ... +// +// WordList *words = ...; +// +// WordCursor *search = words->Cursor(callback, data); +// WordCursor *search = words->Cursor(WordKey("word <UNDEF> <UNDEF>")); +// WordCursor *search = words->Cursor(WordKey("word <UNDEF> <UNDEF>"), callback, data); +// WordCursor *search = words->Cursor(WordKey()); +// +// ... 
+// +// if(search->Walk() == NOTOK) bark; +// List* results = search->GetResults(); +// +// search->WalkInit(); +// if(search->WalkNext() == OK) +// dosomething(search->GetFound()); +// search->WalkFinish(); +// +// DESCRIPTION +// +// WordCursorOne is a WordCursor derived class that implements search +// in a WordListOne object. It currently is the only derived class of +// the WordCursor object. Most of its behaviour is described in the +// WordCursor manual page, only the behaviour specific to WordCursorOne +// is documented here. +// +// +// END +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordCursorOne.h,v 1.4 2004/05/28 13:15:26 lha Exp $ +// + +#ifndef _WordCursorOne_h_ +#define _WordCursorOne_h_ + +#ifndef SWIG +#include "htString.h" +#include "WordKey.h" +#include "WordDB.h" +#include "WordCursor.h" + +class WordList; +class WordDBCursor; +#endif /* SWIG */ + +class WordCursorOne : public WordCursor +{ + public: +#ifndef SWIG + //- + // Private constructor. Creator of the object must then call Initialize() + // prior to using any other methods. + // + WordCursorOne(WordList *words); + //- + // Private constructor. See WordList::Cursor method with same prototype for + // description. + // + WordCursorOne(WordList *words, wordlist_walk_callback_t callback, Object * callback_data); + //- + // Private constructor. See WordList::Cursor method with same prototype for + // description. + // + WordCursorOne(WordList *words, const WordKey &searchKey, int action = HTDIG_WORDLIST_WALKER); + //- + // Private constructor. See WordList::Cursor method with same prototype for + // description. 
+  //
+  WordCursorOne(WordList *words, const WordKey &searchKey, wordlist_walk_callback_t callback, Object * callback_data);
+#endif /* SWIG */
+  virtual ~WordCursorOne() {
+    if(cursor) delete cursor;
+  }
+  virtual void Clear();
+  virtual void ClearInternal();
+  virtual void ClearResult();
+
+  // Save the latest entry found (packed key + record) so the walk can
+  // later be resumed from the same position with ContextRestore().
+  virtual inline int ContextSave(String& buffer) const { found.Get(buffer); return OK; }
+  virtual int ContextRestore(const String& buffer);
+
+#ifndef SWIG
+  virtual int Walk();
+#endif /* SWIG */
+  virtual int WalkInit();
+  virtual int WalkRewind();
+  virtual int WalkNext();
+#ifndef SWIG
+  virtual int WalkNextStep();
+#endif /* SWIG */
+  virtual int WalkFinish();
+  //
+  // Find out if cursor should better jump to the next possible key
+  // (DB_SET_RANGE) instead of sequential iterating (DB_NEXT). If it
+  // is decided that jump is a better move : cursor_set_flags =
+  // DB_SET_RANGE key = calculated next possible key Else do nothing
+  // Return OK if skipping successful. Returns WORD_WALK_ATEND if no
+  // more possible match, reached the maximum. Returns
+  // WORD_WALK_FAILED on general failure, occurs if called and no
+  // skipping necessary.
+  //
+  int SkipUselessSequentialWalking();
+
+  virtual int Seek(const WordKey& patch);
+
+#ifndef SWIG
+  virtual int Get(String& bufferout) const;
+  inline String Get() const { String tmp; Get(tmp); return tmp; }
+
+ protected:
+
+  int Initialize(WordList *nwords, const WordKey &nsearchKey, wordlist_walk_callback_t ncallback, Object * ncallback_data, int naction);
+
+  //
+  // Internal state
+  //
+  //
+  // The actual Berkeley DB cursor.
+  //
+  WordDBCursor* cursor;
+  //
+  // The latest retrieved key and data
+  //
+  String key;
+  String data;
+  //
+  // The shortest prefix key computed from searchKey
+  //
+  WordKey prefixKey;
+  //
+  // WalkNext leap is either DB_NEXT or DB_SET_RANGE.
+ // + int cursor_get_flags; + // + // True if search key is a prefix key + // + int searchKeyIsSameAsPrefix; +#endif /* SWIG */ +}; + +#endif /* _WordCursorOne_h_ */ diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordDB.cc b/debian/htdig/htdig-3.2.0b6/htword/WordDB.cc new file mode 100644 index 00000000..5718afa5 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordDB.cc @@ -0,0 +1,71 @@ +// +// WordDB.cc +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordDB.cc,v 1.10 2004/05/28 13:15:26 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "defaults.h" +#include "WordDB.h" + +#include "../db/db.h" + +const char* dberror(int errval) { +#define DB_MAX_ERROR (-DB_TXN_CKP + 1) + static const char* dbstr[DB_MAX_ERROR] = { + "", + "DB_INCOMPLETE", + "DB_KEYEMPTY", + "DB_KEYEXISTS", + "DB_LOCK_DEADLOCK", + "DB_LOCK_NOTGRANTED", + "DB_LOCK_NOTHELD", + "DB_NOTFOUND", + "DB_RUNRECOVERY", + "DB_DELETED", + "DB_NEEDSPLIT", + "DB_SWAPBYTES", + "DB_TXN_CKP", + }; + if(errval < 0 && -errval < DB_MAX_ERROR) + return dbstr[-errval]; + else + return strerror(errval); +} + +int WordDB::Open(const String& filename, DBTYPE type, int flags, int mode) { + if(is_open) { + int error = 0; + if((error = Close()) != 0) + return error; + } + + if(!dbenv) { + const char* progname = "WordDB"; + + // + // Environment initialization + // + // Output errors to the application's log. 
+    //
+    db->set_errfile(db, stderr);
+    db->set_errpfx(db, progname);
+
+  }
+
+  int error = db->open(db, filename, NULL, type, (u_int32_t)flags, mode);
+
+  // Only mark the handle open when the underlying db->open succeeded.
+  if(error == 0)
+    is_open = 1;
+
+  return error;
+}
diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordDB.h b/debian/htdig/htdig-3.2.0b6/htword/WordDB.h
new file mode 100644
index 00000000..e48ffc4d
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htword/WordDB.h
@@ -0,0 +1,295 @@
+//
+// WordDB.h
+//
+// WordDB: Interface to Berkeley DB
+// uses String and WordReference instead of Dbt, adds some convenience
+// methods and implements string translation of Berkeley DB error codes.
+// It does not include the 'join' feature.
+// Beside this, the interface is identical to the Db class.
+// The next evolution for this set of classes is to have a single object per
+// application so that they all share the same environment (transactions,
+// shared pool, database directory). This implies a static common object
+// that is referred to by each actual instance of WordDB. The static object
+// holds the DbEnv and DbInfo, the instances of WordDB only have an open
+// descriptor using the same DbEnv and DbInfo.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1999-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: WordDB.h,v 1.7 2004/05/28 13:15:26 lha Exp $
+//
+
+#ifndef _WordDB_h_
+#define _WordDB_h_
+
+#include <stdio.h>
+#include <errno.h>
+
+#include "db.h"
+#include "WordReference.h"
+#include "WordDBInfo.h"
+#include "htString.h"
+
+// Declare a zero-initialized Berkeley DB DBT structure.
+#define WORD_DBT_DCL(v) \
+    DBT v; \
+    memset((char*)&(v), '\0', sizeof(DBT))
+
+// Point an existing DBT at a data buffer of the given size.
+#define WORD_DBT_SET(v,d,s) \
+    v.data = (d); \
+    v.size = (s)
+
+// Declare and initialize a DBT in one step.
+#define WORD_DBT_INIT(v,d,s) \
+    WORD_DBT_DCL(v); \
+    WORD_DBT_SET(v,d,s)
+
+//
+// Encapsulate the Berkeley DB DB type
+//
+// Implements the same methods with String instead of Dbt.
+//
+// Add convenience methods taking WordReference instead of String
+//
+// The error model is *not* to use exceptions.
+//
+// To get a cursor use the Open method of WordDBCursor. I find this
+// more convenient than getting a cursor from WordDB.
+//
+// The WordDB has DbInfo and DbEnv members that can be set before
+// calling Open to configure it.
+// +class WordDB { + public: + inline WordDB() { Alloc(); } + inline ~WordDB() { Dealloc(); } + + inline int Alloc() { + db = 0; + is_open = 0; + dbenv = WordDBInfo::Instance()->dbenv; + return CDB_db_create(&db, dbenv, 0); + } + + inline int Dealloc() { + int error = 0; + is_open = 0; + if(db) + error = db->close(db, 0); + else + fprintf(stderr, "WordDB::Dealloc: null db\n"); + dbenv = 0; + db = 0; + return error; + } + + int Open(const String& filename, DBTYPE type, int flags, int mode); + + inline int Close() { + int error; + if((error = Dealloc()) != 0) + return error; + return Alloc(); + } + + inline int Fd(int *fdp) { + if(!is_open) return DB_UNKNOWN; + return db->fd(db, fdp); + } + + inline int Stat(void *sp, void *(*db_malloc)(size_t), int flags) { + if(!is_open) return DB_UNKNOWN; + return db->stat(db, sp, db_malloc, (u_int32_t) flags); + } + + inline int Sync(int flags) { + if(!is_open) return DB_UNKNOWN; + return db->sync(db, (u_int32_t) flags); + } + + inline int get_byteswapped() const { + if(!is_open) return DB_UNKNOWN; + return db->get_byteswapped(db); + } + + inline DBTYPE get_type() const { + if(!is_open) return DB_UNKNOWN; + return db->get_type(db); + } + + // + // String arguments + // + inline int Put(DB_TXN *txn, const String& key, const String& data, int flags) { + WORD_DBT_INIT(rkey, (void*)key.get(), key.length()); + WORD_DBT_INIT(rdata, (void*)data.get(), data.length()); + + return db->put(db, txn, &rkey, &rdata, flags); + } + + inline int Get(DB_TXN *txn, String& key, String& data, int flags) const { + WORD_DBT_INIT(rkey, (void*)key.get(), (u_int32_t)key.length()); + WORD_DBT_INIT(rdata, (void*)data.get(), (u_int32_t)data.length()); + + int error; + if((error = db->get(db, txn, &rkey, &rdata, 0)) != 0) { + if(error != DB_NOTFOUND) + fprintf(stderr, "WordDB::Get(%s,%s) using %d failed %s\n", (char*)key, (char*)data, flags, CDB_db_strerror(error)); + } else { + // + // Only set arguments if found something. 
+ // + key.set((const char*)rkey.data, (int)rkey.size); + data.set((const char*)rdata.data, (int)rdata.size); + } + + return error; + } + + inline int Del(DB_TXN *txn, const String& key) { + WORD_DBT_INIT(rkey, (void*)key.get(), (u_int32_t)key.length()); + + return db->del(db, txn, &rkey, 0); + } + + // + // WordReference argument + // + inline int Put(const WordReference& wordRef, int flags) { + if(!is_open) return DB_UNKNOWN; + + int ret; + String key; + String record; + + if((ret = wordRef.Pack(key, record)) != OK) return DB_RUNRECOVERY; + + return Put(0, key, record, flags); + } + + inline int Del(const WordReference& wordRef) { + String key; + + wordRef.Key().Pack(key); + + return Del(0, key); + } + + // + // Search entry matching wkey exactly, return key and data + // in wordRef. + // + inline int Get(WordReference& wordRef) const { + if(!is_open) return DB_UNKNOWN; + + String data; + String key; + + if(wordRef.Key().Pack(key) != OK) return DB_RUNRECOVERY; + + int ret; + if((ret = Get(0, key, data, 0)) != 0) + return ret; + + return wordRef.Unpack(key, data) == OK ? 0 : DB_RUNRECOVERY; + } + + // + // Returns 0 of the key of wordRef matches an entry in the database. + // Could be implemented with Get but is not because we don't + // need to build a wordRef with the entry found in the base. 
+ // + inline int Exists(const WordReference& wordRef) const { + if(!is_open) return DB_UNKNOWN; + + String key; + String data; + + if(wordRef.Key().Pack(key) != OK) return DB_RUNRECOVERY; + + return Get(0, key, data, 0); + } + + // + // Accessors + // + inline int set_bt_compare(int (*compare)(const DBT *, const DBT *)) { + return db->set_bt_compare(db, compare); + } + + inline int set_pagesize(u_int32_t pagesize) { + return db->set_pagesize(db, pagesize); + } + + // + // Accessors for description of the compression scheme + // + inline DB_CMPR_INFO* CmprInfo() { return dbenv->mp_cmpr_info; } + inline void CmprInfo(DB_CMPR_INFO* info) { dbenv->mp_cmpr_info = info; } + + int is_open; + DB* db; + DB_ENV* dbenv; +}; + +// +// Interface to DBC that uses String instead of DBT +// +class WordDBCursor { + public: + inline WordDBCursor() { cursor = 0; } + inline ~WordDBCursor() { + Close(); + } + + inline int Open(DB* db) { + Close(); + return db->cursor(db, 0, &cursor, 0); + } + + inline int Close() { + if(cursor) cursor->c_close(cursor); + cursor = 0; + return 0; + } + + // + // String arguments + // + inline int Get(String& key, String& data, int flags) { + WORD_DBT_DCL(rkey); + WORD_DBT_DCL(rdata); + switch(flags & DB_OPFLAGS_MASK) { + case DB_SET_RANGE: + case DB_SET: + case DB_GET_BOTH: + WORD_DBT_SET(rkey, (void*)key.get(), key.length()); + break; + } + int error; + if((error = cursor->c_get(cursor, &rkey, &rdata, (u_int32_t)flags)) != 0) { + if(error != DB_NOTFOUND) + fprintf(stderr, "WordDBCursor::Get(%d) failed %s\n", flags, CDB_db_strerror(error)); + } else { + key.set((const char*)rkey.data, (int)rkey.size); + data.set((const char*)rdata.data, (int)rdata.size); + } + return error; + } + + inline int Put(const String& key, const String& data, int flags) { + WORD_DBT_INIT(rkey, (void*)key.get(), (size_t)key.length()); + WORD_DBT_INIT(rdata, (void*)data.get(), (size_t)data.length()); + return cursor->c_put(cursor, &rkey, &rdata, (u_int32_t)flags); + } + + inline 
int Del() { + return cursor->c_del(cursor, (u_int32_t)0); + } + +private: + DBC* cursor; +}; + +#endif /* _WordDB_h */ diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordDBCache.cc b/debian/htdig/htdig-3.2.0b6/htword/WordDBCache.cc new file mode 100644 index 00000000..2f7a988a --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordDBCache.cc @@ -0,0 +1,411 @@ +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <stdio.h> + +#ifndef _MSC_VER /* _WIN32 */ +#include <unistd.h> +#endif + +#include <stdlib.h> +#include <sys/stat.h> + +#include "WordKey.h" +#include "WordDB.h" +#include "WordDBCache.h" +#include "WordMeta.h" +#include "ber.h" + +int WordDBCaches::Add(char* key, int key_size, char* data, int data_size) +{ + int ret; + if((ret = cache.Allocate(key_size + data_size)) == ENOMEM) { + if((ret = CacheFlush()) != 0) return ret; + if((ret = cache.Allocate(key_size + data_size))) return ret; + } + + return cache.Add(key, key_size, data, data_size); +} + +int WordDBCaches::AddFile(String& filename) +{ + char tmp[32]; + unsigned int serial; + words->Meta()->Serial(WORD_META_SERIAL_FILE, serial); + if(serial == WORD_META_SERIAL_INVALID) + return NOTOK; + filename = words->Filename(); + sprintf(tmp, "C%08d", serial - 1); + filename << tmp; + + String dummy; + if(files->Put(0, filename, dummy, 0) != 0) + return NOTOK; + + size = (cache.GetMax() / 1024) * serial; + + return OK; +} + +int WordDBCaches::CacheFlush() +{ + if(cache.Empty()) return OK; + + if(cache.Sort() != OK) return NOTOK; + String filename; + int locking = 0; + if(!lock) { + words->Meta()->Lock("cache", lock); + locking = 1; + } + if(AddFile(filename) != OK) return NOTOK; + 
if(CacheWrite(filename) != OK) return NOTOK; + + unsigned int serial; + words->Meta()->GetSerial(WORD_META_SERIAL_FILE, serial); + if(serial >= (unsigned int)file_max || Full()) + if(Merge() != OK) return NOTOK; + if(locking) words->Meta()->Unlock("cache", lock); + + return OK; +} + +static int merge_cmp_size(WordDBCaches* , WordDBCacheFile* a, WordDBCacheFile* b) +{ + return b->size - a->size; +} + +int WordDBCaches::Merge() +{ + if(CacheFlush() != OK) return NOTOK; + + int locking = 0; + if(!lock) { + words->Meta()->Lock("cache", lock); + locking = 1; + } + unsigned int serial; + words->Meta()->GetSerial(WORD_META_SERIAL_FILE, serial); + if(serial <= 1) return OK; + + // + // heap lists all the files in decreasing size order (biggest first) + // + WordDBCacheFile* heap = new WordDBCacheFile[serial]; + { + String filename; + String dummy; + WordDBCursor* cursor = files->Cursor(); + struct stat stat_buf; + int i; + int ret; + for(i = 0; (ret = cursor->Get(filename, dummy, DB_NEXT)) == 0; i++) { + WordDBCacheFile& file = heap[i]; + file.filename = filename; + if(stat((char*)file.filename, &stat_buf) == 0) { + file.size = stat_buf.st_size; + } else { + const String message = String("WordDBCaches::Merge: cannot stat ") + file.filename; + perror((const char*)message); + return NOTOK; + } + cursor->Del(); + } + delete cursor; + myqsort((void*)heap, serial, sizeof(WordDBCacheFile), (myqsort_cmp)merge_cmp_size, (void*)this); + } + + String tmpname = words->Filename() + String("C.tmp"); + + while(serial > 1) { + WordDBCacheFile* a = &heap[serial - 1]; + WordDBCacheFile* b = &heap[serial - 2]; + + if(Merge(a->filename, b->filename, tmpname) != OK) return NOTOK; + + // + // Remove file a + // + if(unlink((char*)a->filename) != 0) { + const String message = String("WordDBCaches::Merge: unlink ") + a->filename; + perror((const char*)message); + return NOTOK; + } + + // + // Remove file b + // + if(unlink((char*)b->filename) != 0) { + const String message = 
String("WordDBCaches::Merge: unlink ") + b->filename; + perror((const char*)message); + return NOTOK; + } + + // + // Rename tmp file into file b + // + if(rename((char*)tmpname, (char*)b->filename) != 0) { + const String message = String("WordDBCaches::Merge: rename ") + tmpname + String(" ") + b->filename; + perror((const char*)message); + return NOTOK; + } + + // + // Update b file size. The size need not be accurate number as long + // as it reflects the relative size of each file. + // + b->size += a->size; + + serial--; + // + // update heap + // + myqsort((void*)heap, serial, sizeof(WordDBCacheFile), (myqsort_cmp)merge_cmp_size, (void*)this); + } + + { + String newname(words->Filename()); + newname << "C00000000"; + + if(rename((char*)heap[0].filename, (char*)newname) != 0) { + const String message = String("WordDBCaches::Merge: rename ") + heap[0].filename + String(" ") + newname; + perror((const char*)message); + return NOTOK; + } + + String dummy; + if(files->Put(0, newname, dummy, 0) != 0) + return NOTOK; + words->Meta()->SetSerial(WORD_META_SERIAL_FILE, serial); + } + if(locking) words->Meta()->Unlock("cache", lock); + + return OK; +} + +int WordDBCaches::Merge(const String& filea, const String& fileb, const String& tmpname) +{ + FILE* ftmp = fopen((const char*)tmpname, "w"); + FILE* fa = fopen((const char*)filea, "r"); + FILE* fb = fopen((const char*)fileb, "r"); + + unsigned int buffertmp_size = 128; + unsigned char* buffertmp = (unsigned char*)malloc(buffertmp_size); + unsigned int buffera_size = 128; + unsigned char* buffera = (unsigned char*)malloc(buffera_size); + unsigned int bufferb_size = 128; + unsigned char* bufferb = (unsigned char*)malloc(bufferb_size); + + unsigned int entriesa_length; + if(ber_file2value(fa, entriesa_length) < 1) return NOTOK; + unsigned int entriesb_length; + if(ber_file2value(fb, entriesb_length) < 1) return NOTOK; + + if(ber_value2file(ftmp, entriesa_length + entriesb_length) < 1) return NOTOK; + + WordDBCacheEntry 
entrya; + WordDBCacheEntry entryb; + + if(entriesa_length > 0 && entriesb_length > 0) { + + if(ReadEntry(fa, entrya, buffera, buffera_size) != OK) return NOTOK; + if(ReadEntry(fb, entryb, bufferb, bufferb_size) != OK) return NOTOK; + + while(entriesa_length > 0 && entriesb_length > 0) { + if(WordKey::Compare(words->GetContext(), (const unsigned char*)entrya.key, entrya.key_size, (const unsigned char*)entryb.key, entryb.key_size) < 0) { + if(WriteEntry(ftmp, entrya, buffertmp, buffertmp_size) != OK) return NOTOK; + if(--entriesa_length > 0) + if(ReadEntry(fa, entrya, buffera, buffera_size) != OK) return NOTOK; + } else { + if(WriteEntry(ftmp, entryb, buffertmp, buffertmp_size) != OK) return NOTOK; + if(--entriesb_length > 0) + if(ReadEntry(fb, entryb, bufferb, bufferb_size) != OK) return NOTOK; + } + } + } + + if(entriesa_length > 0 || entriesb_length > 0) { + FILE* fp = entriesa_length > 0 ? fa : fb; + unsigned int& entries_length = entriesa_length > 0 ? entriesa_length : entriesb_length; + WordDBCacheEntry& entry = entriesa_length > 0 ? 
entrya : entryb; + while(entries_length > 0) { + if(WriteEntry(ftmp, entry, buffertmp, buffertmp_size) != OK) return NOTOK; + if(--entries_length > 0) + if(ReadEntry(fp, entry, buffera, buffera_size) != OK) return NOTOK; + } + } + + free(buffera); + free(bufferb); + free(buffertmp); + + fclose(fa); + fclose(fb); + fclose(ftmp); + + return OK; +} + +int WordDBCaches::Merge(WordDB& db) +{ + int locking = 0; + if(!lock) { + words->Meta()->Lock("cache", lock); + locking = 1; + } + if(Merge() != OK) return NOTOK; + + String filename; + String dummy; + WordDBCursor* cursor = files->Cursor(); + if(cursor->Get(filename, dummy, DB_FIRST) != 0) { + delete cursor; + return NOTOK; + } + cursor->Del(); + delete cursor; + + FILE* fp = fopen((char*)filename, "r"); + + unsigned int buffer_size = 128; + unsigned char* buffer = (unsigned char*)malloc(buffer_size); + + unsigned int entries_length; + if(ber_file2value(fp, entries_length) < 1) return NOTOK; + + WordDBCacheEntry entry; + + unsigned int i; + for(i = 0; i < entries_length; i++) { + if(ReadEntry(fp, entry, buffer, buffer_size) != OK) return NOTOK; + void* user_data = words->GetContext(); + WORD_DBT_INIT(rkey, (void*)entry.key, entry.key_size); + WORD_DBT_INIT(rdata, (void*)entry.data, entry.data_size); + db.db->put(db.db, 0, &rkey, &rdata, 0); + } + + if(unlink((char*)filename) != 0) { + const String message = String("WordDBCaches::Merge: unlink ") + filename; + perror((const char*)message); + return NOTOK; + } + + words->Meta()->SetSerial(WORD_META_SERIAL_FILE, 0); + if(locking) words->Meta()->Unlock("cache", lock); + size = 0; + free(buffer); + fclose(fp); + + return OK; +} + +int WordDBCaches::CacheWrite(const String& filename) +{ + FILE* fp = fopen(filename, "w"); + if(!fp) { + String message; + message << "WordDBCaches::CacheWrite()" << filename << "): "; + perror((char*)message); + return NOTOK; + } + + int entries_length; + WordDBCacheEntry* entries; + int ret; + if((ret = cache.Entries(entries, entries_length)) != 
0) + return ret; + + if(ber_value2file(fp, entries_length) < 1) return NOTOK; + + unsigned int buffer_size = 1024; + unsigned char* buffer = (unsigned char*)malloc(buffer_size); + int i; + for(i = 0; i < entries_length; i++) { + if(WriteEntry(fp, entries[i], buffer, buffer_size) != OK) return NOTOK; + } + free(buffer); + fclose(fp); + + cache.Flush(); + + return OK; +} + +int WordDBCaches::WriteEntry(FILE* fp, WordDBCacheEntry& entry, unsigned char*& buffer, unsigned int& buffer_size) +{ + if(entry.key_size + entry.data_size + 64 > buffer_size) { + buffer_size = entry.key_size + entry.data_size + 64; + buffer = (unsigned char*)realloc(buffer, buffer_size); + } + + int p_size = buffer_size; + unsigned char* p = buffer; + + int ber_len; + if((ber_len = ber_value2buf(p, p_size, entry.key_size)) < 1) { + fprintf(stderr, "WordDBCaches::WriteEntry: BER failed for key %d\n", entry.key_size); + return NOTOK; + } + p += ber_len; + memcpy(p, entry.key, entry.key_size); + p += entry.key_size; + + p_size -= ber_len + entry.key_size; + + if((ber_len = ber_value2buf(p, p_size, entry.data_size)) < 1) { + fprintf(stderr, "WordDBCaches::WriteEntry: BER failed for data %d\n", entry.data_size); + return NOTOK; + } + p += ber_len; + memcpy(p, entry.data, entry.data_size); + p += entry.data_size; + + if(fwrite((void*)buffer, p - buffer, 1, fp) != 1) { + perror("WordDBCaches::WriteEntry: cannot write entry "); + return NOTOK; + } + + return OK; +} + +int WordDBCaches::ReadEntry(FILE* fp, WordDBCacheEntry& entry, unsigned char*& buffer, unsigned int& buffer_size) +{ + if(ber_file2value(fp, entry.key_size) < 1) return NOTOK; + + if(entry.key_size > buffer_size) { + buffer_size += entry.key_size; + if(!(buffer = (unsigned char*)realloc(buffer, buffer_size))) return NOTOK; + } + + if(fread((void*)buffer, entry.key_size, 1, fp) != 1) { + perror("WordDBCaches::ReadEntry(): cannot read key entry "); + return NOTOK; + } + + if(ber_file2value(fp, entry.data_size) < 1) return NOTOK; + + 
if(entry.data_size > 0) { + if(entry.data_size + entry.key_size > buffer_size) { + buffer_size += entry.data_size; + if(!(buffer = (unsigned char*)realloc(buffer, buffer_size))) return NOTOK; + } + + if(fread((void*)(buffer + entry.key_size), entry.data_size, 1, fp) != 1) { + perror("WordDBCaches::ReadEntry(): cannot read data entry "); + return NOTOK; + } + } + + entry.key = (char*)buffer; + entry.data = (char*)(buffer + entry.key_size); + + return OK; +} diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordDBCache.h b/debian/htdig/htdig-3.2.0b6/htword/WordDBCache.h new file mode 100644 index 00000000..c4c0a2e3 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordDBCache.h @@ -0,0 +1,267 @@ +// +// WordDBCache.h +// +// NAME +// intermediate cache for WordList objects. +// +// SYNOPSIS +// +// Internal helper for the WordListOne object. +// +// DESCRIPTION +// +// To speed up bulk insertions, the WordDBCache allows them to remain in +// memory as long as a given limit is not reached. The inserted entries +// are them sorted and dumped into a file. When a given number of files +// have been produced, they are merged into one. Eventually the resulting +// list of entries is inserted into the WordList index. 
+// +// +// END +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordDBCache.h,v 1.4 2004/05/28 13:15:26 lha Exp $ +// + +#ifndef _WordDBCache_h_ +#define _WordDBCache_h_ + +#include <stdlib.h> +#include <errno.h> + +#include "htString.h" +#include "List.h" +#include "db.h" +#include "lib.h" +#include "myqsort.h" +#include "WordList.h" + +class WordDB; +class WordLock; + +// +// Minimum size of the pulsing cache +// +#define WORD_DB_CACHE_MINIMUM (500 * 1024) + +// +// We could use DBT instead but it's more than two times bigger and +// time saving by the most efficient way of memory space is the whole +// point of the cache. +// +class WordDBCacheEntry { +public: + char* key; + unsigned int key_size; + char* data; + unsigned int data_size; +}; + +class WordDBCache { +public: + inline WordDBCache(WordContext* ncontext) { + context = ncontext; + + entries = (WordDBCacheEntry*)malloc(1000 * sizeof(WordDBCacheEntry)); + entries_length = 0; + entries_size = 1000; + + pool = (char*)malloc(WORD_DB_CACHE_MINIMUM); + pool_length = 0; + pool_size = pool_max = WORD_DB_CACHE_MINIMUM; + } + + inline ~WordDBCache() { + if(pool_length > 0) { + fprintf(stderr, "WordDBCache::~WordDBCache: destructor called and cache not empty\n"); + } + free(entries); + free(pool); + } + + inline int ResizeEntries() { + entries_size *= 2; + entries = (WordDBCacheEntry*)realloc(entries, entries_size * sizeof(WordDBCacheEntry)); + return entries ? 0 : DB_RUNRECOVERY; + } + + inline int ResizePool(int wanted) { + if(pool_size * 2 > pool_max) { + if(pool_max > pool_size && pool_max > wanted) + pool_size = pool_max; + else + return ENOMEM; + } else { + pool_size *= 2; + } + pool = (char*)realloc(pool, pool_size); + return pool ? 
0 : DB_RUNRECOVERY; + } + + inline int Allocate(int size) { + int ret; + if(entries_length >= entries_size) + if((ret = ResizeEntries()) != 0) + return ret; + if(pool_length + size >= pool_size) { + if((ret = ResizePool(pool_length + size)) != 0) + return ret; + } + return 0; + } + + inline int GetMax() const { return pool_max; } + + inline int SetMax(int max) { + if(max > pool_max) + pool_max = max; + return 0; + } + + inline int SetCompare(int (*ncompare)(WordContext *, const WordDBCacheEntry *, const WordDBCacheEntry *)) { + compare = ncompare; + return 0; + } + + inline int Sort() { + if(Absolute() != OK) return NOTOK; + // + // Reorder entries in increasing order + // + myqsort((void*)entries, entries_length, sizeof(WordDBCacheEntry), (myqsort_cmp)compare, (void*)context); + return 0; + } + + inline int Relative() { + int i; + for(i = 0; i < entries_length; i++) { + entries[i].key = (char*)(entries[i].key - pool); + entries[i].data = (char*)(entries[i].data - pool); + } + return OK; + } + + inline int Absolute() { + int i; + for(i = 0; i < entries_length; i++) { + entries[i].key = pool + (int)(entries[i].key); + entries[i].data = pool + (int)(entries[i].data); + } + return OK; + } + + inline int Entries(WordDBCacheEntry*& nentries, int& nentries_length) { + nentries = entries; + nentries_length = entries_length; + return 0; + } + + inline int Pool(char*& npool, int& npool_length) { + npool = pool; + npool_length = pool_length; + return OK; + } + + inline int Add(char* key, int key_size, char* data, int data_size) { + int ret; + if((ret = Allocate(key_size + data_size)) != 0) + return ret; + + entries[entries_length].key = (char*)pool_length; + entries[entries_length].key_size = key_size; + entries[entries_length].data = (char*)(pool_length + key_size); + entries[entries_length].data_size = data_size; + entries_length++; + memcpy(pool + pool_length, key, key_size); + memcpy(pool + pool_length + key_size, data, data_size); + pool_length += key_size + data_size; 
+ + return 0; + } + + inline int Flush() { + entries_length = 0; + pool_length = 0; + return 0; + } + + inline int Empty() { + return entries_length <= 0; + } + +private: + WordDBCacheEntry* entries; + int entries_length; + int entries_size; + + char* pool; + int pool_length; + int pool_size; + int pool_max; + + int (*compare)(WordContext *, const WordDBCacheEntry *, const WordDBCacheEntry *); + WordContext *context; +}; + +class WordDBCacheFile : public Object +{ +public: + WordDBCacheFile() { size = 0; } + + String filename; + unsigned int size; +}; + +class WordDBCaches { + public: + inline WordDBCaches(WordList* nwords, int nfile_max, int size_hint, int nsize_max) : cache(nwords->GetContext()) { + words = nwords; + + files = new WordDB(words->GetContext()->GetDBInfo()); + files->Open(words->Filename(), "tmp", DB_BTREE, words->Flags(), 0666, WORD_DB_FILES); + file_max = nfile_max; + size_max = nsize_max; + lock = 0; + + cache.SetMax(size_hint / 2); + } + + ~WordDBCaches() { + delete files; + } + + int Full() const { return size_max > 0 ? 
size >= size_max : 0; } + + int Add(char* key, int key_size, char* data, int data_size); + int AddFile(String& filename); + + int CacheFlush(); + + int Merge(); + int Merge(const String& filea, const String& fileb, const String& tmpname); + int Merge(WordDB& db); + + int CacheWrite(const String& filename); + int CacheCompare(int (*compare)(WordContext *, const WordDBCacheEntry *, const WordDBCacheEntry *)) { cache.SetCompare(compare); return OK; } + + int WriteEntry(FILE* fp, WordDBCacheEntry& entry, unsigned char*& buffer, unsigned int& buffer_size); + int ReadEntry(FILE* fp, WordDBCacheEntry& entry, unsigned char*& buffer, unsigned int& buffer_size); + + private: + WordList* words; + + WordDB* files; + int file_max; + int size_max; + int size; + + WordLock* lock; + WordDBCache cache; +}; + +#endif /* _WordDBCache_h */ diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordDBCompress.cc b/debian/htdig/htdig-3.2.0b6/htword/WordDBCompress.cc new file mode 100644 index 00000000..4fe9f738 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordDBCompress.cc @@ -0,0 +1,175 @@ +// +// WordDBCompress.h +// +// WordDBCompress: Implements specific compression scheme for +// Berkeley DB pages containing WordReferences objects. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordDBCompress.cc,v 1.7 2004/05/28 13:15:26 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <ctype.h> + +#include "WordDBPage.h" +#include "WordDBCompress.h" +#include "WordBitCompress.h" + +/* + * WordDBCompress: C-callbacks, actually called by Berkeley-DB + * they just call their WordDBCompress equivalents (by using user_data) + */ +extern "C" +{ + +static int WordDBCompress_compress_c(const u_int8_t* inbuff, int inbuff_length, u_int8_t** outbuffp, int* outbuff_lengthp, void *user_data) +{ + if(!user_data) { + fprintf(stderr, "WordDBCompress_compress_c:: user_data is NULL"); + return NOTOK; + } + return ((WordDBCompress *)user_data)->Compress((unsigned char*)inbuff, inbuff_length, (unsigned char**)outbuffp, outbuff_lengthp); +} + +static int WordDBCompress_uncompress_c(const u_int8_t* inbuff, int inbuff_length, u_int8_t* outbuff, int outbuff_length, void *user_data) +{ + if(!user_data) { + fprintf(stderr, "WordDBCompress_uncompress_c:: user_data is NULL"); + return NOTOK; + } + return ((WordDBCompress *)user_data)->Uncompress((unsigned char *)inbuff, inbuff_length, (unsigned char*)outbuff, outbuff_length); +} + +} + +// *********************************************** +// *********** WordDBCompress ******************* +// *********************************************** + +WordDBCompress::WordDBCompress() +{ + + cmprInfo = 0; + + // + // DEBUGING / BENCHMARKING + // + debug = 0; + + //zlib WordDB Compression + use_zlib = 0; + zlib_level = 0; + +} + + +WordDBCompress::WordDBCompress(int zlib, int level) +{ + + cmprInfo = 0; + + // + // DEBUGING / BENCHMARKING + // + debug = 0; + + //zlib WordDB Compression + 
use_zlib = zlib; + zlib_level = level; +} + + +DB_CMPR_INFO* WordDBCompress::CmprInfo() +{ + + DB_CMPR_INFO *cmpr_info = new DB_CMPR_INFO; + + cmpr_info->user_data = (void *)this; + cmpr_info->compress = WordDBCompress_compress_c; + cmpr_info->uncompress = WordDBCompress_uncompress_c; + cmpr_info->coefficient = 3; // reduce page size by factor of 1<<3 = 8 + cmpr_info->max_npages = 9; + + if(use_zlib == 1) + cmpr_info->zlib_flags = zlib_level; + else + cmpr_info->zlib_flags = 0; + + cmprInfo = cmpr_info; + + return cmpr_info; +} + +int +WordDBCompress::Compress(const u_int8_t *inbuff, int inbuff_length, u_int8_t **outbuffp, int *outbuff_lengthp) +{ + WordDBPage pg(inbuff, inbuff_length); + + if(debug > 2) { + printf("########################### WordDBCompress::Compress: #################################################\n"); + pg.show(); + printf("~~~~~~~~~~~~~\n"); + } + + if(debug) TestCompress(inbuff, inbuff_length); + + Compressor *res = pg.Compress(0, cmprInfo); + + (*outbuffp) = res->get_data(); + (*outbuff_lengthp) = res->buffsize(); + + if(debug > 2) { + res->show(); + printf("\n%%%%%%%% Final COMPRESSED size:%4d %f\n",res->size(),res->size()/8.0); + printf("*************************** #################################################\n"); + } + + delete res; + if(debug > 2) printf("WordDBCompress::Compress: final output size:%6d (inputsize:%6d)\n", (*outbuff_lengthp), inbuff_length); + + pg.unset_page(); + + return(0); +} + +int +WordDBCompress::Uncompress(const u_int8_t *inbuff, int inbuff_length, u_int8_t *outbuff,int outbuff_length) +{ + if(debug > 2) printf("WordDBCompress::Uncompress:: %5d -> %5d\n", inbuff_length, outbuff_length); + + WordDBPage pg(outbuff_length); + + if(debug > 2) printf("------------------------ WordDBCompress::Uncompress: --------------------------------\n"); + + Compressor in(inbuff_length); + in.set_data(inbuff,inbuff_length*8); + in.rewind(); + + pg.Uncompress(&in,0); + + memcpy((void *)outbuff, (void *)pg.pg, outbuff_length); + 
+ if(debug > 2) printf("------------------------ WordDBCompress::Uncompress: END\n"); + + // DEBUGING / BENCHMARKING + + pg.delete_page(); + return(0); +} + +int +WordDBCompress::TestCompress(const u_int8_t* pagebuff, int pagebuffsize) +{ + WordDBPage pg(pagebuff,pagebuffsize); + pg.TestCompress(debug); + pg.unset_page(); + return 0; +} diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordDBCompress.h b/debian/htdig/htdig-3.2.0b6/htword/WordDBCompress.h new file mode 100644 index 00000000..0f5c1973 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordDBCompress.h @@ -0,0 +1,114 @@ +// +// WordDBCompress.h +// +// WordDBCompress: Implements specific compression scheme for +// Berkeley DB pages containing WordReferences objects. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordDBCompress.h,v 1.6 2004/05/28 13:15:26 lha Exp $ +// + +#ifndef _WordDBCompress_h_ +#define _WordDBCompress_h_ + +// *********************************************** +// *************** WordDBCompress***************** +// *********************************************** +// Starting point for compression. +// +// +// Comrpession HOW IT WORKS: +// +// ** General outline: +// +// BerkeleyDB pages are stored in a memory pool. When the memory pool +// is full, least recently used pages are swaped to disk. Page +// compression occurs at page in/out level. The +// WordDBCompress_compress_c functions are C callbacks that are called +// by the the page compression code in BerkeleyDB. The C callbacks the +// call the WordDBCompress comress/uncompress methods. The +// WordDBCompress creates a WordDBPage which does the actual +// compress/uncompress job. 
+// +// The WordDBPage compression/uncompression methods store/retreive data +// from a bitstream. BitStream is a simple bitstream, and Compressor is +// a bitstream with added compression capabilities. +// + +// Compression algorithm. +// +// Most DB pages are full of really redundant data. Mifluz choice of using +// one db entry per word makes the DB pages have an even more redundant. +// But this choice also makes the pages have a very simple structure. +// +// Here is a real world example of what a page can look like: +// (key structure: word + 4 numerical fields) +// +// "trois" 1 4482 1 10b +// "trois" 1 4482 1 142 +// "trois" 1 4484 1 40 +// "trois" 1 449f 1 11e +// "trois" 1 4545 1 11 +// "trois" 1 45d3 1 545 +// "trois" 1 45e0 1 7e5 +// "trois" 1 45e2 1 830 +// "trois" 1 45e8 1 545 +// "trois" 1 45fe 1 ec +// "trois" 1 4616 1 395 +// "trois" 1 461a 1 1eb +// "trois" 1 4631 1 49 +// "trois" 1 4634 1 48 +// .... etc .... +// +// To compress we chose to only code differences between succesive entries. +// +// Differences in words are coded by 2 numbers and some letters: +// - the position within the word of the first letter that changes +// - the size of the new suffix +// - the letters in the new suffix +// +// Only differences in succesive numerical entries are stored. +// +// A flag is stored for each entry indicating which fields have changed. +// +// All this gives us a few numerical arrays which are themselves compressed +// and sent to the bitstream. +// +// +class WordDBCompress +{ + public: + WordDBCompress(); + WordDBCompress(int, int); + + int Compress(const u_int8_t* inbuff, int inbuff_length, u_int8_t** outbuffp, int* outbuff_lengthp); + int Uncompress(const u_int8_t* inbuff, int inbuff_length, u_int8_t* outbuff, int outbuff_length); + + // + // Return a new DB_CMPR_INFO initialized with characteristics of the + // current object and suitable as WordDB::CmprInfo argument. 
+ // + DB_CMPR_INFO *CmprInfo(); + + private: + DB_CMPR_INFO *cmprInfo; + + //ZLIB WordDBCompression Flags + int use_zlib; + int zlib_level; + +// DEBUGING / BENCHMARKING + int debug; +// 0 : no debug no check +// 1 : TestCompress before each compression (but no debug within Compress Uncompress) +// 2 : use_tags (BitStream) within TestCompress -> Compress Uncompress +// 3 : verbose + int TestCompress(const u_int8_t* pagebuff, int pagebuffsize); +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordDBInfo.cc b/debian/htdig/htdig-3.2.0b6/htword/WordDBInfo.cc new file mode 100644 index 00000000..b4fb1225 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordDBInfo.cc @@ -0,0 +1,97 @@ +// WordDBInfo.cc +// +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> +#include <stdio.h> + +#include "db.h" +#include "WordDBInfo.h" + +// +// WordDBInfo implementation +// + +WordDBInfo* WordDBInfo::instance = 0; + +// +// Like standard function but allows easy breakpoint setting. 
+// +static void message(const char *errpfx, char *msg) +{ + fprintf(stderr, "%s: %s\n", errpfx, msg); +} + +WordDBInfo::WordDBInfo(const Configuration& config) +{ + dbenv = 0; + + if(config.Boolean("wordlist_env_skip")) return; + + int error; + if((error = CDB_db_env_create(&dbenv, 0)) != 0) { + fprintf(stderr, "WordDBInfo: CDB_db_env_create %s\n", CDB_db_strerror(error)); + return; + } + dbenv->set_errpfx(dbenv, "WordDB"); + dbenv->set_errcall(dbenv, message); + if(dbenv->set_verbose(dbenv, DB_VERB_CHKPOINT, 1) != 0) + return; + if(dbenv->set_verbose(dbenv, DB_VERB_DEADLOCK, 1) != 0) + return; + if(dbenv->set_verbose(dbenv, DB_VERB_RECOVERY, 1) != 0) + return; + if(dbenv->set_verbose(dbenv, DB_VERB_WAITSFOR, 1) != 0) + return; + int cache_size = config.Value("wordlist_cache_size", 10*1024*1024); + if(cache_size > 0) { + if(dbenv->set_cachesize(dbenv, 0, cache_size, 1) != 0) + return; + } + + char* dir = 0; + int flags = DB_CREATE; + if(config.Boolean("wordlist_env_share")) { + const String& env_dir = config["wordlist_env_dir"]; + if(env_dir.empty()) { + fprintf(stderr, "WordDB: wordlist_env_dir not specified\n"); + return; + } + dir = strdup((const char*)env_dir); + + if(config.Boolean("wordlist_env_cdb")) + flags |= DB_INIT_CDB; + else + flags |= DB_INIT_LOCK | DB_INIT_MPOOL; + + } else { + flags |= DB_PRIVATE | DB_INIT_LOCK | DB_INIT_MPOOL; + } + + if((error = dbenv->open(dbenv, (const char*)dir, NULL, flags, 0666)) != 0) + dbenv->err(dbenv, error, "open %s", (dir ? 
dir : "")); + if(dir) free(dir); +} + +WordDBInfo::~WordDBInfo() +{ + if(dbenv) dbenv->close(dbenv, 0); +} + +void +WordDBInfo::Initialize(const Configuration &config_arg) +{ + if(instance != 0) + delete instance; + instance = new WordDBInfo(config_arg); +} diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordDBInfo.h b/debian/htdig/htdig-3.2.0b6/htword/WordDBInfo.h new file mode 100644 index 00000000..86fa5576 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordDBInfo.h @@ -0,0 +1,82 @@ +// +// WordDBInfo.h +// +// NAME +// inverted index usage environment. +// +// SYNOPSIS +// +// Only called thru WordContext::Initialize() +// +// DESCRIPTION +// +// The inverted indexes may be shared among processes/threads and provide the +// appropriate locking to prevent mistakes. In addition the memory cache +// used by <i>WordList</i> objects may be shared by processes/threads, +// greatly reducing the memory needs in multi-process applications. +// For more information about the shared environment, check the Berkeley +// DB documentation. +// +// CONFIGURATION +// +// wordlist_env_skip {true,false} (default false) +// If true no environment is created at all. This must never +// be used if a <i>WordList</i> object is created. It may be +// useful if only <i>WordKey</i> objects are used, for instance. +// +// wordlist_env_share {true,false} (default false) +// If true a sharable environment is open or created if none exist. +// +// wordlist_env_dir <directory> (default .) +// Only valid if <i>wordlist_env_share</i> set to <i>true.</i> +// Specify the directory in which the sharable environment will +// be created. All +// inverted indexes specified with a non-absolute pathname will be +// created relative to this directory. 
+// +// +// END +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// + +#ifndef _WordDBInfo_h_ +#define _WordDBInfo_h_ + +#include "Configuration.h" + +struct __db_env; + +class WordDBInfo +{ + public: + WordDBInfo(const Configuration& config); + ~WordDBInfo(); + // + // Unique instance handlers + // + static void Initialize(const Configuration& config); + + static WordDBInfo* Instance() { + if(instance) return instance; + fprintf(stderr, "WordDBInfo::Instance: no instance\n"); + return 0; + } + + // + // Berkeley DB environment + // + struct __db_env *dbenv; + + // + // Unique instance pointer + // + static WordDBInfo* instance; +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordDBPage.cc b/debian/htdig/htdig-3.2.0b6/htword/WordDBPage.cc new file mode 100644 index 00000000..eb43af30 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordDBPage.cc @@ -0,0 +1,1024 @@ +// +// WordDBPage.cc +// +// WordDBPage: Implements specific compression scheme for +// Berkeley DB pages containing WordReferences objects. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordDBPage.cc,v 1.5 2004/05/28 13:15:26 lha Exp $ +// +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include"WordDBPage.h" +#include"WordDBCompress.h" +#include<ctype.h> + +#define NBITS_CMPRTYPE 2 +#define CMPRTYPE_NORMALCOMRPESS 0 +#define CMPRTYPE_BADCOMPRESS 1 + +// *********************************************** +// ********** Compression Versions ************** +// *********************************************** + +// never change NBITS_COMPRESS_VERSION ! (otherwise version tracking will fail) +#define NBITS_COMPRESS_VERSION 11 + +// IMPORTANT: change these EVERY time you change something that affects the compression +#define COMPRESS_VERSION 4 +static const char *version_label[]={"INVALID_VERSION_0","INVALID_VERSION_1","INVALID_VERSION_2","14 Dec 1999","3 Jan 2000",NULL}; + +// returns the label of compression version v +static const char * +get_version_label(int v) +{ + // check if version number is ok + if(COMPRESS_VERSION <0 || COMPRESS_VERSION>((sizeof(version_label)/sizeof(*version_label))-1)) + { + errr("get_version_label: version_label[COMPRESS_VERSION] is not valid, please update version_label"); + } + if( v >= (int)((sizeof(version_label)/sizeof(*version_label))-1) ) + { + return("INVALID_VERSION"); + } + // return label + return(version_label[v]); +} + + + +// *********************************************** +// ********** WordDBPage *********************** +// *********************************************** + +// checks if compression/decompression sequence is harmless +int +WordDBPage::TestCompress(int debuglevel) +{ + if(debuglevel>2){printf("ttttttttttttt WordDBPage::TestCompress BEGIN\n");} + int 
compress_debug=debuglevel-1; + // start by compressing this page + Compressor *res=Compress(compress_debug); + + if(res) + { + int size=res->size(); + // now uncompress into pageu + WordDBPage pageu(pgsz); + res->rewind(); + pageu.Uncompress(res,compress_debug); + + // comapre this page and pageu + int cmp=Compare(pageu); + + // show some results + if(debuglevel>2)printf("TOTAL SIZE: %6d %8f\n",size,size/8.0); + // argh! compare failed somthing went wrong + // display the compress/decompress sequence and fail + if(cmp || size>8*1024*1000000000) + { + if(size>8*1024) + { + printf("---------------------------------------------------\n"); + printf("-----------overflow:%5d------------------------------\n",size/8); + printf("---------------------------------------------------\n"); + printf("---------------------------------------------------\n"); + } + printf("################### ORIGINAL #########################################\n"); + show(); + printf("################### REDECOMPRESSED #########################################\n"); + pageu.show(); + + // re-compress the page verbosely + Compressor *res2=Compress(2); + res2->rewind(); + // re-uncompress the page verbosely + WordDBPage pageu2(pgsz); + pageu2.Uncompress(res2,2); + pageu2.show(); + if(cmp){errr("Compare failed");} + delete res2; + } + pageu.delete_page(); + delete res; + + }else {errr("WordDBPage::TestCompress: Compress failed");} + + if(debuglevel>2){printf("ttttttttttttt WordDBPage::TestCompress END\n");} + return OK; +} + +// find position of first difference between 2 strings +static int first_diff(const String &s1,const String &s2) +{ + int j; + for(j=0;j<s1.length() && j<s2.length() && s1[j]==s2[j];j++); + return(j); +} + +// ******* Uncompress Compressor into this page +int +WordDBPage::Uncompress(Compressor *pin,int ndebug, DB_CMPR_INFO */*=NULL*/) +{ + debug=ndebug; + if(debug>1){verbose=1;} + if(verbose){printf("uuuuuuuuu WordDBPage::Uncompress: BEGIN\n");} + + + // ** first check if versions 
are OK + int read_version = pin->get_uint(NBITS_COMPRESS_VERSION,"COMPRESS_VERSION"); + if(read_version != COMPRESS_VERSION) + { + fprintf(stderr,"WordDBPage::Uncompress: *** Compression version mismatch ***\n"); + fprintf(stderr,"found version : %3d but using version : %3d\n",read_version,COMPRESS_VERSION); + fprintf(stderr,"found version label: %s\n",get_version_label(read_version)); + fprintf(stderr,"using version label: %s\n",get_version_label(COMPRESS_VERSION)); + fprintf(stderr,"Are you sure you're not reading an old DB with a newer version of the indexer??\n"); + errr("WordDBPage::Uncompress: *** Compression version mismatch ***"); + exit(1); + } + + + // ** now see if this page was a normal or uncorrectly compressed page + int cmprtype=pin->get_uint(NBITS_CMPRTYPE,"CMPRTYPE"); + // two possible cases + switch(cmprtype) + { + case CMPRTYPE_NORMALCOMRPESS:// this was a normaly compressed page + Uncompress_main(pin); + break; + case CMPRTYPE_BADCOMPRESS:// this page did not compress correctly + pin->get_zone((byte *)pg,pgsz*8,"INITIALBUFFER"); + break; + default: + errr("WordDBPage::Uncompress: CMPRTYPE incoherent"); + } + + if(verbose){printf("uuuuuuuuu WordDBPage::Uncompress: END\n");} + return OK; +} + +// ******* Uncompress Compressor into this page +// normally compressed page case +int +WordDBPage::Uncompress_main(Compressor *pin) +{ + if(!pin){errr("WordDBPage::Uncompress: no Compressor to uncompress from!!");} + Compressor &in=*((Compressor *)pin); + if(debug>0){in.set_use_tags();} + int i,j; + // number arrays used to reconstruct the original page + unsigned int **rnums=new unsigned int *[nnums]; + CHECK_MEM(rnums); + // sizes of each array + int *rnum_sizes=new int[nnums]; + CHECK_MEM(rnum_sizes); + // char differences between words + byte *rworddiffs=NULL; + int nrworddiffs; + + // *********** read header + if(Uncompress_header(in)!=OK){return NOTOK;} + + // get first key(s): + //type=5: key(0) stored seperately ... 
others are decompressed frome differences + // + //type=3: btikey(0) is particular (len=0) it is stored seperately + // btikey(1) stored seperately ... others are decompressed frome differences + // + int nkeysleft=nk; + if(nkeysleft>0) + { + WordDBKey key0=uncompress_key(in,0); + if(type==P_LBTREE){uncompress_data(in,0,key0.RecType());} + nkeysleft--; + } + if(nkeysleft>0 && type==P_IBTREE){uncompress_key(in,1);nkeysleft--;} + + if(nkeysleft>0) + { + // ********* read numerical fields + Uncompress_vals_chaged_flags(in,&(rnums[0]),&(rnum_sizes[0])); + for(j=1;j<nnums;j++) + { + if(verbose)printf("field %2d : start position:%4d \n",j,in.size()); + if(j==3 && verbose){in.verbose=2;} + rnum_sizes[j]=in.get_vals(&(rnums[j]),label_str("NumField",j));// *** + if(j==3 && verbose){in.verbose=0;} + if(verbose){printf("WordDBPage::Uncompress_main:got numfield:%2d:nvals:%4d\n",j,rnum_sizes[j]);} + } + + // ********* read word differences + nrworddiffs=in.get_fixedbitl(&rworddiffs,"WordDiffs"); + + + // ********* rebuild original page + Uncompress_rebuild(rnums,rnum_sizes,nnums,rworddiffs,nrworddiffs); + Uncompress_show_rebuild(rnums,rnum_sizes,nnums,rworddiffs,nrworddiffs); + + + for(i=0;i<nnums;i++){delete [] rnums[i];} + } + delete [] rnums; + delete [] rnum_sizes; + if(rworddiffs){delete [] rworddiffs;} + return 0; +} +void +WordDBPage::Uncompress_vals_chaged_flags(Compressor &in,unsigned int **pcflags,int *pn) +{ + int n=in.get_uint_vl(NBITS_NVALS,"FlagsField"); + unsigned int *cflags=new unsigned int[n]; + unsigned int ex=0; + int nbits=num_bits(n); + for(int i=0;i<n;i++) + { + ex=in.get_uint(WordKey::NFields(),label_str("cflags",i)); + cflags[i]=ex; + int rep=in.get("rep"); + if(rep) + { + rep=in.get_uint_vl(nbits,NULL); + for(int k=1;k<=rep;k++){cflags[k+i]=ex;} + i+=rep; + } + } + + *pn=n; + *pcflags=cflags; +} +int +WordDBPage::Uncompress_header(Compressor &in) +{ + pg->lsn.file =in.get_uint_vl( 8*sizeof(pg->lsn.file ),"page:lsn.file"); + pg->lsn.offset 
=in.get_uint_vl( 8*sizeof(pg->lsn.offset ),"page:lsn.offset"); + pg->pgno =in.get_uint_vl( 8*sizeof(pg->pgno ),"page:pgno"); + pg->prev_pgno =in.get_uint_vl( 8*sizeof(pg->prev_pgno ),"page:prev_pgno"); + pg->next_pgno =in.get_uint_vl( 8*sizeof(pg->next_pgno ),"page:next_pgno"); + pg->entries =in.get_uint_vl( 8*sizeof(pg->entries ),"page:entries"); + pg->hf_offset =in.get_uint_vl( 8*sizeof(pg->hf_offset ),"page:hf_offset"); + pg->level =in.get_uint_vl( 8*sizeof(pg->level ),"page:level"); + pg->type =in.get_uint_vl( 8*sizeof(pg->type ),"page:type"); + + init(); + + if(verbose) + { + printf("************************************\n"); + printf("******** WordDBPage::Uncompress: page header ***\n"); + printf("************************************\n"); + printf("page size:%d\n",(int)pgsz); + printf(" 00-07: Log sequence number. file : %d\n", pg->lsn.file ); + printf(" 00-07: Log sequence number. offset: %d\n", pg->lsn.offset ); + printf(" 08-11: Current page number. : %d\n", pg->pgno ); + printf(" 12-15: Previous page number. : %d\n", pg->prev_pgno ); + printf(" 16-19: Next page number. : %d\n", pg->next_pgno ); + printf(" 20-21: Number of item pairs on the page. : %d\n", pg->entries ); + printf(" 22-23: High free byte page offset. : %d\n", pg->hf_offset ); + printf(" 24: Btree tree level. : %d\n", pg->level ); + printf(" 25: Page type. 
: %d\n", pg->type ); + } + return OK; +} +void +WordDBPage::Uncompress_rebuild(unsigned int **rnums,int *rnum_sizes,int nnums0,byte *rworddiffs,int nrworddiffs) +{ + int irwordiffs=0; + int nfields=WordKey::NFields(); + int *rnum_pos=new int[ nnums0];// current index count + CHECK_MEM(rnum_pos); + + int ii,j; + for(j=0;j<nnums0;j++){rnum_pos[j]=0;} + + int i0=0; + if(type==P_IBTREE){i0=1;}// internal pages have particular first key + + WordDBKey pkey; + WordDBKey akey=get_WordDBKey(i0); + + // reconstruct each key using previous key and coded differences + for(ii=i0;ii<nk;ii++) + { + WordDBRecord arec; + BINTERNAL bti; + + if(type==P_LBTREE) + { + // **** get the data fields + arec.set_decompress(rnums,rnum_sizes,ii,CNDATADATA,CNDATASTATS0,CNDATASTATS1); + } + else + { + if(type!=3){errr("WordDBPage::Uncompress_rebuild: unsupported type!=3");} + // ****** btree internal page specific + bti.pgno =rnums[CNBTIPGNO ][rnum_pos[CNBTIPGNO ]++]; + bti.nrecs=rnums[CNBTINRECS][rnum_pos[CNBTINRECS]++]; + } + // all that follows codes differences between succesive entries + // that is: Numerical key fields, Words + if(ii>i0) + { + unsigned int flags=rnums[CNFLAGS][rnum_pos[CNFLAGS]++]; + int foundfchange=0; + // **** reconstruct the word + if(flags&pow2(nfields-1))// check flags to see if word has changed + { + foundfchange=1; + if(rnum_pos[CNWORDDIFFLEN]>=rnum_sizes[CNWORDDIFFLEN]){errr("WordDBPage::Uncompress read wrong num worddiffs");} + // get position of first character that changes in this word + int diffpos=rnums[CNWORDDIFFPOS][rnum_pos[CNWORDDIFFPOS]++]; + // get size of changed part of the word + int difflen=rnums[CNWORDDIFFLEN][rnum_pos[CNWORDDIFFLEN]++]; + int wordlen=diffpos+difflen; + char *str=new char [wordlen+1]; + CHECK_MEM(str); + // copy the unchanged part into str from previos key's word + if(diffpos)strncpy(str,(char *)pkey.GetWord(),diffpos); + // copy the changed part from coded word differences + strncpy(str+diffpos,(char 
*)rworddiffs+irwordiffs,difflen); + str[wordlen]=0; + if(verbose)printf("key %3d word:\"%s\"\n",ii,str); + akey.SetWord(str); + irwordiffs+=difflen; + delete [] str; + + }else{akey.SetWord(pkey.GetWord());} + // **** reconstruct the numerical key fields + for(j=1;j<nfields;j++) + { + // check flags to see if this field has changed + int changed=flags&pow2(j-1); + if(changed) + { + // this field's number + int k=CNFIELDS+j-1; + // current position within coded differences of this field + int indx=rnum_pos[k]; + if(indx>=rnum_sizes[k]){errr("WordDBPage::Uncompress read wrong num of changes in a field");} + if(!foundfchange) + { + // this is the first field that changes in this key + // so difference is coded compared to value in pevious key + akey.Set(j,rnums[k][indx]+pkey.Get(j)); + } + else + { + // this is NOT the first field that changes in this key + // so difference is coded from 0 + akey.Set(j,rnums[k][indx]); + } + // we read 1 element from coded differences in this field + rnum_pos[k]++; + foundfchange=1; + } + else + { + // no changes found, just copy from previous key + if(!foundfchange){akey.Set(j,pkey.Get(j));} + else{akey.Set(j,0);} + } + } + } + // now insert key/data into page + if(type==P_LBTREE) + { + if(ii>i0)insert_key(akey); + if(ii>i0)insert_data(arec); + } + else + { + if(type!=3){errr("WordDBPage::Uncompress_rebuild: unsupported type!=3");} + if(ii>i0)insert_btikey(akey,bti); + } + pkey=akey; + } + delete [] rnum_pos; +} + +// display +void +WordDBPage::Uncompress_show_rebuild(unsigned int **rnums,int *rnum_sizes,int nnums0,byte *rworddiffs,int nrworddiffs) +{ + int i,j; + if(verbose) + { + printf("WordDBPage::Uncompress_show_rebuild: rebuilt numerical fields\n"); + for(j=0;j<nnums0;j++) + { + printf("resfield %2d %13s:",j,number_field_label(j)); + for(i=0;i<rnum_sizes[j];i++) + { + printf("%4d ",rnums[j][i]); + } + printf("\n"); + printf("diffield %2d:",j); + for(i=0;i<rnum_sizes[j];i++) + { + ;// printf("%2d:%d ",i,nums[j*nk+i] == 
rnums[j][i]); + } + printf("\n"); + } + printf("reswordiffs:"); + for(i=0;i<nrworddiffs;i++){printf("%c",(isalnum(rworddiffs[i]) ? rworddiffs[i] : '#'));} + printf("\n"); + } +} + +Compressor * +WordDBPage::Compress(int ndebug, DB_CMPR_INFO *cmprInfo/*=NULL*/) +{ + debug=ndebug; + if(debug>1){verbose=1;} + + Compressor *res=(Compressor *)new Compressor((cmprInfo ? + pgsz/(1<<(cmprInfo->coefficient)) : + pgsz/4)); + CHECK_MEM(res); + if(debug>0){res->set_use_tags();} + + res->put_uint(COMPRESS_VERSION,NBITS_COMPRESS_VERSION,"COMPRESS_VERSION"); + res->put_uint(CMPRTYPE_NORMALCOMRPESS,NBITS_CMPRTYPE,"CMPRTYPE"); + + if(verbose){printf("WordDBPage::Compress: trying normal compress\n");} + int cmpr_ok=Compress_main(*((Compressor *)res)); + + if(cmpr_ok!=OK || res->buffsize()>pgsz) + { + if(verbose){printf("WordDBCompress::Compress full compress failed ... not compressing at all\n");} + show(); + + if(res){delete res;} + res=new Compressor; + CHECK_MEM(res); + + if(debug>0){res->set_use_tags();} + + res->put_uint(COMPRESS_VERSION,NBITS_COMPRESS_VERSION,"COMPRESS_VERSION"); + res->put_uint(CMPRTYPE_BADCOMPRESS,NBITS_CMPRTYPE,"CMPRTYPE"); + + res->put_zone((byte *)pg,pgsz*8,"INITIALBUFFER"); + } + + if(verbose) + { + printf("WordDBPage::Compress: Final bitstream result\n"); + res->show(); + } + return res; +}; + +int +WordDBPage::Compress_main(Compressor &out) +{ + if(debug>1){verbose=1;} + if(verbose){printf("WordDBPage::Compress_main: starting compression\n");} + + if(pg->type!=5 && pg->type!=3){ printf("pg->type:%3d\n",pg->type);return NOTOK;} +// if(pg->type==P_IBTREE){show();} + + + // *************** initialize data structures ************** + int j; + // 0 -> changed/unchanged flags : 4bits + // 1..n -> numerical fields delta : ?bits (depending on field) + // n+1 -> word changed size : 1 + int *nums =new int[nk*nnums]; + CHECK_MEM(nums); + int *nums_pos=new int[ nnums]; + CHECK_MEM(nums_pos); +// int *cnsizes =new int[ nnums]; + for(j=0;j<nnums;j++){nums_pos[j]=0;} 
+// for(j=1;j<nfields;j++) {cnsizes[j]=word_key_info->sort[j].bits;} +// cnsizes[CNFLAGS]=4; +// cnsizes[CNWORDDIFFPOS ]=8; +// cnsizes[CNWORDDIFFLEN ]=8; + HtVector_byte worddiffs; + + +//bmt_START; + // *************** extract values and wordiffs ************** + if(nk>0) + { + Compress_extract_vals_wordiffs(nums,nums_pos,nnums,worddiffs); + if(verbose)Compress_show_extracted(nums,nums_pos,nnums,worddiffs); + } + + // *************** init compression ************** + +//bmt_END;bmt_START; + Compress_header(out); + + // *************** compress values and wordiffs ************** + + // compress first key(s) + int nkeysleft=nk; + if(nkeysleft>0) + { + compress_key(out,0); + if(type==P_LBTREE){compress_data(out,0);} + nkeysleft--; + } + if(nkeysleft>0 && type==P_IBTREE){compress_key(out,1);nkeysleft--;} + + if(nkeysleft>0) + { +//bmt_END;bmt_START; + // compress values + Compress_vals(out,nums,nums_pos,nnums); +//bmt_END;bmt_START; + + // compress worddiffs + int size=out.put_fixedbitl(worddiffs.begin(),worddiffs.size(),"WordDiffs"); + if(verbose)printf("compressed wordiffs : %3d values: %4d bits %4f bytes\n",worddiffs.size(),size,size/8.0); +//bmt_END; + } + + // *************** cleanup ************** + + delete [] nums ; + delete [] nums_pos; + + return OK; +} + +void +WordDBPage::Compress_extract_vals_wordiffs(int *nums,int *nums_pos,int ,HtVector_byte &worddiffs) +{ + WordDBKey pkey; + + int ii,j; + int i0=0; + if(type==P_IBTREE){i0=1;}// internal pages have particular first key + for(ii=i0;ii<nk;ii++) + { + WordDBKey akey=get_WordDBKey(ii); + + if(type==P_LBTREE) + { + // ****** WordRecord (data/stats) + // get word record + WordDBRecord arec(data(ii),akey.RecType()); + // add record + if(arec.type==WORD_RECORD_STATS) + { + nums[CNDATASTATS0*nk+nums_pos[CNDATASTATS0]++]=arec.info.stats.noccurrence; + nums[CNDATASTATS1*nk+nums_pos[CNDATASTATS1]++]=arec.info.stats.ndoc; + } + else + if(arec.type==WORD_RECORD_DATA) + { + nums[CNDATADATA *nk+nums_pos[CNDATADATA 
]++]=arec.info.data; + } + } + else + { + if(type!=3){errr("WordDBPage::Compress_extract_vals_wordiffs: unsupported type!=3");} + // ****** btree internal page specific + nums[CNBTIPGNO *nk+nums_pos[CNBTIPGNO ]++]=btikey(ii)->pgno ; + nums[CNBTINRECS*nk+nums_pos[CNBTINRECS]++]=btikey(ii)->nrecs; + } + + // all that follows codes differences between succesive entries + // that is: Numerical key fields, Words + if(ii>i0) + { + // clear changed falgs + int iflag=CNFLAGS*nk+nums_pos[CNFLAGS]++; + nums[iflag]=0; + + int foundfchange=0; + const String &aword=akey.GetWord(); + const String &pword=pkey.GetWord(); + if(!(aword==pword)){foundfchange=1;} + + // check numerical fields for changes + // ******** sets CNFIELDS and some of CNFLAGS ************ + for(j=1;j<akey.NFields();j++) + { + int diff=akey.Get(j)-(foundfchange ? 0 : pkey.Get(j)); + if(diff) + { + foundfchange=1; + nums[iflag]|=pow2(j-1); + nums[ j*nk+nums_pos[j]++]=diff; + } + } + + // ************ check word for changes + // ******** sets CNWORDDIFFPOS CNWORDDIFFLEN and some of CNFLAGS ************ + if(!(aword==pword)) + { + nums[iflag]|=pow2(akey.NFields()-1); + int fd=first_diff(aword,pword); + nums[CNWORDDIFFPOS*nk+nums_pos[CNWORDDIFFPOS]++]=fd; + nums[CNWORDDIFFLEN*nk+nums_pos[CNWORDDIFFLEN]++]=aword.length()-fd; + for(int s=fd;s<aword.length();s++){worddiffs.push_back(aword[s]);} + } + } + pkey=akey; + } +// nums_pos[CNFLAGS]=nk-1; + +} + +void +WordDBPage::Compress_vals_changed_flags(Compressor &out,unsigned int *cflags,int n) +{ + int size=out.size(); + out.put_uint_vl(n,NBITS_NVALS,"FlagsField"); + unsigned int ex=0; + int nbits=num_bits(n); + for(int i=0;i<n;i++) + { + ex=cflags[i]; + out.put_uint(ex,WordKey::NFields(),label_str("cflags",i)); + int k; + for(k=1;k+i<n;k++){if(ex!=cflags[i+k]){break;}} + k--; + if(k>0) + { + out.put(1,"rep"); + out.put_uint_vl(k,nbits,NULL); + i+=k; + } + else + {out.put(0,"rep");} + } + size=out.size()-size; + if(verbose)printf("compressed flags %2d : %3d values: 
%4d bits %8f bytes : ended bit field pos:%6d\n",0,n,size,size/8.0,out.size()); + +} + +void +WordDBPage::Compress_vals(Compressor &out,int *nums,int *nums_pos,int nnums0) +{ + // the changed flags fields are particular + Compress_vals_changed_flags(out,(unsigned int *)(nums+0*nk),nums_pos[0]); + + + // compress the difference numbers for the numerical fields + for( int j=1;j<nnums0;j++) + { + int nv=nums_pos[j]; + unsigned int *v=(unsigned int *)(nums+j*nk); + if((1 || j==3) && verbose){out.verbose=2;} + int size=out.put_vals(v,nv,label_str("NumField",j)); + if((1 || j==3) && verbose){out.verbose=0;} + if(verbose)printf("compressed field %2d : %3d values: %4d bits %8f bytes : ended bit field pos:%6d\n",j,n,size,size/8.0,out.size()); + } +} + +void +WordDBPage::Compress_header(Compressor &out) +{ +// no smart compression ... for now + out.put_uint_vl(pg->lsn.file , 8*sizeof(pg->lsn.file ),"page:lsn.file"); + out.put_uint_vl(pg->lsn.offset , 8*sizeof(pg->lsn.offset ),"page:lsn.offset"); + out.put_uint_vl(pg->pgno , 8*sizeof(pg->pgno ),"page:pgno"); + out.put_uint_vl(pg->prev_pgno , 8*sizeof(pg->prev_pgno ),"page:prev_pgno"); + out.put_uint_vl(pg->next_pgno , 8*sizeof(pg->next_pgno ),"page:next_pgno"); + out.put_uint_vl(pg->entries , 8*sizeof(pg->entries ),"page:entries"); + out.put_uint_vl(pg->hf_offset , 8*sizeof(pg->hf_offset ),"page:hf_offset"); + out.put_uint_vl(pg->level , 8*sizeof(pg->level ),"page:level"); + out.put_uint_vl(pg->type , 8*sizeof(pg->type ),"page:type"); +} + +void +WordDBPage::Compress_show_extracted(int *nums,int *nums_pos,int nnums0,HtVector_byte &worddiffs) +{ + int i,j; + int *cnindexe2=new int[ nnums0]; + CHECK_MEM(cnindexe2); + for(j=0;j<nnums0;j++){cnindexe2[j]=0;} + for(j=0;j<nnums0;j++) + { + printf("%13s",number_field_label(j)); + } + printf("\n"); + int w=0; + int mx=(nk>worddiffs.size() ? nk : worddiffs.size()); + for(i=0;i<mx;i++) + { + printf("%3d: ",i); + for(j=0;j<nnums0;j++) + { + int k=cnindexe2[j]++; + int nbits=(j ? 
16:4);// just to show the flags field... + if(k<nums_pos[j]) + { + int val=nums[j*nk+k]; + if(nbits<8){show_bits(val,nbits);printf(" ");} + else + { + printf("|%12u",val); + } + } + else + { + if(nbits<8){printf(" ");} + else + { + printf("| "); + } + } + } + if(w<worddiffs.size()){printf(" %02x %c ",worddiffs[w],(isalnum(worddiffs[w]) ? worddiffs[w] : '#'));} + w++; + printf("\n"); + } + delete [] cnindexe2; +} + +// Compare two pages to check if equal +int +WordDBPage::Compare(WordDBPage &other) +{ + int res=0; + // Compare headers + if(other.pgsz != pgsz ){res++;printf("compare failed for pgsz \n");} + if(other.pg->lsn.file != pg->lsn.file ){res++;printf("compare failed for pg->lsn.file \n");} + if(other.pg->lsn.offset != pg->lsn.offset ){res++;printf("compare failed for pg->lsn.offset \n");} + if(other.pg->pgno != pg->pgno ){res++;printf("compare failed for pg->pgno \n");} + if(other.pg->prev_pgno != pg->prev_pgno ){res++;printf("compare failed for pg->prev_pgno \n");} + if(other.pg->next_pgno != pg->next_pgno ){res++;printf("compare failed for pg->next_pgno \n");} + if(other.pg->entries != pg->entries ){res++;printf("compare failed for pg->entries \n");} + if(other.pg->hf_offset != pg->hf_offset ){res++;printf("compare failed for pg->hf_offset \n");} + if(other.pg->level != pg->level ){res++;printf("compare failed for pg->level \n");} + if(other.pg->type != pg->type ){res++;printf("compare failed for pg->type \n");} + int i,k; + // double check header + if(memcmp((void *)pg,(void *)other.pg,sizeof(PAGE)-sizeof(db_indx_t))) + { + res++; + printf("compare failed in some unknown place in header:\n"); + for(i=0;i<(int)(sizeof(PAGE)-sizeof(db_indx_t));i++) + { + printf("%3d: %3x %3x\n",i,((byte *)pg)[i],((byte *)other.pg)[i]); + } + } + + // pg->type != 5 && !=3 pages are not really compressed: just memcmp + if(pg->type != 5 && pg->type != 3) + { + if(memcmp((void *)pg,(void *)other.pg,pgsz)) + { + printf("compare:PAGETYPE:!=5 and memcmp failed\n"); + res++; + 
printf("compare failed\n"); + } + return(res); + } + + // compare each key/data pair + for(i=0;i<(type==P_LBTREE ? pg->entries/2 : pg->entries);i++) + { + if(pg->type==P_LBTREE) + { + // compare keys + if(key(i)->len !=other.key(i)->len ) + { + printf("compare:key(%2d) len : %2d != %2d\n",i,key(i)->len ,other.key(i)->len ); + res++; + } + if(key(i)->type!=other.key(i)->type) + { + printf("compare:key(%2d) type: %2d != %2d\n",i,key(i)->type,other.key(i)->type); + res++; + } + if(memcmp(key(i)->data,other.key(i)->data,key(i)->len)) + { + printf("compare :key(%2d)\n",i); + for(k=0;k<key(i)->len;k++) + { + int c=key(i)->data[k]; + if(isalnum(c)){printf(" %c ",c);} + else{printf("%02x ",c);} + } + printf("\n"); + for(k=0;k<key(i)->len;k++) + { + int c=other.key(i)->data[k]; + if(isalnum(c)){printf(" %c ",c);} + else{printf("%02x ",c);} + } + printf("\n"); + res++;printf("compare:key failed\n"); + } + // compare data + if(data(i)->len !=other.data(i)->len ) + { + printf("compare:data(%2d) len : %2d != %2d\n",i,data(i)->len ,other.data(i)->len ); + res++; + } + if(data(i)->type!=other.data(i)->type) + { + printf("compare:data(%2d) type: %2d != %2d\n",i,data(i)->type,other.key(i)->type); + res++; + } + if(memcmp(data(i)->data,other.data(i)->data,data(i)->len)) + { + printf("compare :data(%2d)\n",i); + for(k=0;k<data(i)->len;k++) + { + printf("%02x ",data(i)->data[k]); + } + printf("\n"); + for(k=0;k<data(i)->len;k++) + { + printf("%02x ",other.data(i)->data[k]); + } + printf("\n"); + res++;printf("compare:data failed\n"); + } + } + else + { + if(type!=3){errr("WordDBPage::Compare: unsupported type!=3");} + if(btikey(i)->len != other.btikey(i)->len || + btikey(i)->type != other.btikey(i)->type || + btikey(i)->pgno != other.btikey(i)->pgno || + btikey(i)->nrecs != other.btikey(i)->nrecs ) + { + printf("compare:btikey(%2d) failed\n",i); + printf("this :len :%4d type :%4d pgno :%4d nrecs :%4d \n",btikey(i)->len,btikey(i)->type, + btikey(i)->pgno,btikey(i)->nrecs); + 
printf("other:len :%4d type :%4d pgno :%4d nrecs :%4d \n",other.btikey(i)->len,other.btikey(i)->type, + other.btikey(i)->pgno,other.btikey(i)->nrecs); + res++; + + } + if(memcmp(btikey(i)->data,other.btikey(i)->data,btikey(i)->len)) + { + printf("compare :btikey(%2d)\n",i); + for(k=0;k<btikey(i)->len;k++) + { + printf("%02x ",btikey(i)->data[k]); + } + printf("\n"); + for(k=0;k<btikey(i)->len;k++) + { + printf("%02x ",other.btikey(i)->data[k]); + } + printf("\n"); + res++;printf("compare:btikey failed\n"); + + } + } + } + if(pg->entries>0) + { + int smallestoffset=HtMaxMin::min_v(pg->inp,pg->entries); + int other_smallestoffset=HtMaxMin::min_v(other.pg->inp,other.pg->entries); + if(smallestoffset!=other_smallestoffset) + { + printf("compare fail:smallestoffset:%d other_smallestoffset:%d\n",smallestoffset,other_smallestoffset); + res++; + } + } + + return(res); +} + +// Bit stream description +// | field[last] changed only | yes -> delta field[last] +// + +// redo=0 -> +// redo=1 -> oops, dont show! +// redo=2 -> +void +WordDBPage::show() +{ + int i,j,dd,l; + + printf("************************************\n"); + printf("************************************\n"); + printf("************************************\n"); + printf("page size:%d\n",(int)pgsz); + printf(" 00-07: Log sequence number. file : %d\n", pg->lsn.file ); + printf(" 00-07: Log sequence number. offset: %d\n", pg->lsn.offset ); + printf(" 08-11: Current page number. : %d\n", pg->pgno ); + printf(" 12-15: Previous page number. : %d\n", pg->prev_pgno ); + printf(" 16-19: Next page number. : %d\n", pg->next_pgno ); + printf(" 20-21: Number of item pairs on the page. : %d\n", pg->entries ); + printf(" 22-23: High free byte page offset. : %d\n", pg->hf_offset ); + printf(" 24: Btree tree level. : %d\n", pg->level ); + printf(" 25: Page type. 
: %d\n", pg->type ); + + + printf("entry offsets:"); + for(i=0;i<pg->entries;i++){printf("%4d ",pg->inp[i]);} + printf("\n"); + + if(pg->type ==5) + { + + WordRecord dud; + WordKey prev; + int pagecl=0; + for(i=0;i<pg->entries;i++) + { + if( (i%2) && dud.type==WORD_RECORD_NONE){continue;} + printf("\n||%c:%3d:off:%03d:invoff:%4d:len:%2d:typ:%x:",i%2 ? 'D' : 'K',i,e_offset(i),pgsz-e_offset(i),entry(i)->len,entry(i)->type); + if(i>0) + { + l=entry(i)->len+3; + dd=(int)(e_offset(i-1))-l; + dd-=dd%4; + printf("% 5d:: ",(e_offset(i)-dd)); + } + if(!(i%2)) + { + WordDBKey tkey(entry(i)); + int fieldchanged[10]; + char *wordchange=NULL; + printf("\""); + printf("%s",(char *)tkey.GetWord()); + printf("\""); + for(j=0;j<20-tkey.GetWord().length();j++){printf(" ");} + printf("|"); + for(j=1;j<tkey.NFields();j++){printf("%4x ",tkey.Get(j));} + printf("|"); + + for(j=1;j<tkey.NFields();j++) + { + int diff=tkey.Get(j)-prev.Get(j); + if(diff<0){diff=tkey.Get(j);} + printf("%6d ",diff); + fieldchanged[j]=diff; + } + + String &word=tkey.GetWord(); + String &pword=prev.GetWord(); + if(word==pword){printf(" 00 ===");fieldchanged[0]=0;} + else + { + int fd=first_diff(word,pword); + fieldchanged[0]=fd+1; + wordchange=((char *)word)+fd; + printf(" %2d %s",fd,((char *)word)+fd); + } + + int keycl=tkey.NFields(); + for(j=1;j<tkey.NFields();j++) + { + if(fieldchanged[j]){keycl+=WordKeyInfo::Instance()->sort[j].bits;} + } + if(fieldchanged[0]){keycl+=3;keycl+=8*strlen(wordchange);} + printf(" ::%2d %f",keycl,keycl/8.0); + pagecl+=keycl; + prev=tkey; + } + else + { + if(entry(i)->len>100){printf("WordDBPage::show: aaargh strange failing\n");return;} + for(j=0;j<entry(i)->len;j++) + { + printf("%02x ",entry(i)->data[j]); + } + } + } + printf("\n"); + } + else + if(1) + { + int nn=0; + // dump hex + for(i=0;;i++) + { + printf("%5d: ",nn); + for(j=0;j<20;j++) + { + printf("%2x ",((byte *)pg)[nn++]); + if(nn>=pgsz){break;} + } + printf("\n"); + if(nn>=pgsz){break;} + } + } + if(pg->type == 3) + 
{ + for(i=0;i<pg->entries;i++) + { + BINTERNAL *bie=GET_BINTERNAL(pg,i); + printf("%3d: off:%4d:len:%3d :type:%3d :pgno:%4d: nrecs:%4d:: ",i,pg->inp[i],bie->len,bie->type,bie->pgno,bie->nrecs); + WordDBKey tkey(bie); + for(j=0;j<bie->len-tkey.GetWord().length();j++){printf("%2x ",bie->data[j]);} + printf(" : "); + for(j=1;j<tkey.NFields();j++){printf("%5d ",tkey.Get(j));} + printf("\"%s\"\n",(char *)tkey.GetWord()); + } + } + +} + + diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordDBPage.h b/debian/htdig/htdig-3.2.0b6/htword/WordDBPage.h new file mode 100644 index 00000000..1f23d5ff --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordDBPage.h @@ -0,0 +1,508 @@ +// +// WordDBPage.h +// +// WordDBPage: Implements specific compression scheme for +// Berkeley DB pages containing WordReferences objects. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordDBPage.h,v 1.8 2004/05/28 13:15:26 lha Exp $ +// +// +// Access to Berkeley DB internal +// + +#ifndef _WordDBPage_h_ +#define _WordDBPage_h_ + +extern "C" +{ +#include "db_int.h" +#include "shqueue.h" +#include "db_shash.h" +#include "mp.h" +#include "db_page.h" +#include "common_ext.h" +} + +#include "WordDBCompress.h" +#include "WordBitCompress.h" +#include "WordRecord.h" +#include "WordKey.h" + + +#define WORD_ALIGN_TO(v,a) ( (v)%(a) ? 
(v+((a)-(v)%(a))) : v) +#define NBITS_KEYLEN 16 +#define NBITS_DATALEN 16 + +// *********************************************** +// *************** WordDBRecord ***************** +// *********************************************** + +// WordRecord with added functionalities to help with compression/decompression +class WordDBRecord : public WordRecord +{ +public: + + // retreive WordRecord data/stats from coded numbers + void set_decompress(unsigned int **data,int *indexes,int i,int pdata,int pstat0,int pstat1) + { + if(i>=indexes[pstat0]) + {// were pas the end of coded stats, so this can't be a stat + type=DefaultType(); + if(type==WORD_RECORD_DATA){info.data=data[pdata][i-indexes[pstat0]];} + else{info.data=0;} + } + else + {// this is a stat + type=WORD_RECORD_STATS; + info.stats.noccurrence=data[pstat0][i]; + info.stats.ndoc =data[pstat1][i]; + } + } + WordDBRecord():WordRecord(){;} + WordDBRecord(byte *dat,int len,int rectyp):WordRecord() + { + type=(rectyp ? DefaultType() : WORD_RECORD_STATS); + Unpack(String((char *)dat,len)); + } + WordDBRecord(BKEYDATA *ndata,int rectyp):WordRecord() + {// typ: 0->stat 1->data + type=(rectyp ? DefaultType() : WORD_RECORD_STATS); + Unpack(String((char *)ndata->data,ndata->len)); + } +}; + + +// *********************************************** +// **************** WordDBKey ***************** +// *********************************************** + +// WordKey with added functionalities to help with compression/decompression +class WordDBKey : public WordKey +{ + BKEYDATA *key; +public: + + int RecType(){return (GetWord()[0]!=1 ? 
1 :0);} + WordDBKey():WordKey() + { + key=NULL; + } + WordDBKey(BKEYDATA *nkey):WordKey() + { + key=nkey; + Unpack(String((char *)key->data,key->len)); + } + int is_null() + { + errr("UNUSED"); + if(GetWord().length()==0) + { + for(int j=1;j<NFields();j++) + {if(Get(j)!=0){errr("WordDBKey::is_null key has 0 len word but is not null");}} + return 1; + } + return 0; + } + WordDBKey(BINTERNAL *nkey):WordKey() + { + key=NULL; + if(nkey->len==0) + { + ;// errr("WordDBKey::WordDBKey(BINTERNAL) : nkey->len==0"); + } + else{Unpack(String((char *)nkey->data,nkey->len));} + } + WordDBKey(byte *data,int len):WordKey() + { + key=NULL; + if(!data || !len){errr("WordDBKey::WordDBKey(data,len) !data || !len");} + Unpack(String((char *)data,len)); + } +}; + + +// *********************************************** +// **************** WordDBPage ***************** +// *********************************************** + +// encapsulation of Berkeley DB BTREE page. +// this one knows how to compress/decompress itself +class WordDBPage +{ + public: + int n; // number of entries + int nk; // number of keys + int type; // for now 3(btreeinternal) && 5(leave:normal case) are allowed + int pgsz; + + PAGE *pg; // pointer to BerkeleyDB BTREE page structure + + // assert this page is a leave + void isleave() + { + if(type!=P_LBTREE){errr("WordDBPage::isleave: trying leave specific on non leave");} + } + + // assert this page is an internal (non-leave) page + void isintern() + { + if(type!=P_IBTREE){errr("WordDBPage::isintern: trying btreeinternal specific on non btreeinternal page type");} + + } + + // get the i'th key stored in this page + WordDBKey get_WordDBKey(int i) + { + if(type==P_LBTREE){return(WordDBKey(key(i)));} + else + if(type==P_IBTREE){return(WordDBKey(btikey(i)));} + else + {errr("WordDBPage:get_WordDBKey: bad page type");} + return WordDBKey(); + } + + // ******************* Accessors to packed entries **************** + + // get the i'th key stored in this (internal==nonleave) 
page. (ptr to packed) + BINTERNAL *btikey(int i) + { + if(i<0 || i>=pg->entries){printf("btikey:%d\n",i);errr("WordDBPage::btikey out iof bounds");} + isintern();return(GET_BINTERNAL(pg,i )); + } + // get the i'th entry stored in this (nonleave) page. (ptr to packed) + // an entry can either be a key or a data entry + BKEYDATA *entry (int i) + { + if(i<0 || i>=pg->entries){printf("entry:%d\n",i);errr("WordDBPage::entry out iof bounds");} + isleave(); return(GET_BKEYDATA (pg,i )); + } + // get the i'th key stored in this (leave) page. (ptr to packed) + BKEYDATA *key (int i) + { + if(i<0 || 2*i>=pg->entries){printf("key:%d\n",i);errr("WordDBPage::key out iof bounds");} + isleave(); return(GET_BKEYDATA (pg,i*2 )); + } + // get the i'th data stored in this (leave) page. (ptr to packed) + BKEYDATA *data (int i) + { + if(i<0 || 2*i+1>=pg->entries){printf("data:%d\n",i);errr("WordDBPage::data out iof bounds");} + isleave(); return(GET_BKEYDATA (pg,i*2+1)); + } + + + // ********************* Inserting entries into a page *************** + + int insert_pos; // offset in page of last inserted entry + int insert_indx; // index of next entry to be inserted + + int e_offset(int i) {return((int)(pg->inp[i]));} + + // allocate space (in the db page) for adding an entry to this page + void *alloc_entry(int size) + { + size=WORD_ALIGN_TO(size,4); + int inp_pos=((byte *)&(pg->inp[insert_indx]))-(byte *)pg; + insert_pos-=size; + if(insert_pos<=inp_pos) + { + show(); + printf("alloc_entry: allocating size:%4d entrynum:insert_indx:%4d at:insert_pos:%4d\n",size,insert_indx,insert_pos); + errr("WordDBPage::alloc_entry: PAGE OVERFLOW"); + } + pg->inp[insert_indx++]=insert_pos; + return((void *)((byte *)pg+insert_pos)); + } + + + // add a data entry to this page + void insert_data(WordDBRecord &wrec) + { + isleave(); + if(!(insert_indx%2)){errr("WordDBPage::insert_data data must be an odd number!");} + String prec; + wrec.Pack(prec); + int len=prec.length(); + int 
size=len+(sizeof(BKEYDATA)-1); + + BKEYDATA *dat=(BKEYDATA *)alloc_entry(size); + dat->len=len; + dat->type=1;//!!!!!!!!!!!!! + memcpy((void *)dat->data,(void *)(char *)prec,len); + } + // add a key entry to this page + void insert_key(WordDBKey &ky) + { + isleave(); + if(insert_indx%2){errr("WordDBPage::insert_key key must be an even number!");} + String pkey; + ky.Pack(pkey); + int keylen=pkey.length(); + int size=keylen+(sizeof(BKEYDATA)-1); + BKEYDATA *bky=(BKEYDATA *)alloc_entry(size); + bky->len=keylen; + bky->type=1;// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + memcpy((void *)bky->data,(void *)(char *)pkey,keylen); + } + // add a key entry to this internal page + void insert_btikey(WordDBKey &ky,BINTERNAL &bti,int empty=0) + { + isintern(); + int keylen=0; + String pkey; + if(!empty) + { + ky.Pack(pkey); + keylen=pkey.length(); + } + int size=keylen+((byte *)&(bti.data))-((byte *)&bti);// pos of data field in BINTERNAL + if(empty) + { + if(verbose){printf("WordDBPage::insert_btikey: empty : BINTERNAL:%d datapos:%d keylen:%d size:%d alligned to:%d\n",(int)sizeof(BINTERNAL), + (int)(((byte *)&(bti.data))-((byte *)&bti)), + keylen,size,WORD_ALIGN_TO(size,4));} + } + + BINTERNAL *btik=(BINTERNAL *)alloc_entry(size); + btik->len =(empty ? 0 : keylen); + btik->type=1;// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + btik->pgno =bti.pgno; + btik->nrecs=bti.nrecs; + if(!empty){memcpy((void *)btik->data,(void *)(char *)pkey,keylen);} +// else +// {btik->data[0]=0;}// just to avoid uninit memory read + } + int entry_struct_size() + { + return(type==P_IBTREE ? sizeof(BINTERNAL) : sizeof(BKEYDATA ) )-1; + } + int entry_size(int i) + { + return entry_struct_size() + (type==P_IBTREE ? 
btikey(i)->len : key(i)->len ); + } + + + + + + // ************** Comrpession/Uncompression *************************** + + // The compression functions + void Compress_extract_vals_wordiffs(int *nums,int *nums_pos,int nnums,HtVector_byte &wordiffs); + void Compress_show_extracted(int *nums,int *nums_pos,int nnums,HtVector_byte &wordiffs); + void Compress_vals(Compressor &out,int *nums,int *nums_pos,int nnums); + void Compress_vals_changed_flags(Compressor &out,unsigned int *cflags,int n); + void Compress_header(Compressor &out); + int Compress_main(Compressor &out); + Compressor *Compress(int debug=0, DB_CMPR_INFO *cmprInfo=NULL); + + // The uncompression functions + int Uncompress(Compressor *pin,int debug=0, DB_CMPR_INFO *cmprInfo=NULL); + int Uncompress_main(Compressor *pin); + void Uncompress_vals_chaged_flags(Compressor &in,unsigned int **pcflags,int *pn); + int Uncompress_header(Compressor &in); + void Uncompress_rebuild(unsigned int **rnums,int *rnum_sizes,int nnums,byte *rworddiffs,int nrworddiffs); + void Uncompress_show_rebuild(unsigned int **rnums,int *rnum_sizes,int nnums,byte *rworddiffs,int nrworddiffs); + + int TestCompress(int debuglevel); + int Compare(WordDBPage &other); + + // the following functions are use to compress/uncompress + // keys/data directly + // This is necesary for the first key/data elements of the page + void compress_key(Compressor &out,int i) + { + if(type==P_IBTREE) + { + int len=btikey(i)->len; + out.put_uint(len,NBITS_KEYLEN,label_str("seperatekey_len",i)); + if(verbose){printf("WordDBPage::compress_key:compress(typ3):%d ::: sizeof(BINTERNAL):%d\n",len,(int)sizeof(BINTERNAL));} + out.put_uint(btikey(i)->len ,sizeof(btikey(i)->len )*8,label_str("seperatekey_bti_len" ,i)); + out.put_uint(btikey(i)->type ,sizeof(btikey(i)->type )*8,label_str("seperatekey_bti_type" ,i)); + out.put_uint(btikey(i)->pgno ,sizeof(btikey(i)->pgno )*8,label_str("seperatekey_bti_pgno" ,i)); + 
out.put_uint(btikey(i)->nrecs,sizeof(btikey(i)->nrecs)*8,label_str("seperatekey_bti_nrecs",i)); + if(len){out.put_zone((byte *)btikey(i)->data,8*len,label_str("seperatekey_btidata",i));} + } + else + { + int len=key(i)->len; + out.put_uint(len,NBITS_KEYLEN,label_str("seperatekey_len",i)); + if(verbose){printf("WordDBPage::compress_key: compress(typ5):%d\n",len);} + out.put_zone((byte *)key(i)->data,8*len,label_str("seperatekey_data",i)); + } + } + void compress_data(Compressor &out,int i) + { + int len=data(i)->len; + out.put_uint(len,NBITS_DATALEN,label_str("seperatedata_len",i)); + if(verbose){printf("WordDBPage::compress_data: compressdata(typ5):%d\n",len);} + out.put_zone((byte *)data(i)->data,8*len,label_str("seperatedata_data",i)); + } + WordDBKey uncompress_key(Compressor &in,int i) + { + WordDBKey res; + int len=in.get_uint(NBITS_KEYLEN,label_str("seperatekey_len",i)); + if(verbose){printf("WordDBPage::uncompress_key: seperatekey:len:%d\n",len);} + + if(type==P_IBTREE) + { + if(len==0 && i!=0){errr("WordDBPage::uncompress_key: keylen=0 && i!=0");} + BINTERNAL bti; + bti.len =in.get_uint(sizeof(bti.len )*8,label_str("seperatekey_bti_len" ,i)); + bti.type =in.get_uint(sizeof(bti.type )*8,label_str("seperatekey_bti_type" ,i)); + bti.pgno =in.get_uint(sizeof(bti.pgno )*8,label_str("seperatekey_bti_pgno" ,i)); + bti.nrecs=in.get_uint(sizeof(bti.nrecs)*8,label_str("seperatekey_bti_nrecs",i)); + if(len!=bti.len){errr("WordDBPage::uncompress_key: incoherence: len!=bti.len");} + if(len) + { + byte *gotdata=new byte[len]; + CHECK_MEM(gotdata); + in.get_zone(gotdata,8*len,label_str("seperatekey_btidata",i)); + res=WordDBKey(gotdata,len); + delete [] gotdata; + } + insert_btikey(res,bti,(len==0 ? 
1:0)); + } + else + { + byte *gotdata=new byte[len]; + CHECK_MEM(gotdata); + in.get_zone(gotdata,8*len,label_str("seperatekey_data",i)); + res=WordDBKey(gotdata,len); + insert_key(res); + delete [] gotdata; + } + return res; + } + WordDBRecord uncompress_data(Compressor &in,int i,int rectyp) + { + WordDBRecord res; + int len=in.get_uint(NBITS_DATALEN,label_str("seperatedata_len",i)); + if(verbose)printf("uncompressdata:len:%d\n",len); + byte *gotdata=new byte[len]; + CHECK_MEM(gotdata); + in.get_zone(gotdata,8*len,label_str("seperatedata_data",i)); + res=WordDBRecord(gotdata,len,rectyp); + insert_data(res); + delete [] gotdata; + return res; + } + + + // exctracted numerical fields + + const char* number_field_label(int j) + { + if(j>0 && j<WordKey::NFields()){return (char *)(WordKey::Info()->sort[j].name);} + if( j==CNFLAGS )return "CNFLAGS " ; + if( j==CNDATASTATS0 )return "CNDATASTATS0 " ; + if( j==CNDATASTATS1 )return "CNDATASTATS1 " ; + if( j==CNDATADATA )return "CNDATADATA " ; + if( j==CNBTIPGNO )return "CNBTIPGNO " ; + if( j==CNBTINRECS )return "CNBTINRECS " ; + if( j==CNWORDDIFFPOS )return "CNWORDDIFFPOS" ; + if( j==CNWORDDIFFLEN )return "CNWORDDIFFLEN" ; + return "BADFIELD"; + } + // positions of different fileds in + // number arrays that are extracted + int CNFLAGS ;// FLAGS: which key-fields have changed + int CNFIELDS ;// first numerical field + int CNDATASTATS0 ;// word record - stats element 0 + int CNDATASTATS1 ;// word record - stats element 1 + int CNDATADATA ;// word record - data + int CNBTIPGNO ;// internal page: page pointed at by node + int CNBTINRECS ;// internal page: ?? 
+ int CNWORDDIFFPOS ;// position of first caracter that changed in word + int CNWORDDIFFLEN ;// number of chars that changed in word + int nnums ; + + + // ************** DEBUGING/BENCHMARKING *************** + void show(); + int verbose; + int debug; + + + // ************** Initialization/Destruction ***************** + + // initialize when header is valid + void init() + { + type=pg->type; + n=pg->entries; + nk=(type==P_LBTREE ? n/2 : n); + insert_pos=pgsz; + insert_indx=0; + } + + void init0() + { + CNFLAGS =0; + CNFIELDS =1; + CNDATASTATS0 = WordKey::NFields() ; + CNDATASTATS1 = WordKey::NFields() + 1; + CNDATADATA = WordKey::NFields() + 2; + CNBTIPGNO = WordKey::NFields() + 3; + CNBTINRECS = WordKey::NFields() + 4; + CNWORDDIFFPOS = WordKey::NFields() + 5; + CNWORDDIFFLEN = WordKey::NFields() + 6; + nnums=(CNWORDDIFFLEN+1); + + pg=NULL; + pgsz=0; + n=0; + nk=0; + type=-1; + verbose=0; + debug=0; + insert_pos=pgsz; + insert_indx=0; + } + + // db page was created here, destroy it + void delete_page() + { + if(!pg){errr("WordDBPage::delete_page: pg==NULL");} + delete [] pg; + pg=NULL; + } + // unlink db page from this encapsulation + void unset_page() + { + if(!pg){errr("WordDBPage::unset_page: pg==NULL");} + pg=NULL; + } + // the DB page must be unset or deleted + // before destroying this encapsulation + ~WordDBPage() + { + if(pg){errr("WordDBPage::~WordDBPage: page not empty");} + } + WordDBPage(int npgsz) + { + init0(); + pgsz=npgsz; + pg=(PAGE *)(new byte[pgsz]); + CHECK_MEM(pg); + insert_pos=pgsz; + insert_indx=0; + } + WordDBPage(const u_int8_t* buff,int buff_length) + { + init0(); + pg=(PAGE *)buff; + pgsz=buff_length; + insert_pos=pgsz; + insert_indx=0; + init(); + } +}; + + +#endif// _WordDBPage_h_ diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordDead.cc b/debian/htdig/htdig-3.2.0b6/htword/WordDead.cc new file mode 100644 index 00000000..ff5e5250 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordDead.cc @@ -0,0 +1,123 @@ +// +// WordDead.cc 
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1999-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: WordDead.cc,v 1.4 2004/05/28 13:15:26 lha Exp $
+//
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include <fcntl.h>
+
+#include "WordDead.h"
+#include "WordListOne.h"
+
+// Opaque handle returned by WordDead::Cursor(): wraps the Berkeley DB
+// cursor used to walk the "dead" sub-database.
+class WordDeadCursor {
+public:
+  WordDBCursor* cursor;
+};
+
+// Release the DB handle and the normalization mask owned by this object.
+WordDead::~WordDead()
+{
+  delete db;
+  delete mask;
+}
+
+// Bind this dead-document list to the inverted index 'nwords' and
+// allocate the backing database handle and the normalization mask.
+// Always returns OK.
+int WordDead::Initialize(WordList* nwords)
+{
+  words = nwords;
+  db = new WordDB(nwords->GetContext()->GetDBInfo());
+  mask = new WordKey(words->GetContext());
+  return OK;
+}
+
+// Open the "dead" BTREE sub-database in the index file, reusing the
+// owning WordList file name, flags and page size.  OK or NOTOK.
+int WordDead::Open()
+{
+  const String& filename = words->Filename();
+  int flags = words->Flags();
+
+  db->set_pagesize(words->Pagesize());
+
+  return db->Open(filename, "dead", DB_BTREE, flags, 0666, WORD_DB_DEAD) == 0 ? OK : NOTOK;
+}
+
+// Destroy the "dead" sub-database on disk.  OK or NOTOK.
+int WordDead::Remove()
+{
+  return db->Remove(words->Filename(), "dead") == 0 ? OK : NOTOK;
+}
+
+// Close the underlying DB handle.  OK or NOTOK.
+int WordDead::Close()
+{
+  return db->Close() == 0 ? OK : NOTOK;
+}
+
+// Project 'key' onto the mask set with Mask(): every field that is not
+// defined in the mask is invalidated in 'key' so that dead-list lookups
+// only depend on the masked fields.
+int WordDead::Normalize(WordKey& key) const
+{
+  int nfields = words->GetContext()->GetKeyInfo().nfields;
+  int i;
+  //
+  // Undefine in 'key' all fields not defined in 'mask'
+  //
+  for(i = 0; i < nfields; i++) {
+    if(!mask->IsDefined(i))
+      key.Set(i, WORD_KEY_VALUE_INVALID);
+  }
+
+  return OK;
+}
+
+// True (non zero) if the normalized form of 'key' is in the dead list.
+int WordDead::Exists(const WordKey& key) const
+{
+  WordKey tmp_key = key;
+
+  Normalize(tmp_key);
+
+  String coded;
+  String dummy;
+
+  tmp_key.Pack(coded);
+
+  return db->Get(0, coded, dummy, 0) == 0;
+}
+
+// Insert the normalized form of 'key' in the dead list.  The data part
+// of the record is left empty: only the packed key matters.
+int WordDead::Put(const WordKey& key) const
+{
+  WordKey tmp_key = key;
+
+  Normalize(tmp_key);
+
+  String coded;
+  String dummy;
+
+  tmp_key.Pack(coded);
+
+  return db->Put(0, coded, dummy, 0) == 0 ? OK : NOTOK;
+}
+
+// Allocate a cursor for sequentially walking the dead list with Next().
+WordDeadCursor* WordDead::Cursor() const
+{
+  WordDeadCursor* cursor = new WordDeadCursor;
+  cursor->cursor = db->Cursor();
+
+  return cursor;
+}
+
+// Step 'cursor' to the next dead entry, unpacking it into 'key'.
+// Returns 0 on success; on any other return (including DB_NOTFOUND at
+// the end of the list) the cursor is deallocated and must not be
+// reused.
+int WordDead::Next(WordDeadCursor* cursor, WordKey& key)
+{
+  String coded;
+  String dummy;
+  int ret = cursor->cursor->Get(coded, dummy, DB_NEXT);
+  if(ret != 0) {
+    delete cursor->cursor;
+    delete cursor;
+  } else {
+    key.Unpack(coded);
+  }
+  return ret;
+}
diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordDead.h b/debian/htdig/htdig-3.2.0b6/htword/WordDead.h
new file mode 100644
index 00000000..a9a6e2ed
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htword/WordDead.h
@@ -0,0 +1,70 @@
+//
+// WordDead.h
+//
+// NAME
+//
+// list of documents that must be ignored and then deleted from the index.
+//
+// SYNOPSIS
+//
+// Helper for the WordList class.
+//
+// DESCRIPTION
+//
+// WordDead is a list of WordKey entries describing deleted documents.
+// All inverted index entries that match a WordKey entry of the WordDead
+// list are treated as if they do not appear in the inverted index.
+// +// +// +// END +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordDead.h,v 1.4 2004/05/28 13:15:26 lha Exp $ +// + +#ifndef _WordDead_h_ +#define _WordDead_h_ + +#include <stdio.h> + +#include "htString.h" +#include "WordDB.h" + +class WordList; +class WordDeadCursor; + +class WordDead +{ + public: + WordDead() { words = 0; db = 0; mask = 0; } + ~WordDead(); + + int Initialize(WordList* words); + + int Open(); + int Remove(); + int Close(); + + int Mask(const WordKey& nmask) { *mask = nmask; return OK; } + + List* Words() const; + + int Normalize(WordKey& key) const; + int Exists(const WordKey& key) const; + int Put(const WordKey& key) const; + + WordDeadCursor* Cursor() const; + int Next(WordDeadCursor* cursor, WordKey& key); + + private: + WordList* words; + WordDB* db; + WordKey* mask; +}; +#endif /* _WordDead_h_ */ diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordDict.cc b/debian/htdig/htdig-3.2.0b6/htword/WordDict.cc new file mode 100644 index 00000000..85bac6f5 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordDict.cc @@ -0,0 +1,274 @@ +// +// WordDict.cc +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordDict.cc,v 1.4 2004/05/28 13:15:26 lha Exp $ +// +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <fcntl.h> + +#include "WordDict.h" +#include "WordListOne.h" + +#define WORD_DICT_CURSOR_FIRST 1 +#define WORD_DICT_CURSOR_NEXT 2 + +class WordDictCursor { +public: + int info; + String prefix; + 
WordDBCursor* cursor; +}; + +WordDict::~WordDict() +{ + delete db; +} + +int WordDict::Initialize(WordList* nwords) +{ + words = nwords; + db = new WordDB(nwords->GetContext()->GetDBInfo()); + return OK; +} + +int WordDict::Open() +{ + const String& filename = words->Filename(); + int flags = words->Flags(); + + db->set_pagesize(words->Pagesize()); + + return db->Open(filename, "dict", DB_BTREE, flags, 0666, WORD_DB_DICT) == 0 ? OK : NOTOK; +} + +int WordDict::Remove() +{ + return db->Remove(words->Filename(), "dict") == 0 ? OK : NOTOK; +} + +int WordDict::Close() +{ + return db->Close() == 0 ? OK : NOTOK; +} + +int WordDict::Serial(const String& word, unsigned int& serial) +{ + int ret; + WordDictRecord entry; + if((ret = entry.Get(db, word)) != 0 && ret != DB_NOTFOUND) + return NOTOK; + if(ret == DB_NOTFOUND) { + words->Meta()->Serial(WORD_META_SERIAL_WORD, entry.id); + if(entry.Put(db, word) != 0) return NOTOK; + } + serial = entry.id; + + return OK; +} + +int WordDict::SerialExists(const String& word, unsigned int& serial) +{ + int ret; + WordDictRecord entry; + if((ret = entry.Get(db, word)) != 0 && ret != DB_NOTFOUND) + return NOTOK; + + serial = ret == DB_NOTFOUND ? 
WORD_DICT_SERIAL_INVALID : entry.id; + + return OK; +} + +int WordDict::SerialRef(const String& word, unsigned int& serial) +{ + int ret; + WordDictRecord entry; + if((ret = entry.Get(db, word)) != 0 && ret != DB_NOTFOUND) + return NOTOK; + if(ret == DB_NOTFOUND) + words->Meta()->Serial(WORD_META_SERIAL_WORD, entry.id); + entry.count++; + if(entry.Put(db, word) != 0) return NOTOK; + serial = entry.id; + + return OK; +} + +int WordDict::Noccurrence(const String& word, unsigned int& noccurrence) const +{ + if(word.empty()) { + fprintf(stderr, "WordDict::Noccurrence: null word\n"); + return NOTOK; + } + WordDictRecord entry; + noccurrence = 0; + int ret; + if((ret = entry.Get(db, word)) != 0) { + if(ret != DB_NOTFOUND) + return NOTOK; + } + noccurrence = entry.count; + + return OK; +} + +int WordDict::Normalize(String& word) const +{ + const WordType& wtype = words->GetContext()->GetType(); + + return wtype.Normalize(word); +} + +int WordDict::Incr(const String& word, unsigned int incr) +{ + int ret; + WordDictRecord entry; + if((ret = entry.Get(db, word)) != 0 && ret != DB_NOTFOUND) + return NOTOK; + if(ret == DB_NOTFOUND) + words->Meta()->Serial(WORD_META_SERIAL_WORD, entry.id); + entry.count += incr; + if(entry.Put(db, word) != 0) return NOTOK; + return OK; +} + +int WordDict::Decr(const String& word, unsigned int decr) +{ + WordDictRecord entry; + int ret; + if((ret = entry.Get(db, word)) != 0) { + if(ret == DB_NOTFOUND) + fprintf(stderr, "WordDict::Unref(%s) Unref on non existing word occurrence\n", (const char*)word); + return NOTOK; + } + entry.count -= decr; + if(entry.count > 0) + ret = entry.Put(db, word) == 0 ? OK : NOTOK; + else + ret = entry.Del(db, word) == 0 ? 
OK : NOTOK; + + return ret; +} + +int WordDict::Put(const String& word, unsigned int noccurrence) +{ + int ret; + WordDictRecord entry; + if((ret = entry.Get(db, word)) != 0 && ret != DB_NOTFOUND) + return NOTOK; + if(ret == DB_NOTFOUND) + words->Meta()->Serial(WORD_META_SERIAL_WORD, entry.id); + entry.count = noccurrence; + if(entry.Put(db, word) != 0) return NOTOK; + return OK; +} + +List *WordDict::Words() const +{ + String key; + String coded; + WordDBCursor* cursor = db->Cursor(); + List* list = new List; + + while(cursor->Get(key, coded, DB_NEXT) == 0) + list->Add(new String(key)); + + delete cursor; + + return list; +} + +int WordDict::Exists(const String& word) const +{ + String tmp_word = word; + String coded; + + return db->Get(0, tmp_word, coded, 0) == 0; +} + +WordDictCursor* WordDict::Cursor() const +{ + WordDictCursor* cursor = new WordDictCursor; + cursor->cursor = db->Cursor(); + + return cursor; +} + +int WordDict::Next(WordDictCursor* cursor, String& word, WordDictRecord& record) +{ + String coded; + int ret = cursor->cursor->Get(word, coded, DB_NEXT); + if(ret != 0) { + delete cursor->cursor; + delete cursor; + } else { + record.Unpack(coded); + } + return ret; +} + +WordDictCursor* WordDict::CursorPrefix(const String& prefix) const +{ + WordDictCursor* cursor = new WordDictCursor; + cursor->cursor = db->Cursor(); + cursor->prefix = prefix; + cursor->info = WORD_DICT_CURSOR_FIRST; + + return cursor; +} + +int WordDict::NextPrefix(WordDictCursor* cursor, String& word, WordDictRecord& record) +{ + String coded; + int ret; + if(cursor->info == WORD_DICT_CURSOR_FIRST) { + word = cursor->prefix; + ret = cursor->cursor->Get(word, coded, DB_SET_RANGE); + cursor->info = WORD_DICT_CURSOR_NEXT; + } else { + ret = cursor->cursor->Get(word, coded, DB_NEXT); + } + // + // Stop walking when 1) DB_NOTFOUND, 2) the word found is shorter than + // the required prefix, 3) the word found does not start with the + // required prefix. 
+ // + if(ret != 0 || + cursor->prefix.length() > word.length() || + strncmp(cursor->prefix.get(), word.get(), cursor->prefix.length())) { + delete cursor->cursor; + delete cursor; + if(ret == 0) ret = DB_NOTFOUND; + } else { + record.Unpack(coded); + } + return ret; +} + +int WordDict::Write(FILE* f) +{ + WordDBCursor* cursor = db->Cursor(); + String key; + String coded; + unsigned int occurrence; + unsigned int id; + + while(cursor->Get(key, coded, DB_NEXT) == 0) { + int offset = 0; + coded.ber_shift(offset, occurrence); + coded.ber_shift(offset, id); + fprintf(f, "%s %d %d\n", (char*)key, id, occurrence); + } + + delete cursor; + + return OK; +} diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordDict.h b/debian/htdig/htdig-3.2.0b6/htword/WordDict.h new file mode 100644 index 00000000..86b45717 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordDict.h @@ -0,0 +1,252 @@ +// +// WordDict.h +// +// NAME +// +// manage and use an inverted index dictionary. +// +// SYNOPSIS +// +// #include <mifluz.h> +// +// WordList* words = ...; +// WordDict* dict = words->Dict(); +// +// DESCRIPTION +// +// WordDict maps strings to unique identifiers and frequency in the +// inverted index. Whenever a new word is found, the WordDict class +// can be asked to assign it a serial number. When doing so, an entry +// is created in the dictionary with a frequency of zero. The application +// may then increment or decrement the frequency to reflect the inverted +// index content. +// +// The serial numbers range from 1 to 2^32 inclusive. +// +// A WordDict object is automatically created by the WordList object and +// should not be created directly by the application. 
+// +// END +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordDict.h,v 1.4 2004/05/28 13:15:26 lha Exp $ +// + +#ifndef _WordDict_h_ +#define _WordDict_h_ + +#include <stdio.h> + +#ifndef SWIG +#include "htString.h" +#include "WordDB.h" + +class WordList; +class WordDictCursor; + +#define WORD_DICT_SERIAL_INVALID 0 + +class WordDictRecord { + public: + inline WordDictRecord() { count = 0; id = WORD_DICT_SERIAL_INVALID; } + + inline int Unpack(const String& coded) { + int offset = 0; + coded.ber_shift(offset, count); + coded.ber_shift(offset, id); + return OK; + } + + inline int Pack(String& coded) const { + int offset = 0; + coded.ber_push(offset, count); + coded.ber_push(offset, id); + return OK; + } + + inline int Get(WordDB* db, const String& word) { + String tmp_word = word; + String coded(BER_MAX_BYTES * 2); + int ret; + if((ret = db->Get(0, tmp_word, coded, 0)) != 0) return ret; + + Unpack(coded); + + return ret; + } + + inline int Put(WordDB* db, const String& word) { + String coded(BER_MAX_BYTES * 2); + Pack(coded); + return db->Put(0, word, coded, 0); + } + + inline int Del(WordDB* db, const String& word) { + return db->Del(0, word); + } + + inline unsigned int Count() { return count; } + inline unsigned int Id() { return id; } + + unsigned int count; + unsigned int id; +}; +#endif /* SWIG */ + +class WordDict +{ + public: +#ifndef SWIG + //- + // Private constructor. + // + WordDict() { words = 0; db = 0; } + ~WordDict(); + + //- + // Bind the object a WordList inverted index. Return OK on success, + // NOTOK otherwise. + // + int Initialize(WordList* words); + + //- + // Open the underlying Berkeley DB sub-database. The enclosing + // file is given by the <i>words</i> data member. 
Return OK on success, + // NOTOK otherwise. + // + int Open(); + //- + // Destroy the underlying Berkeley DB sub-database. Return OK on success, + // NOTOK otherwise. + // + int Remove(); + //- + // Close the underlying Berkeley DB sub-database. Return OK on success, + // NOTOK otherwise. + // + int Close(); + + //- + // If the <b>word</b> argument exists in the dictionnary, return its + // serial number in the <b>serial</b> argument. If it does not already + // exists, assign it a serial number, create an entry with a frequency + // of zero and return the new serial in the <b>serial</b> argument. + // Return OK on success, NOTOK otherwise. + // + int Serial(const String& word, unsigned int& serial); + //- + // If the <b>word</b> argument exists in the dictionnary, return its + // serial number in the <b>serial</b> argument. If it does not exists + // set the <b>serial</b> argument to WORD_DICT_SERIAL_INVALID. + // Return OK on success, NOTOK otherwise. + // + int SerialExists(const String& word, unsigned int& serial); + //- + // Short hand for Serial() followed by Ref(). + // Return OK on success, NOTOK otherwise. + // + int SerialRef(const String& word, unsigned int& serial); + //- + // Return the frequency of the <b>word</b> argument + // in the <b>noccurrence</b> argument. + // Return OK on success, NOTOK otherwise. + // + int Noccurrence(const String& word, unsigned int& noccurrence) const; +#endif /* SWIG */ + + //- + // Short hand for words->GetContext()->GetType()->Normalize(word). + // Return OK on success, NOTOK otherwise. + // + int Normalize(String& word) const; + + //- + // Short hand for Incr(word, 1) + // + int Ref(const String& word) { return Incr(word, 1); } + //- + // Add <b>incr</b> to the frequency of the <b>word</b>. + // Return OK on success, NOTOK otherwise. 
+ // + int Incr(const String& word, unsigned int incr); + //- + // Short hand for Decr(word, 1) + // + int Unref(const String& word) { return Decr(word, 1); } + //- + // Subtract <b>decr</b> to the frequency of the <b>word</b>. If + // the frequency becomes lower or equal to zero, remove the entry + // from the dictionnary and lose the association between the word and its + // serial number. + // Return OK on success, NOTOK otherwise. + // + int Decr(const String& word, unsigned int decr); + //- + // Set the frequency of <b>word</b> with the value of the <b>noccurrence</b> + // argument. + // + int Put(const String& word, unsigned int noccurrence); + + //- + // Return true if <b>word</b> exists in the dictionnary, false otherwise. + // + int Exists(const String& word) const; + +#ifndef SWIG + //- + // Return a pointer to the associated WordList object. + // + List* Words() const; + + //- + // Return a cursor to sequentially walk the dictionnary using the + // <b>Next</b> method. + // + WordDictCursor* Cursor() const; + //- + // Return the next entry in the dictionnary. The <b>cursor</b> argument + // must have been created using the <i>Cursor</i> method. The word is + // returned in the <b>word</b> argument and the record is returned in + // the <b>record</b> argument. + // On success the function returns 0, at the end of the dictionnary it + // returns DB_NOTFOUND. The <b>cursor</b> argument is deallocated when + // the function hits the end of the dictionnary or an error occurs. + // + int Next(WordDictCursor* cursor, String& word, WordDictRecord& record); + + //- + // Return a cursor to sequentially walk the entries of the dictionnary + // that start with the <b>prefix</b> argument, using the + // <b>NextPrefix</b> method. + // + WordDictCursor* CursorPrefix(const String& prefix) const; + //- + // Return the next prefix from the dictionnary. The <b>cursor</b> argument + // must have been created using the <i>CursorPrefix</i> method. 
The word is + // returned in the <b>word</b> argument and the record is returned in + // the <b>record</b> argument. The <b>word</b> is guaranteed to start with + // the prefix specified to the <b>CursorPrefix</b> method. + // On success the function returns 0, at the end of the dictionnary it + // returns DB_NOTFOUND. The <b>cursor</b> argument is deallocated when + // the function hits the end of the dictionnary or an error occurs. + // + int NextPrefix(WordDictCursor* cursor, String& word, WordDictRecord& record); + + //- + // Dump the complete dictionary in the file descriptor <b>f.</b> The + // format of the dictionary is <i>word serial frequency</i>, one by + // line. + // + int Write(FILE* f); + + private: + WordList* words; + WordDB* db; +#endif /* SWIG */ +}; +#endif /* _WordDict_h_ */ diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordKey.cc b/debian/htdig/htdig-3.2.0b6/htword/WordKey.cc new file mode 100644 index 00000000..413faaac --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordKey.cc @@ -0,0 +1,673 @@ +// +// WordKey.cc +// +// WordKey: All the functions are implemented regardless of the actual +// structure of the key using word_key_info. +// WARNING: although it may seem that you can have two String +// fields in the key, some code does not support that. This should +// not be a problem since the goal of the WordKey class is to +// implement the keys of an inverted index. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordKey.cc,v 1.9 2004/05/28 13:15:26 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> +#include <ctype.h> + +#include "WordKey.h" + +// +// Returns OK if fields set in 'object' and 'other' are all equal. 
+//
+// Fields not set in either 'object' or 'other' are ignored
+// completely. If the prefix_length is > 0 the 'object' String
+// fields are compared to the prefix_length bytes of the 'other'
+// String fields only.
+//
+// This function is useful to compare existing keys with a search
+// criterion that may be incomplete. For instance if we look for keys
+// that contain words starting with a given prefix or keys that
+// are located in a specific document, regardless of their location
+// in the document.
+//
+// Returns 1 if all fields defined in both keys are equal, 0 as soon
+// as one of them differs.
+//
+int WordKey::Equal(const WordKey& other) const
+{
+  const WordKeyInfo& info = *WordKey::Info();
+  //
+  // Walk the fields in sorting order. As soon as one of them
+  // does not compare equal, return.
+  //
+  for(int j = 0; j < info.nfields; j++)
+    {
+      //
+      // Only compare fields that are set in both key
+      //
+      if(!IsDefined(j) || !other.IsDefined(j)) continue;
+
+      switch(info.sort[j].type) {
+      case WORD_ISA_STRING:
+	//
+	// When the word suffix is undefined this key is a prefix
+	// pattern: only the leading kword.length() bytes must match.
+	//
+	if(!IsDefinedWordSuffix()) {
+	  if(kword != other.kword.sub(0, kword.length()))
+	    return 0;
+	} else {
+	  if(kword != other.kword)
+	    return 0;
+	}
+	break;
+      default:
+	if(Get(j) != other.Get(j)) return 0;
+	break;
+      }
+    }
+  return 1;
+}
+
+//
+// Compare <a> and <b> in the Berkeley DB fashion.
+// <a> and <b> are packed keys.
+// Compares full WordKey, unlike Compare_WordOnly.
+// Follows the bt_compare contract: < 0, 0 or > 0.  Returns NOTOK when
+// a key is too short to hold the numerical fields.
+//
+inline int
+WordKey::Compare(const char *a, int a_length, const char *b, int b_length)
+{
+  const WordKeyInfo& info = *WordKey::Info();
+
+  if(a_length < info.num_length || b_length < info.num_length) {
+    fprintf(stderr, "WordKey::Compare: key length %d or %d < info.num_length = %d\n", a_length, b_length, info.num_length);
+    return NOTOK;
+  }
+
+  //
+  // Walk the fields, as soon as one of them does not compare equal,
+  // return.
+  //
+
+  //
+  // first field: string
+  //
+  const int p1_length = a_length - info.num_length;
+  const int p2_length = b_length - info.num_length;
+  {
+    int len = p1_length > p2_length ? p2_length : p1_length;
+    const unsigned char* p1 = (unsigned char *)a;
+    const unsigned char* p2 = (unsigned char *)b;
+
+    for (;len--; ++p1, ++p2) {
+      if (*p1 != *p2)
+	return (int)*p1 - (int)*p2;
+    }
+    if(p1_length != p2_length)
+      return p1_length - p2_length;
+  }
+  //
+  // following fields: numerical, unpacked according to the layout
+  // described by info.sort[j]
+  //
+  for(int j = 1; j < info.nfields; j++)
+    {
+      WordKeyNum p1;
+      int a_index = info.sort[j].bytes_offset + p1_length;
+      WordKey::UnpackNumber((unsigned char *)&a[a_index],
+			    info.sort[j].bytesize,
+			    p1,
+			    info.sort[j].lowbits,
+			    info.sort[j].bits);
+
+      WordKeyNum p2;
+      int b_index = info.sort[j].bytes_offset + p2_length;
+      WordKey::UnpackNumber((unsigned char *)&b[b_index],
+			    info.sort[j].bytesize,
+			    p2,
+			    info.sort[j].lowbits,
+			    info.sort[j].bits);
+      //
+      // WordKeyNum is unsigned: returning p1 - p2 would wrap around
+      // and yield a result with the wrong sign whenever the difference
+      // does not fit in a signed int, corrupting the btree sort order.
+      // Compare explicitly instead.
+      //
+      if(p1 != p2)
+	return p1 < p2 ? -1 : 1;
+    }
+
+  //
+  // If we reach this point, everything compared equal
+  //
+  return 0;
+}
+//
+// Compare <a> and <b> in the Berkeley DB fashion.
+// <a> and <b> are packed keys.
+// Only compares "word" part of WordKey, unlike Compare.
+//
+inline int
+WordKey::Compare_WordOnly(const char *a, int a_length, const char *b, int b_length)
+{
+  const WordKeyInfo& info = *WordKey::Info();
+
+  if(a_length < info.num_length || b_length < info.num_length) {
+    fprintf(stderr, "WordKey::Compare_WordOnly: key length %d or %d < info.num_length = %d\n", a_length, b_length, info.num_length);
+    return NOTOK;
+  }
+
+  //
+  // compare first field only: actual word
+  //
+  const int p1_length = a_length - info.num_length;
+  const int p2_length = b_length - info.num_length;
+  {
+    int len = p1_length > p2_length ? p2_length : p1_length;
+    const unsigned char* p1 = (unsigned char *)a;
+    const unsigned char* p2 = (unsigned char *)b;
+
+    for (;len--; ++p1, ++p2) {
+      if (*p1 != *p2)
+	return (int)*p1 - (int)*p2;
+    }
+    if(p1_length != p2_length)
+      return p1_length - p2_length;
+  }
+  return 0;
+}
+
+//
+// Compare <a> and <b> in the Berkeley DB fashion.
+// <a> and <b> are packed keys.
+// Compares full WordKey, unlike Compare_WordOnly.
+//
+int
+WordKey::Compare(const String& a, const String& b)
+{
+  return WordKey::Compare(a, a.length(), b, b.length());
+}
+
+//
+// Compare <a> and <b> in the Berkeley DB fashion.
+// <a> and <b> are packed keys.
+// Only compares "word" part of WordKey, unlike Compare.
+//
+int
+WordKey::Compare_WordOnly(const String& a, const String& b)
+{
+  return WordKey::Compare_WordOnly(a, a.length(), b, b.length());
+}
+
+//
+// C comparison function interface for Berkeley DB (bt_compare)
+// Just call the static Compare function of WordKey. It is *critical*
+// that this function is as fast as possible. See the Berkeley DB
+// documentation for more information on the return values.
+// Compares full WordKey, unlike word_only_db_cmp.
+//
+int
+word_db_cmp(const DBT *a, const DBT *b)
+{
+  return WordKey::Compare((char*)a->data, a->size, (char*)b->data, b->size);
+}
+
+//
+// C comparison function interface for Berkeley DB (bt_compare)
+// Just call the static Compare function of WordKey.
+// See the Berkeley DB
+// documentation for more information on the return values.
+// Only compares text part of the WordKey, unlike word_db_cmp.
+//
+int
+word_only_db_cmp(const DBT *a, const DBT *b)
+{
+  return WordKey::Compare_WordOnly((char*)a->data, a->size, (char*)b->data, b->size);
+}
+
+//
+// Compare current key defined fields with other key defined fields only,
+// ignore fields that are not defined in key or other. Return 1 if different
+// 0 if equal. If different, position is set to the field number that differs,
+// lower is set to 1 if Get(position) is lower than other.Get(position) otherwise
+// lower is set to 0.
+//
+int WordKey::Diff(const WordKey& other, int& position, int& lower)
+{
+  position = -1;
+
+  if(IsDefined(0) && other.IsDefined(0)) {
+    int ret = 0;
+    if(other.IsDefinedWordSuffix())
+      ret = GetWord().compare(other.GetWord());
+    else
+      // 'other' has no word suffix: it is a prefix pattern, so only
+      // compare its leading bytes against this word.
+      ret = strncmp((char*)GetWord(), (const char*)other.GetWord(), other.GetWord().length());
+    if(ret) {
+      position = 0;
+      // NOTE(review): here lower = 1 when this word compares GREATER
+      // (ret > 0), which is the opposite of the convention used by the
+      // numerical branch below (lower = 1 when this value is smaller).
+      // Looks inconsistent with the header comment -- confirm against
+      // callers before changing.
+      lower = ret > 0;
+    }
+  }
+
+  if(position < 0) {
+    int nfields=WordKey::NFields();
+
+    int i;
+    // Scan the numerical fields until the first defined-in-both
+    // difference is found.
+    for(i = 1; i < nfields; i++) {
+      if(IsDefined(i) && other.IsDefined(i) &&
+	 Get(i) != other.Get(i)) {
+	lower = Get(i) < other.Get(i);
+	break;
+      }
+    }
+    if(i < nfields)
+      position = i;
+  }
+
+  return position >= 0;
+}
+
+//
+// Compare object and <other> using comparison of their packed form
+//
+int
+WordKey::PackEqual(const WordKey& other) const
+{
+  String this_pack;
+  Pack(this_pack);
+
+  String other_pack;
+  other.Pack(other_pack);
+
+  return this_pack == other_pack;
+}
+
+//
+// Implement ++ on a key.
+//
+// It behaves like arithmetic but follows these rules:
+// . Increment starts at field <position>
+// . If a field value overflows, increment field <position> - 1
+// . Undefined fields are ignored and their value untouched
+// . Incrementing the word field is done by appending \001
+// . When a field is incremented all fields to the left are set to 0
+// If position is not specified it is equivalent to NFields() - 1.
+// It returns OK if successful, NOTOK if position out of range or
+// WORD_FOLLOWING_ATEND if the maximum possible value was reached.
+// +// Examples assuming numerical fields are 8 bits wide: +// +// 0 1 2 3 OPERATION RESULT +// --------------------------------------------------------------------------------------- +// foo <DEF> 1 1 1 -> SetToFollowing(3) -> foo <DEF> 1 1 2 +// foo <DEF> 1 1 1 -> SetToFollowing(2) -> foo <DEF> 1 2 0 +// foo <DEF> 1 1 255 -> SetToFollowing(3) -> foo <DEF> 1 2 0 +// foo <DEF> 255 255 255 -> SetToFollowing(3) -> foo\001 <DEF> 0 0 0 +// foo <DEF> 255 1 1 -> SetToFollowing(1) -> foo\001 <DEF> 0 0 0 +// <UNDEF><UNDEF> 255 1 1 -> SetToFollowing(1) -> WORD_FOLLOWING_ATEND +// foo <DEF> 1 <UNDEF> 255 -> SetToFollowing(3) -> foo <DEF> 2 <UNDEF> 0 +// foo <DEF><UNDEF><UNDEF> 255 -> SetToFollowing(3) -> foo\001 <DEF><UNDEF><UNDEF> 0 +// +// +int WordKey::SetToFollowing(int position /* = WORD_FOLLOWING_MAX */) +{ + if(position == WORD_FOLLOWING_MAX) + position = NFields() - 1; + + if(position < 0 || position >= NFields()) { + fprintf(stderr, "WordKey::SetToFollowing invalid position = %d\n", position); + return NOTOK; + } + + int i = position; + while(i > 0) { + if(IsDefined(i)) { + if(Overflow(i, 1)) + Set(i, 0); + else + break; + } + i--; + } + + if(i == 0) { + if(IsDefined(i)) + GetWord() << '\001'; + else + return WORD_FOLLOWING_ATEND; + } else + Get(i)++; + + for(i = position + 1; i < NFields(); i++) + if(IsDefined(i)) Set(i,0); + + return OK; +} + +// +// Return true if the key may be used as a prefix for search. +// In other words return true if the fields set in the key +// are all contiguous, starting from the first field in sort order. +// +int +WordKey::Prefix() const +{ + const WordKeyInfo& info = *WordKey::Info(); + // + // If all fields are set, it can be considered as a prefix although + // it really is a fully qualified key. 
+ // + if(Filled()) return OK; + // + // If the first field is not set this cannot be a prefix + // + if(!IsDefined(0)) return NOTOK; + + int found_unset = 0; + if(!IsDefinedWordSuffix()) { found_unset = 1; } + // + // Walk the fields in sorting order. + // + for(int j = WORD_FIRSTFIELD; j < info.nfields; j++) + { + // + // Fields set, then fields unset then field set -> not a prefix + // + if(IsDefined(j)) + if(found_unset) return NOTOK; + else + // + // Found unset fields and this is fine as long as we do + // not find a field set later on. + // + found_unset++; + } + + return OK; +} + +// +// Unset all fields past the first unset field +// Return the number of fields in the prefix or 0 if +// first field is not set, ie no possible prefix. +// +int +WordKey::PrefixOnly() +{ + const WordKeyInfo& info = *WordKey::Info(); + // + // If all fields are set, the whole key is the prefix. + // + if(Filled()) return OK; + // + // If the first field is not set there is no possible prefix + // + if(!IsDefined(0)) + { + return NOTOK; + } + + int found_unset = 0; + // + // Walk the fields in sorting order. 
+ // + if(!IsDefinedWordSuffix()){found_unset=1;} + + for(int j = WORD_FIRSTFIELD; j < info.nfields; j++) + { + // + // Unset all fields after the first unset field + // + if(IsDefined(j)) + { + if(found_unset) {Set(j,0);Undefined(j);} + } + else {found_unset=1;} + } + + return OK; +} + +// +// Unpack from data and fill fields of object +// +int +WordKey::Unpack(const char* string,int length) +{ + const WordKeyInfo& info = *WordKey::Info(); + if(length < info.num_length) { + fprintf(stderr, "WordKey::Unpack: key record length < info.num_length\n"); + return NOTOK; + } + + int string_length = length - info.num_length; + SetWord(string, string_length); + + for(int j = WORD_FIRSTFIELD; j < info.nfields; j++) + { + WordKeyNum value = 0; + int index = string_length + info.sort[j].bytes_offset; + WordKey::UnpackNumber((unsigned char *)&string[index], + info.sort[j].bytesize, + value, + info.sort[j].lowbits, + info.sort[j].bits); + Set(j,value); + } + + return OK; +} + +// +// Pack object into the <packed> string +// +int +WordKey::Pack(String& packed) const +{ + const WordKeyInfo& info = *WordKey::Info(); + + char* string; + int length = info.num_length; + + length += kword.length(); + + if((string = (char*)malloc(length)) == 0) { + fprintf(stderr, "WordKey::Pack: malloc returned 0\n"); + return NOTOK; + } + memset(string, '\0', length); + + memcpy(string, kword.get(), kword.length()); + for(int i = WORD_FIRSTFIELD; i < info.nfields; i++) { + int index = kword.length() + info.sort[i].bytes_offset; + WordKey::PackNumber(Get(i), + &string[index], + info.sort[i].bytesize, + info.sort[i].lowbits, + info.sort[i].lastbits); + } + + packed.set(string, length); + + free(string); + + return OK; +} + +// +// Copy all fields set in <other> to object, only if +// the field is not already set in <other> +// +int WordKey::Merge(const WordKey& other) +{ + const WordKeyInfo& info = *WordKey::Info(); + + + for(int j = 0; j < info.nfields; j++) { + if(!IsDefined(j) && other.IsDefined(j)) 
{ + switch(info.sort[j].type) { + case WORD_ISA_STRING: + SetWord(other.GetWord()); + if(!other.IsDefinedWordSuffix()) UndefinedWordSuffix(); + break; + default: + Set(j,other.Get(j)); + break; + } + } + } + + return OK; +} + +// +// Convert the whole structure to an ascii string description +// +int +WordKey::Get(String& buffer) const +{ + buffer.trunc(); + const WordKeyInfo& info = *WordKey::Info(); + + // + // Walk the fields in sorting order. As soon as one of them + // does not compare equal, return. + // + for(int j = 0; j < info.nfields; j++) { + if(!IsDefined(j)) { + buffer << "<UNDEF>"; + } else { + switch(info.sort[j].type) { + case WORD_ISA_STRING: + buffer << GetWord(); + break; + case WORD_ISA_NUMBER: + buffer << Get(j); + break; + default: + fprintf(stderr, "WordKey::Get: invalid type %d for field %d\n", info.sort[j].type, j); + return NOTOK; + } + } + // + // Output virtual word suffix field + // + if(j == 0) { + if(IsDefined(j) && !IsDefinedWordSuffix()) { + buffer << "\t<UNDEF>"; + } else { + buffer << "\t<DEF>"; + } + } + buffer << "\t"; + } + return OK; +} + +String +WordKey::Get() const +{ + String tmp; + Get(tmp); + return tmp; +} + +// +// Set a key from an ascii representation +// +int +WordKey::Set(const String& buffer) +{ + StringList fields(buffer, "\t "); + return SetList(fields); +} + +// +// Set a key from list of fields +// +int +WordKey::SetList(StringList& fields) +{ + const WordKeyInfo& info = *WordKey::Info(); + int length = fields.Count(); + + // + // + 1 counts for the word suffix field + // + if(length < info.nfields + 1) { + fprintf(stderr, "WordKey::Set: expected at least %d fields and found %d (ignored)\n", info.nfields + 1, length); + return NOTOK; + } + if(length < 2) { + fprintf(stderr, "WordKey::Set: expected at least two fields in line\n"); + return NOTOK; + } + + Clear(); + + fields.Start_Get(); + // + // Handle word and its suffix + // + int i = 0; + { + // + // Get the word + // + String* word = 
(String*)fields.Get_Next(); + if(word == 0) { + fprintf(stderr, "WordKey::Set: failed to get word\n"); + return NOTOK; + } + if(word->nocase_compare("<undef>") == 0) + UndefinedWord(); + else + SetWord(*word); + i++; + + // + // Get the word suffix status + // + String* suffix = (String*)fields.Get_Next(); + if(suffix == 0) { + fprintf(stderr, "WordKey::Set: failed to get word suffix %d\n", i); + return NOTOK; + } + if(suffix->nocase_compare("<undef>") == 0) + UndefinedWordSuffix(); + else + SetDefinedWordSuffix(); + } + + // + // Handle numerical fields + // + int j; + for(j = WORD_FIRSTFIELD; i < info.nfields; i++, j++) { + String* field = (String*)fields.Get_Next(); + + if(field == 0) { + fprintf(stderr, "WordKey::Set: failed to retrieve field %d\n", i); + return NOTOK; + } + + if(field->nocase_compare("<undef>") == 0) { + Undefined(j); + } else { + WordKeyNum value = strtoul(field->get(), 0, 10); + Set(j, value); + } + } + + return OK; +} + +int WordKey::Write(FILE* f) const +{ + String tmp; + Get(tmp); + fprintf(f, "%s", (char*)tmp); + return 0; +} + +void WordKey::Print() const +{ + Write(stderr); +} + diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordKey.h b/debian/htdig/htdig-3.2.0b6/htword/WordKey.h new file mode 100644 index 00000000..3890ad47 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordKey.h @@ -0,0 +1,612 @@ +// WordKey.h +// +// NAME +// inverted index key. +// +// SYNOPSIS +// +// #include <WordKey.h> +// +// #define DOCID 1 +// #define LOCATION 1 +// +// WordKey key("word <DEF> 1 2"); +// key.Set(DOCID, 100); +// key.SetWord("other"); +// +// DESCRIPTION +// +// Describes the key used to store a entry in the inverted index. +// The structure of a key is described by the <i>WordKeyInfo</i> +// Each field in the key has a bit in the <b>set</b> +// member that says if it is set or not. This bit allows to +// say that a particular field is <i>undefined</i> regardless of +// the actual value stored. 
The methods
+// <b>IsDefined, SetDefined</b> and <b>Undefined</b> are used to manipulate
+// the <i>defined</i> status of a field. The <b>Pack</b> and <b>Unpack</b>
+// methods are used to convert to and from the disk storage representation
+// of the key.
+//
+// Generic functions to manipulate the key should use the <i>WordKeyInfo</i>
+// information to work regardless of the actual structure of the key.
+//
+// Suffix definition: a word suffix is a kind of marker that says if
+// the word is a full word or only the beginning of a
+// word. If a word has a suffix then it's a full word. If it
+// has no suffix then it's only the beginning of a word.
+// This is mostly useful when specifying search keys. If a
+// search key word has no suffix, the search mechanism is
+// expected to return all words that begin with the word. If
+// the search key word has a suffix, only words that exactly
+// match the search key word will be returned.
+//
+// ASCII FORMAT
+//
+// The ASCII description is a string with fields separated by tabs or
+// white space.
+// <pre>
+// Example: Foo <DEF> 0 1 4 2
+// Field 1: The word as a string or <UNDEF> if not defined
+// Field 2: <DEF> if suffix defined, <UNDEF> if suffix undefined
+// Field 3 to nfield + 1: numerical value of the field or <UNDEF> if
+// not defined
+//
+// </pre>
+//
+// END
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1999-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+//
+
+#ifndef _WordKey_h_
+#define _WordKey_h_
+
+#ifndef SWIG
+#include "db.h"
+#include "htString.h"
+#include "StringList.h"
+#endif /* SWIG */
+
+//
+// WORDSUFFIX:
+//
+// field in set flag that says if a word is just a prefix (incomplete word)
+// WORD_KEY_WORDSUFFIX_DEFINED -> means that word IS complete (not a prefix)
+//
+#define WORD_KEY_WORDSUFFIX_DEFINED (1 << 30)
+#define WORD_KEY_WORD_DEFINED 1
+#define WORD_KEY_WORDFULLY_DEFINED ( WORD_KEY_WORDSUFFIX_DEFINED | WORD_KEY_WORD_DEFINED )
+
+//
+// Possible return values of Outbound/Overflow/Underflow methods
+//
+#define WORD_INBOUND 0
+#define WORD_OVERFLOW 1
+#define WORD_UNDERFLOW 2
+
+//
+// Possible return values of SetToFollowing
+//
+#define WORD_FOLLOWING_ATEND 0x0001
+//
+// Default value for position argument of SetToFollowing
+// meaning NFields() - 1
+//
+#define WORD_FOLLOWING_MAX -1
+
+//
+// Position of the first numerical field (just after the word)
+//
+#define WORD_FIRSTFIELD 1
+
+//
+// Unknown field position
+//
+#define WORD_KEY_UNKNOWN_POSITION -1
+
+#ifndef SWIG
+// C comparison function interface for Berkeley DB (bt_compare)
+//
+int word_db_cmp(const DBT *a, const DBT *b);
+int word_only_db_cmp(const DBT *a, const DBT *b);
+#endif /* SWIG */
+
+#ifndef SWIG
+#include"WordKeyInfo.h"
+#endif /* SWIG */
+
+//
+// Describe a word occurrence
+//
+// !!!!!!!DEBUGTMP
+#ifndef SWIG
+// Abort helper: flush stdio then crash via a deliberate NULL write so a
+// debugger/core dump points at the failure site.
+#define WORD_FATAL_ABORT fflush(stdout);fprintf(stderr,"FATAL ERROR at file:%s line:%d !!!\n",__FILE__,__LINE__);fflush(stderr);(*(int *)NULL)=1
+#define word_errr(s) {fprintf(stderr,"FATAL ERROR:%s\n",s);WORD_FATAL_ABORT;}
+#endif /* SWIG */
+class WordKey
+{
+ public:
+  //
+  // Constructors, destructors, copy and clear
+  //
+  //-
+  // Constructor. Build an empty key.
+  //
+  WordKey() { Initialize(); }
+#ifndef SWIG
+  //-
+  // Constructor. Initialize from an ASCII description of a key.
+  // See <i>ASCII FORMAT</i> section.
+  //
+  WordKey(const String& word)
+  {
+    Initialize();
+    Set(word);
+  }
+  //
+  // Copy constructor (needed because of the array pointer)
+  //
+  WordKey(const WordKey &other)
+  {
+    Initialize();
+    CopyFrom(other);
+  }
+#endif /* SWIG */
+  ~WordKey()
+  {
+    delete [] numerical_fields;
+  }
+#ifndef SWIG
+ protected:
+  //
+  // Constructor helper, allocate members and set to empty key.
+  // Aborts the process if the WordKeyInfo singleton was not created yet.
+  //
+  void Initialize()
+  {
+    if(!Info())
+    {
+      fprintf(stderr, "WordKey::WordKey used before word_key_info set\n");
+      word_errr("WordKey::initialize");
+    }
+
+    // One slot per numerical field; field 0 is the word (stored in kword).
+    numerical_fields = new WordKeyNum[NFields()-1];
+    Clear();
+  }
+ public:
+  //
+  // Copy operator (needed because of the array pointer)
+  //
+  void operator =(const WordKey &other)
+  {
+    Clear();
+    CopyFrom(other);
+  }
+#endif /* SWIG */
+  //-
+  // Copy <b>other</b> into object.
+  //
+  void CopyFrom(const WordKey &other)
+  {
+    if(other.IsDefined(0)) { SetWord(other.GetWord()); }
+    for(int i=1;i<NFields();i++)
+    {
+      if(other.IsDefined(i))
+      {
+	Set(i, other.Get(i));
+      }
+    }
+    setbits=other.setbits;
+  }
+  //-
+  // Reset to empty key.
+  //
+  void Clear()
+  {
+    setbits = 0;
+    kword.trunc();
+    for(int i=0;i<NFields()-1;i++)
+    {
+      numerical_fields[i] = 0;
+    }
+  }
+
+#ifndef SWIG
+  //-
+  // Convenience function to access key structure
+  // information (see <i>WordKeyInfo(3)</i>).
+  //
+  static inline const WordKeyInfo *Info() { return WordKeyInfo::Instance(); }
+#endif /* SWIG */
+  //-
+  // Convenience functions to access the total number of fields
+  // in a key (see <i>WordKeyInfo(3)</i>).
+  //
+  static inline int NFields() { return Info()->nfields; }
+  //-
+  // Convenience functions to access the
+  // maximum possible value for field at <b>position.</b>
+  // in a key (see <i>WordKeyInfo(3)</i>).
+  //
+  static inline WordKeyNum MaxValue(int position) { return Info()->sort[position].MaxValue(); }
+
+  //
+  // Accessors
+  //
+  //-
+  // Returns the word as a const.
+  //
+#ifndef SWIG
+  inline const String& GetWord() const { return kword; }
+#endif /* SWIG */
+
+  //-
+  // Returns the word.
+  //
+  inline String& GetWord() { return kword; }
+  //-
+  // Set the word. Also marks the word and its suffix as defined.
+  //
+  inline void SetWord(const String& arg) { kword = arg; setbits |= WORD_KEY_WORDFULLY_DEFINED; }
+ protected:
+  //-
+  // Set the word.
+  //
+  inline void SetWord(const char* arg, int arg_length) { kword.set(arg, arg_length); setbits |= WORD_KEY_WORDFULLY_DEFINED; }
+ public:
+  //-
+  // Change status of the word to <i>undefined.</i> Also undefines
+  // its suffix.
+  //
+  inline void UndefinedWord() { kword.trunc(); setbits &= ~WORD_KEY_WORDFULLY_DEFINED; }
+  //-
+  // Set the status of the word suffix to <i>undefined.</i>
+  //
+  inline void UndefinedWordSuffix() {setbits &= ~WORD_KEY_WORDSUFFIX_DEFINED;}
+  //-
+  // Set the status of the word suffix to <i>defined.</i>
+  //
+  inline void SetDefinedWordSuffix() {setbits |= WORD_KEY_WORDSUFFIX_DEFINED;}
+  //-
+  // Returns true if word suffix is <i>defined</i>, false otherwise.
+  //
+  inline int IsDefinedWordSuffix() const {return( (setbits & WORD_KEY_WORDSUFFIX_DEFINED) == WORD_KEY_WORDSUFFIX_DEFINED);}
+  //
+  // Get/Set numerical fields
+  //
+  //-
+  // Return value of numerical field at <b>position</b> as const.
+  // NOTE(review): no bounds check is performed (see commented-out check);
+  // position must be in [1, NFields()).
+  //
+  inline WordKeyNum Get(int position) const
+  {
+    // if(position<1 || position>=NFields()){errr("Get: out of bounds");}
+    return(numerical_fields[position-1]);
+  }
+#ifndef SWIG
+  //-
+  // Return value of numerical field at <b>position.</b>
+  //
+  inline WordKeyNum& Get(int position)
+  {
+    return(numerical_fields[position-1]);
+  }
+  //-
+  // Return value of numerical field at <b>position</b> as const.
+  //
+  inline const WordKeyNum & operator[] (int position) const { return(numerical_fields[position-1]); }
+  //-
+  // Return value of numerical field at <b>position.</b>
+  //
+  inline WordKeyNum & operator[] (int position) { return(numerical_fields[position-1]); }
+#endif /* SWIG */
+  //-
+  // Set value of numerical field at <b>position</b> to <b>val.</b>
+  // Also marks the field as defined.
+  //
+  inline void Set(int position, WordKeyNum val)
+  {
+    // if(position<1 || position>=NFields()){errr("Set: out of bounds");}
+    SetDefined(position);
+    numerical_fields[position-1] = val;
+  }
+
+  //
+  // Key field value existenz. Defined means the value of the field contains
+  // a valid value. Undefined means the value of the field is not valid.
+  //
+  //-
+  // Returns true if field at <b>position</b> is <i>defined</i>, false
+  // otherwise.
+  //
+  int IsDefined(int position) const { return setbits & (1 << position); }
+  //-
+  // Value in field <b>position</b> becomes <i>defined.</i>
+  //
+  void SetDefined(int position) { setbits |= (1 << position); }
+  //-
+  // Value in field <b>position</b> becomes <i>undefined.</i>
+  //
+  void Undefined(int position) { setbits &= ~(1 << position); }
+
+#ifndef SWIG
+  //
+  // Set and Get the whole structure from/to ASCII description
+  //-
+  // Set the whole structure from ASCII string in <b>bufferin.</b>
+  // See <i>ASCII FORMAT</i> section.
+  // Return OK if successfull, NOTOK otherwise.
+  //
+  int Set(const String& bufferin);
+  int SetList(StringList& fields);
+  //-
+  // Convert the whole structure to an ASCII string description
+  // in <b>bufferout.</b>
+  // See <i>ASCII FORMAT</i> section.
+  // Return OK if successfull, NOTOK otherwise.
+  //
+  int Get(String& bufferout) const;
+  //-
+  // Convert the whole structure to an ASCII string description
+  // and return it.
+  // See <i>ASCII FORMAT</i> section.
+  //
+  String Get() const;
+#endif /* SWIG */
+
+  //
+  // Storage format conversion
+  //
+#ifndef SWIG
+  //-
+  // Set structure from disk storage format as found in
+  // <b>string</b> buffer or length <b>length.</b>
+  // Return OK if successfull, NOTOK otherwise.
+  //
+  int Unpack(const char* string, int length);
+  //
+  //-
+  // Set structure from disk storage format as found in
+  // <b>data</b> string.
+  // Return OK if successfull, NOTOK otherwise.
+  //
+  inline int Unpack(const String& data) { return(Unpack(data,data.length())); }
+  //
+  //-
+  // Convert object into disk storage format as found in
+  // and place the result in <b>data</b> string.
+  // Return OK if successfull, NOTOK otherwise.
+  //
+  int Pack(String& data) const;
+#endif /* SWIG */
+
+  //
+  // Transformations
+  //
+  //-
+  // Copy each <i>defined</i> field from other into the object, if
+  // the corresponding field of the object is not defined.
+  // Return OK if successfull, NOTOK otherwise.
+  //
+  int Merge(const WordKey& other);
+  //-
+  // Undefine all fields found after the first undefined field. The
+  // resulting key has a set of defined fields followed by undefined fields.
+  // Returns NOTOK if the word is not defined because the resulting key would
+  // be empty and this is considered an error. Returns OK on success.
+  //
+  int PrefixOnly();
+#ifndef SWIG
+  //-
+  // Implement ++ on a key.
+  //
+  // It behaves like arithmetic but follows these rules:
+  // <pre>
+  // . Increment starts at field <position>
+  // . If a field value overflows, increment field <b>position</b> - 1
+  // . Undefined fields are ignored and their value untouched
+  // . Incrementing the word field is done by appending \001
+  // . When a field is incremented all fields to the left are set to 0
+  // </pre>
+  // If position is not specified it is equivalent to NFields() - 1.
+  // It returns OK if successfull, NOTOK if <b>position</b> out of range or
+  // WORD_FOLLOWING_ATEND if the maximum possible value was reached.
+  //
+  int SetToFollowing(int position = WORD_FOLLOWING_MAX);
+#endif /* SWIG */
+
+  //
+  // Predicates
+  //
+  //-
+  // Return true if all the fields are <i>defined</i>, false otherwise.
+  //
+  int Filled() const { return setbits == (unsigned int) (((1 << NFields()) - 1) | WORD_KEY_WORDSUFFIX_DEFINED); }
+  //-
+  // Return true if no fields are <i>defined</i>, false otherwise.
+  //
+  int Empty() const { return setbits == 0; }
+  //-
+  // Return true if the object and <b>other</b> are equal.
+  // Only fields defined in both keys are compared.
+  //
+  int Equal(const WordKey& other) const;
+  //-
+  // Return true if the object and <b>other</b> are equal.
+  // All fields are compared. If a field is defined in <b>object</b>
+  // and not defined in the object, the key are not considered
+  // equal.
+  //
+  int ExactEqual(const WordKey& other) const {return(Equal(other) && other.setbits == setbits);}
+#ifndef SWIG
+  //-
+  // Return true if the object and <b>other</b> are equal.
+  // The packed string are compared. An <i>undefined</i> numerical field
+  // will be 0 and therefore undistinguishable from a <i>defined</i> field
+  // whose value is 0.
+  //
+  int PackEqual(const WordKey& other) const;
+  //-
+  // Return true if adding <b>increment</b> in field at <b>position</b> makes
+  // it overflow or underflow, false if it fits.
+  //
+  int Outbound(int position, int increment) {
+    if(increment < 0) return Underflow(position, increment);
+    else if(increment > 0) return Overflow(position, increment);
+    else return WORD_INBOUND;
+  }
+  //-
+  // Return true if adding positive <b>increment</b> to field at
+  // <b>position</b> makes it overflow, false if it fits.
+  //
+  int Overflow(int position, int increment) {
+    return MaxValue(position) - Get(position) < (WordKeyNum)increment ? WORD_OVERFLOW : WORD_INBOUND;
+  }
+  //-
+  // Return true if subtracting positive <b>increment</b> to field
+  // at <b>position</b> makes it underflow, false if it fits.
+  //
+  int Underflow(int position, int increment) {
+    return Get(position) < (WordKeyNum)(-increment) ? WORD_UNDERFLOW : WORD_INBOUND;
+  }
+#endif /* SWIG */
+  //-
+  // Return OK if the key may be used as a prefix for search.
+  // In other words return OK if the fields set in the key
+  // are all contiguous, starting from the first field.
+  // Otherwise returns NOTOK
+  //
+  int Prefix() const;
+
+#ifndef SWIG
+  //-
+  // Compare <b>a</b> and <b>b</b> in the Berkeley DB fashion.
+  // <b>a</b> and <b>b</b> are packed keys. The semantics of the
+  // returned int is as of strcmp and is driven by the key description
+  // found in <i>WordKeyInfo.</i>
+  //
+  static int Compare(const String& a, const String& b);
+  static int Compare_WordOnly(const String& a, const String& b);
+  //-
+  // Compare <b>a</b> and <b>b</b> in the Berkeley DB fashion.
+  // <b>a</b> and <b>b</b> are packed keys. The semantics of the
+  // returned int is as of strcmp and is driven by the key description
+  // found in <i>WordKeyInfo.</i>
+  //
+  static int Compare(const char *a, int a_length, const char *b, int b_length);
+  static int Compare_WordOnly(const char *a, int a_length, const char *b, int b_length);
+  //-
+  // Compare object defined fields with <b>other</b> key defined fields only,
+  // ignore fields that are not defined in object or <b>other.</b>
+  // Return 1 if different 0 if equal.
+  // If different, <b>position</b> is set to the field number that differ,
+  // <b>lower</b> is set to 1 if Get(<b>position</b>) is lower than
+  // other.Get(<b>position</b>) otherwise lower is set to 0.
+  //
+  int Diff(const WordKey& other, int& position, int& lower);
+
+  //-
+  // Print object in ASCII form on <b>f</b> (uses <i>Get</i> method).
+  // See <i>ASCII FORMAT</i> section.
+  //
+  int Write(FILE* f) const;
+#endif /* SWIG */
+  //-
+  // Print object in ASCII form on <b>stdout</b> (uses <i>Get</i> method).
+  // See <i>ASCII FORMAT</i> section.
+  //
+  void Print() const;
+
+#ifndef SWIG
+
+private:
+
+  //
+  // Convert a single number from and to disk storage representation
+  //
+  static int UnpackNumber(const unsigned char* from, const int from_size, WordKeyNum &res, const int lowbits, const int bits);
+  static int PackNumber(WordKeyNum from, char* to, int to_size, int lowbits, int lastbits);
+
+  //
+  // Data members
+  //
+  //
+  // Bit field for defined/undefined status of each key field
+  //
+  unsigned int setbits;
+  //
+  // Holds the numerical values of the key fields
+  //
+  WordKeyNum *numerical_fields;
+  //
+  // Holds the word key field
+  //
+  String kword;
+#endif /* SWIG */
+};
+
+#ifndef SWIG
+//
+// WORD_BIT_MASK(b): mask of the <b> low bits of a byte, for 0 < b < 8.
+// The special case b == 0 yields 0xff (all 8 bits set).
+// NOTE(review): the original comment ("Set bit number <b> to 0 and others
+// to 1") did not describe what the macro actually computes.
+//
+#define WORD_BIT_MASK(b) ((b) == 0 ? 0xff : ((( 1 << (b)) - 1) & 0xff))
+#define WORD_BIT_MASK2(b) ((1<<(b)) -1)
+//
+// Decode integer found in <from> using <from_size> bytes. The integer starts at <lowbits> bit
+// in the first byte and occupies a total of <bits> bits. The resulting integer is stored in <to>.
+//
+inline int WordKey::UnpackNumber(const unsigned char* from, const int from_size, WordKeyNum& to, const int lowbits, const int bits)
+{
+  // NOTE(review): the first assignment is redundant, it is immediately
+  // overwritten by the next statement.
+  to = 0;
+  to = ((from[0] & 0xff) >> lowbits);
+
+  if(lowbits) to &= WORD_BIT_MASK(8 - lowbits);
+
+  if(from_size == 1)
+    to &= WORD_BIT_MASK(bits);
+  else {
+    // Accumulate the remaining bytes, least significant first.
+    for(int i = 1; i < from_size; i++) {
+      to |= (from[i] & 0xff) << ((i - 1) * 8 + (8 - lowbits));
+    }
+  }
+
+  // Discard bits beyond the field width (skip when the field spans the
+  // whole WordKeyNum, where the shift would be undefined).
+  if(bits < (int)(sizeof(WordKeyNum) * 8))
+    to &= ( 1 << bits ) - 1;
+
+  return OK;
+}
+
+//
+// Encode integer <from>, starting at bit <lowbits> in byte array <to>. It will span
+// <to_size> bytes and only the <lastbits> bits of the last byte (to[to_size - 1]) are
+// filled. See word_builder.pl for more information.
+//
+inline int WordKey::PackNumber(WordKeyNum from, char* to, int to_size, int lowbits, int lastbits)
+{
+  // first byte
+  // NOTE(review): when lowbits is set, to[0] is OR-ed into — the caller is
+  // expected to have zeroed (or pre-filled) the destination buffer.
+  if(lowbits) {
+    to[0] |= ((from & WORD_BIT_MASK(8 - lowbits)) << lowbits) & 0xff;
+  } else {
+    to[0] = from & 0xff;
+  }
+  from >>= 8 - lowbits;
+
+  // following bytes
+  for(int i = 1; i < to_size; i++) {
+    to[i] = from & 0xff;
+    from >>= 8;
+  }
+
+  // clip the end off (clobbers anything left at the end of this byte)
+  if(lastbits) to[to_size - 1] &= WORD_BIT_MASK(lastbits);
+
+  return OK;
+}
+
+#undef WORD_BIT_MASK
+#endif /* SWIG */
+
+#endif
diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordKeyInfo.cc b/debian/htdig/htdig-3.2.0b6/htword/WordKeyInfo.cc
new file mode 100644
index 00000000..5a7adffc
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htword/WordKeyInfo.cc
@@ -0,0 +1,225 @@
+// WordKeyInfo.cc
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1999-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+//
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include <stdlib.h>
+#include <errno.h>
+
+#include "WordKeyInfo.h"
+#include "StringList.h"
+
+#define WORDKEYFIELD_BITS_MAX 64
+
+//
+// WordKeyField implementation
+//
+// Describe a numerical field: record its name and bit width, and derive
+// its bit/byte offsets from the <previous> field (0 offsets when it is
+// the first numerical field). Returns 0 on success, EINVAL when the
+// computed offset is out of bounds.
+//
+int WordKeyField::SetNum(WordKeyField *previous, char *nname, int nbits)
+{
+  type = WORD_ISA_NUMBER;
+  name.set(nname, strlen(nname));
+
+  bits = nbits;
+  bits_offset = (previous ? previous->bits_offset + previous->bits : 0 );
+
+  if(bits_offset < 0 ||
+     bits_offset > WORDKEYFIELD_BITS_MAX*WORD_KEY_MAX_NFIELDS) {
+    fprintf(stderr, "WordKeyField::WordKeyField: bits_offset: %d out of bounds\n", bits_offset);
+    return EINVAL;
+  }
+  // Precompute the byte-level layout used by Pack/Unpack.
+  bytes_offset = bits_offset / 8;
+  bytesize = ((bits_offset + bits - 1) / 8) - bytes_offset + 1;
+  lastbits = (bits_offset + bits) % 8;
+  lowbits = bits_offset % 8;
+
+  return 0;
+}
+
+//
+// Describe the one and only string field (the word itself).
+//
+int WordKeyField::SetString()
+{
+  name.set("Word");
+  type = WORD_ISA_STRING;
+  return 0;
+}
+
+//
+// Tabulate for printing: emit <n> columns, marking every 4th column with
+// a letter ('a', 'b', ...) and filling the rest with <c>.
+//
+static void nprint(char c, int n)
+{
+  for(int i = 0; i < n; i++) {
+    if(!(i % 4)) {
+      printf("%c", 'a' + i / 4);
+    } else {
+      printf("%c", c);
+    }
+  }
+}
+
+//
+// Print object on standard output
+//
+void
+WordKeyField::Show()
+{
+  if(!name.nocase_compare("Word")) {
+    printf("Word type: %2d\n", type);
+  } else {
+    nprint(' ',bits_offset);
+    printf("\"%s\" type:%2d lowbits:%2d lastbits:%2d\n",
+	   (char *)name,
+	   type,
+	   lowbits,
+	   lastbits);
+    nprint(' ',bits_offset);
+    printf("|---bytesize:%2d bytes_offset:%2d bits:%2d bits_offset:%2d\n", bytesize, bytes_offset, bits, bits_offset);
+  }
+}
+
+//
+// WordKeyInfo implementation
+//
+
+WordKeyInfo* WordKeyInfo::instance = 0;
+
+//
+// Build the key description from the wordlist_wordkey_description
+// configuration entry (see WordKeyInfo.h for the syntax).
+//
+WordKeyInfo::WordKeyInfo(const Configuration& config)
+{
+  sort = NULL;
+  nfields = -1;
+  num_length = 0;
+
+  const String &keydesc = config["wordlist_wordkey_description"];
+
+  if(!keydesc.empty()) {
+    Set(keydesc);
+  } else {
+    fprintf(stderr, "WordKeyInfo::WordKeyInfo: didn't find key description in config\n");
+  }
+}
+
+//
+// (Re)create the unique instance from a configuration.
+//
+void
+WordKeyInfo::Initialize(const Configuration &config_arg)
+{
+  if(instance != 0)
+    delete instance;
+  instance = new WordKeyInfo(config_arg);
+}
+
+//
+// Convenience wrapper: build a temporary Configuration holding <desc>
+// and initialize the unique instance from it.
+//
+void
+WordKeyInfo::InitializeFromString(const String &desc)
+{
+  Configuration config;
+  config.Add("wordlist_wordkey_description", desc);
+  Initialize(config);
+}
+
+//
+// Allocate the field description array for <nnfields> fields.
+// Returns 0 on success, ENOMEM on allocation failure.
+//
+int
+WordKeyInfo::Alloc(int nnfields)
+{
+  nfields = nnfields;
+  if(!(sort = new WordKeyField[nfields])) {
+    fprintf(stderr, "WordKeyInfo::Alloc: cannot allocate\n");
+    return ENOMEM;
+  }
+  num_length = 0;
+  return 0;
+}
+
+//
+// Parse a key description of the form Word/<name bits>[/...] and fill
+// the sort array. Returns 0 on success, EINVAL on a malformed
+// description.
+//
+int
+WordKeyInfo::Set(const String &desc)
+{
+  int ret = 0;
+  StringList fields(desc, "/");
+
+  if(fields.Count() > WORD_KEY_MAX_NFIELDS) {
+    fprintf(stderr, "WordKeyInfo::Set: too many fields in %s, max is %d\n", (const char*)desc, WORD_KEY_MAX_NFIELDS);
+    return EINVAL;
+  }
+
+  if(fields.Count() <= 0) {
+    fprintf(stderr, "WordKeyInfo::Set: no fields\n");
+    return EINVAL;
+  }
+
+  if((ret = Alloc(fields.Count())))
+    return ret;
+
+  WordKeyField* previous = 0;
+  int i;
+  for(i = 0; i < fields.Count(); i++) {
+    char* field = fields[i];
+    WordKeyField& key_field = sort[i];
+    if(!mystrcasecmp(field, "word")) {
+      //
+      // String field
+      //
+      if(i != 0) {
+	fprintf(stderr, "WordKeyInfo::Set: Word field must show in first position %s\n", (const char*)desc);
+	return EINVAL;
+      }
+      key_field.SetString();
+    } else {
+      //
+      // Numerical field
+      //
+      StringList pair(field, "\t ");
+
+      if(pair.Count() != 2) {
+	fprintf(stderr, "WordKeyInfo::AddField: there must be exactly two strings separated by a white space (space or tab) in a field description (%s in key description %s)\n", field, (const char*)desc);
+	return EINVAL;
+      }
+
+      int bits = atoi(pair[1]);
+      char* name = pair[0];
+      key_field.SetNum(previous, name, bits);
+      previous = &key_field;
+    }
+  }
+
+  //
+  // Total length in bytes of the numerical fields
+  // NOTE(review): computed from the last parsed field; assumes the last
+  // field is numerical (the Word field has zero bytes_offset/bytesize).
+  //
+  num_length = sort[i - 1].bytes_offset + sort[i - 1].bytesize;
+
+  return ret;
+}
+
+//
+// Debugging helper: print the whole layout and a per-bit ownership map
+// on stderr, flagging overlapping bit ranges with 'X'.
+//
+void
+WordKeyInfo::Show()
+{
+  fprintf(stderr, "-----------------------------------------\n");
+  fprintf(stderr, "nfields:%3d num_length:%3d\n", nfields, num_length);
+  int i;
+  for(i = 0; i < nfields; i++)
+    sort[i].Show();
+
+  char str[WORDKEYFIELD_BITS_MAX*WORD_KEY_MAX_NFIELDS];
+  memset(str, '_', WORDKEYFIELD_BITS_MAX*WORD_KEY_MAX_NFIELDS);
+
+  int last = 0;
+  int j;
+  for(j = 0; j < nfields; j++) {
+    for(i = 0; i < sort[j].bits; i++) {
+      char c = (j % 10) + '0';
+      int pos = sort[j].bits_offset + i;
+      if(str[pos] != '_') {
+	fprintf(stderr, "WordKeyInfo::Show: overlaping bits (field %d), bit %d\n", j, i);
+	c='X';
+      }
+      str[pos] = c;
+      if(last < pos) last = pos;
+    }
+  }
+  str[last + 1] = '\0';
+  fprintf(stderr, "%s (bits)\n",str);
+  fprintf(stderr, "^0      ^1      ^2      ^3      ^4      ^5      ^6      ^7\n");
+  fprintf(stderr, "0123456701234567012345670123456701234567012345670123456701234567\n");
+}
diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordKeyInfo.h b/debian/htdig/htdig-3.2.0b6/htword/WordKeyInfo.h
new file mode 100644
index 00000000..039dbf4f
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htword/WordKeyInfo.h
@@ -0,0 +1,170 @@
+// WordKeyInfo.h
+//
+// NAME
+// information on the key structure of the inverted index.
+//
+// SYNOPSIS
+//
+// Use the WordKey::NField() method instead.
+//
+// DESCRIPTION
+//
+// Describe the structure of the index key (<i>WordKey</i>).
+// The description includes the layout of the packed version
+// stored on disk.
+//
+// CONFIGURATION
+//
+// wordlist_wordkey_description <desc> (no default)
+//   Describe the structure of the inverted index key.
+//   In the following explanation of the <i><desc></i> format
+//   mandatory words are
+//   in bold and values that must be replaced in italic.
+//   <br>
+//   <b>Word</b>/<i>name bits</i>[/...]
+//   <br>
+//   The <i>name</i> is an alphanumerical symbolic name for the key field.
+//   The <i>bits</i> is the number of bits required to store this field.
+//   Note that all values are stored in unsigned integers (unsigned int).
+//
+//
+// END
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1999-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+//
+
+#ifndef _WordKeyInfo_h_
+#define _WordKeyInfo_h_
+
+#include "Configuration.h"
+
+//
+// Type number associated to each possible type for a key element
+// (type field of struct WordKeyInfo).
+//
+#define WORD_ISA_NUMBER 1
+#define WORD_ISA_STRING 2
+
+//
+// Maximum number of fields in a key description
+//
+#define WORD_KEY_MAX_NFIELDS 20
+
+//
+// All numerical fields of the key are typed WordKeyNum.
+// Most of the code strongly assume that it is unsigned.
+// Mainly provided to be replaced by unsigned longlong WordKeyNum
+// for 64 bits machines.
+//
+typedef unsigned int WordKeyNum;
+
+//
+// Maximum number of bits in a field
+//
+#define WORD_KEY_MAXBITS ((int)(sizeof(WordKeyNum) * 8))
+#define WORD_KEY_MAXVALUE ((WordKeyNum)~(WordKeyNum)0)
+
+//
+// Description of a single field
+//
+class WordKeyField
+{
+ public:
+  WordKeyField() {
+    type = lowbits = lastbits = bytesize = bytes_offset = bits = bits_offset = 0;
+  }
+
+  //
+  // Precompute information that will be needed to pack/unpack the key
+  // to/from disk.
+  //
+  // The <previous> field is used to compute the position of the field
+  // in packed string. <nname> is the symbolic name of the field
+  // <nbits> is the number of bits actualy used in a number.
+  //
+  int SetNum(WordKeyField *previous, char *nname, int nbits);
+  //
+  // Set the one and only string field
+  //
+  int SetString();
+
+  //
+  // Maximum possible value for this field.
+  // The guard avoids the undefined full-width shift when bits covers
+  // the whole WordKeyNum.
+  //
+  WordKeyNum MaxValue() const {
+    return bits >= WORD_KEY_MAXBITS ? WORD_KEY_MAXVALUE : ((1 << bits) - 1);
+  }
+
+  //
+  // Debugging and printing
+  //
+  void Show();
+
+  String name;			// Symbolic name of the field
+  int type;			// WORD_ISA_{STRING|NUMBER}
+  //
+  // 01234567012345670123456701234567
+  // +-------+-------+-------+-------+--
+  //    100101010011100111101011110
+  //    ^^^                   ^^^^^^
+  //    |                     |
+  //    lowbits = 3           lastbits = 6
+  //
+  int lowbits;
+  int lastbits;
+  int bytesize;			// Number of bytes involved
+  int bytes_offset;		// Offset of first byte from start
+  int bits;			// Size of field in bits
+  int bits_offset;		// Offset of first bit from start
+};
+
+//
+// Description of the key structure
+//
+class WordKeyInfo
+{
+ public:
+  WordKeyInfo(const Configuration& config);
+  ~WordKeyInfo() { if(sort) delete [] sort; }
+
+  //
+  // Unique instance handlers
+  //
+  static void Initialize(const Configuration& config);
+  static void InitializeFromString(const String &desc);
+  // Returns the singleton, or 0 (with a message on stderr) if
+  // Initialize() was never called.
+  static WordKeyInfo* Instance() {
+    if(instance) return instance;
+    fprintf(stderr, "WordKeyInfo::Instance: no instance\n");
+    return 0;
+  }
+
+  int Alloc(int nnfields);
+  int Set(const String &desc);
+
+  void Show();
+
+  //
+  // Array describing the fields, in sort order.
+  //
+  WordKeyField *sort;
+  //
+  // Total number of fields
+  //
+  int nfields;
+  //
+  // Total number of bytes used by numerical fields
+  //
+  int num_length;
+
+  //
+  // Unique instance pointer
+  //
+  static WordKeyInfo* instance;
+};
+
+#endif
diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordList.cc b/debian/htdig/htdig-3.2.0b6/htword/WordList.cc
new file mode 100644
index 00000000..566acb93
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htword/WordList.cc
@@ -0,0 +1,436 @@
+//
+// WordList.cc
+//
+// WordList: Interface to the word database. Previously, this wrote to
+//	     a temporary text file. Now it writes directly to the
+//	     word database.
+//	     NOTE: Some code previously attempted to directly read from
+//	     the word db. This will no longer work, so it's preferred to
+//	     use the access methods here.
+// Configuration parameter used:
+//   wordlist_extend
+//   wordlist_verbose 1 walk logic
+//   wordlist_verbose 2 walk logic details
+//   wordlist_verbose 3 walk logic lots of details
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1999-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: WordList.cc,v 1.13 2004/05/28 13:15:27 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "WordList.h"
+#include "WordReference.h"
+#include "WordRecord.h"
+#include "WordType.h"
+#include "WordStat.h"
+#include "Configuration.h"
+#include "htString.h"
+#include "HtPack.h"
+#include "HtTime.h"
+#include "WordDBCompress.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <errno.h>
+
+// *****************************************************************************
+//
+// Constructor: record configuration settings; the database itself is
+// only opened by Open().
+//
+WordList::WordList(const Configuration& config_arg) :
+  wtype(config_arg),
+  config(config_arg)
+{
+  // The database itself hasn't been opened yet
+  isopen = 0;
+  isread = 0;
+  extended = config.Boolean("wordlist_extend");
+  verbose =  config.Value("wordlist_verbose");
+  compressor = 0;
+}
+
+// *****************************************************************************
+//
+WordList::~WordList()
+{
+  Close();
+}
+
+// *****************************************************************************
+//
+// Open the underlying Berkeley DB btree. <mode> uses open(2)-style
+// O_RDWR/O_RDONLY/O_TRUNC flags. Returns OK or NOTOK.
+//
+int WordList::Open(const String& filename, int mode, int word_only)
+{
+  int usecompress=0;
+
+  // If word_only, entries compare equal if the "word" part matches.
+  // This should only be used for querying the database, not writing it.
+  // It is needed by speling to test for the existence of words.
+  db.set_bt_compare(word_only ? word_only_db_cmp : word_db_cmp);
+
+  if(config.Value("wordlist_page_size", 0))
+    db.set_pagesize(config.Value("wordlist_page_size"));
+
+  if(config.Boolean("wordlist_compress") == 1) {
+    usecompress = DB_COMPRESS;
+    // NOTE(review): this local declaration shadows the member of the same
+    // name; the member is updated through SetCompressor() below.
+    WordDBCompress* compressor = new WordDBCompress(
+        config.Boolean("wordlist_compress_zlib",0), config.Value("compression_level",0));
+
+    //      compressor->debug = config.Value("wordlist_compress_debug");
+    SetCompressor(compressor);
+    db.CmprInfo(compressor->CmprInfo());
+  }
+
+  int flags = (mode & O_RDWR) ? DB_CREATE : DB_RDONLY;
+  if(mode & O_TRUNC) {
+    if(flags == DB_CREATE)
+      flags |= DB_TRUNCATE;
+    else
+      fprintf(stderr, "WordList::Open: O_TRUNC | O_RDONLY is meaningless\n");
+  }
+  flags |= usecompress;
+
+  int ret = db.Open(filename, DB_BTREE, flags, 0666) == 0 ? OK : NOTOK;
+
+  // NOTE(review): on POSIX systems O_RDONLY is 0, so this bitwise test can
+  // never set isread — verify the intended read-only detection.
+  isread = mode & O_RDONLY;
+  isopen = 1;
+
+  return ret;
+}
+
+// *****************************************************************************
+//
+// Close the database (if open) and release the compressor installed by
+// Open(). Returns OK or NOTOK.
+//
+int WordList::Close()
+{
+  if(isopen) {
+    if(db.Close() != 0) return NOTOK;
+    isopen = 0;
+    isread = 0;
+  }
+
+  {
+    WordDBCompress* compressor = GetCompressor();
+    if(compressor) {
+      delete compressor;
+      SetCompressor(0);
+    }
+  }
+
+  return OK;
+}
+
+// ****************************************************************************
+//
+// Insert a word occurrence. The word is normalized first; the key must
+// be fully defined. Returns OK or NOTOK.
+//
+int WordList::Put(const WordReference& arg, int flags)
+{
+  if (arg.Key().GetWord().length() == 0) {
+    fprintf(stderr, "WordList::Put(%s) word is zero length\n", (char*)arg.Get());
+    return NOTOK;
+  }
+  if (!arg.Key().Filled()) {
+    fprintf(stderr, "WordList::Put(%s) key is not fully defined\n", (char*)arg.Get());
+    return NOTOK;
+  }
+
+  WordReference wordRef(arg);
+  String word = wordRef.Key().GetWord();
+  if(wtype.Normalize(word) & WORD_NORMALIZE_NOTOK)
+    return NOTOK;
+  wordRef.Key().SetWord(word);
+
+  //
+  // The two case could be grouped in a more compact way.
+  // However, the resources consumption difference between
+  // a Put(DB_NOOVERWRITE) and Put(0) is huge (the first is 75%
+  // slower than the second). Check the db_put sources for the
+  // explanation.
+  //
+  int ret = NOTOK;
+  if(flags) {
+    //
+    // First attempt tells us if the key exists. If it
+    // does not we just increment the reference count.
+    // Otherwise, and only if flags does not contain DB_NOOVERWRITE,
+    // we override the key/record pair.
+    //
+    // NOTE(review): inside this branch flags is necessarily non-zero, so
+    // the `flags == 0` test below can never be true — an existing key is
+    // never overwritten and ret stays NOTOK. The comment above suggests
+    // the intended test was `!(flags & DB_NOOVERWRITE)`.
+    //
+    int error;
+    if((error = db.Put(wordRef, DB_NOOVERWRITE)) != 0) {
+      if(error == DB_KEYEXIST && flags == 0)
+	ret = db.Put(wordRef, 0) == 0 ? OK : NOTOK;
+    } else {
+      ret = Ref(wordRef);
+    }
+  } else {
+    if((ret = db.Put(wordRef, 0)) == 0)
+      ret = Ref(wordRef);
+  }
+
+  return ret;
+}
+
+
+// *****************************************************************************
+//
+// Return the list of occurrences exactly matching wordRef.
+//
+List *WordList::operator [] (const WordReference& wordRef)
+{
+  return Collect(wordRef);
+}
+
+// *****************************************************************************
+//
+// Return the list of occurrences whose word starts with prefix
+// (suffix requirement is dropped before searching).
+//
+List *WordList::Prefix (const WordReference& prefix)
+{
+  WordReference prefix2(prefix);
+  prefix2.Key().UndefinedWordSuffix();
+  return Collect(prefix2);
+}
+
+// *****************************************************************************
+//
+// Return every occurrence in the index (empty key matches everything).
+//
+List *WordList::WordRefs()
+{
+  return Collect(WordReference());
+}
+
+// *****************************************************************************
+//
+// Walk the index and collect every entry matching wordRef.
+// Returns 0 on walk failure; caller owns the returned List.
+//
+List *WordList::Collect(const WordReference& wordRef)
+{
+  WordCursor *search = Cursor(wordRef.Key(), HTDIG_WORDLIST_COLLECTOR);
+  if(search->Walk() != OK) return 0;
+  List* result = search->GetResults();
+  delete search;
+  return result;
+}
+
+// *****************************************************************************
+//
+// Callback data dedicated to Dump and dump_word communication
+//
+class DeleteWordData : public Object
+{
+public:
+  DeleteWordData() { count = 0; }
+
+  int count;		// number of records deleted so far
+};
+
+// *****************************************************************************
+//
+// Walk callback: delete the record under the cursor and decrement the
+// word's reference count, tallying the deletion in <data>.
+//
+static int delete_word(WordList *words, WordDBCursor &cursor, const WordReference *word, Object &data)
+{
+  if(words->Delete(cursor) == 0) {
+    words->Unref(*word);
+    ((DeleteWordData&)data).count++;
+    return OK;
+  } else {
+    fprintf(stderr, "WordList delete_word: deleting %s failed\n", (char*)word->Get());
+    return NOTOK;
+  }
+}
+
+// *****************************************************************************
+//
+// Delete all records matching wordRef, return the number of
+// deleted records.
+//
+int WordList::WalkDelete(const WordReference& wordRef)
+{
+  DeleteWordData data;
+  WordCursor *description = Cursor(wordRef.Key(), delete_word, &data);
+  description->Walk();
+  delete description;
+  return data.count;
+}
+
+// *****************************************************************************
+//
+// Return the list of distinct words in the index (one String per word).
+// Returns 0 on cursor failure; caller owns the returned List.
+//
+List *WordList::Words()
+{
+  List		*list = 0;
+  String		key;
+  String		record;
+  WordReference	lastWord;
+  WordDBCursor		cursor;
+
+  if(cursor.Open(db.db) != 0) return 0;
+
+  //
+  // Move past the first word count record
+  //
+  const WordReference& last = WordStat::Last();
+  last.Pack(key, record);
+  if(cursor.Get(key, record, DB_SET_RANGE) != 0)
+    return 0;
+  list = new List;
+  do {
+    WordReference wordRef(key, record);
+    // Entries are sorted by word, so a new distinct word is detected by
+    // comparing against the previously emitted one.
+    if(lastWord.Key().GetWord().empty() ||
+       wordRef.Key().GetWord() != lastWord.Key().GetWord())
+    {
+      list->Add(new String(wordRef.Key().GetWord()));
+      lastWord = wordRef;
+    }
+  } while (cursor.Get(key, record, DB_NEXT) == 0);
+
+  return list;
+}
+
+// *****************************************************************************
+//
+// Returns the reference count for word in <count> arg
+//
+int WordList::Noccurrence(const WordKey& key, unsigned int& noccurrence) const
+{
+  noccurrence = 0;
+  WordStat stat(key.GetWord());
+  int ret;
+  if((ret = db.Get(stat)) != 0) {
+    // A missing stat record simply means zero occurrences.
+    if(ret != DB_NOTFOUND)
+      return NOTOK;
+  } else {
+    noccurrence = stat.Noccurrence();
+  }
+
+ 
return OK;
+}
+
+// *****************************************************************************
+//
+// Increment reference count for wordRef
+//
+// No-op (OK) when per-word statistics are disabled (wordlist_extend
+// false). Returns OK if the occurrence counter was stored, NOTOK on a
+// database error.
+//
+int WordList::Ref(const WordReference& wordRef)
+{
+  if(!extended) return OK;
+
+  WordStat stat(wordRef.Key().GetWord());
+  int ret;
+  // DB_NOTFOUND is not an error here: a missing record simply means this
+  // is the first occurrence of the word, so the counter starts fresh.
+  if((ret = db.Get(stat)) != 0 && ret != DB_NOTFOUND)
+    return NOTOK;
+
+  stat.Noccurrence()++;
+
+  return db.Put(stat, 0) == 0 ? OK : NOTOK;
+}
+
+// *****************************************************************************
+//
+// Decrement reference count for wordRef
+//
+// The statistics record is deleted when the count drops to zero.
+// Unref of a word with no record, or with a zero count, is diagnosed on
+// stderr and reported as NOTOK.
+//
+int WordList::Unref(const WordReference& wordRef)
+{
+  if(!extended) return OK;
+
+  WordStat stat(wordRef.Key().GetWord());
+  int ret;
+  if((ret = db.Get(stat)) != 0) {
+    if(ret == DB_NOTFOUND)
+      fprintf(stderr, "WordList::Unref(%s) Unref on non existing word occurrence\n", (char*)wordRef.Get());
+    return NOTOK;
+  }
+
+  if(stat.Noccurrence() == 0) {
+    fprintf(stderr, "WordList::Unref(%s) Unref on 0 occurrences word\n", (char*)wordRef.Get());
+    return NOTOK;
+  }
+  stat.Noccurrence()--;
+
+  if(stat.Noccurrence() > 0) {
+    ret = db.Put(stat, 0) == 0 ? OK : NOTOK;
+  } else
+    ret = db.Del(stat) == 0 ? 
OK : NOTOK; + return ret; +} + + +// ***************************************************************************** +// +// streaming operators for ascii dumping and reading a list +class FileOutData : public Object +{ +public: + FILE* f; + FileOutData(FILE* f_arg) : f(f_arg) { } +}; + +// ***************************************************************************** +// +static int +wordlist_walk_callback_file_out(WordList *, WordDBCursor& , const WordReference *word, Object &data) +{ + fprintf(((FileOutData&)data).f, "%s\n", (char*)word->Get()); + return OK; +} + +// ***************************************************************************** +// +int +WordList::Write(FILE* f) +{ + WordKey empty; + FileOutData data(f); + WordCursor *description = Cursor(empty, wordlist_walk_callback_file_out, (Object *)&data); + description->Walk(); + delete description; + return 0; +} + +// ***************************************************************************** +// +int +WordList::Read(FILE* f) +{ + WordReference word; +#define WORD_BUFFER_SIZE 1024 + char buffer[WORD_BUFFER_SIZE + 1]; + String line; + int line_number = 0; + int inserted = 0; + + while(fgets(buffer, WORD_BUFFER_SIZE, f)) { + line_number++; + int buffer_length = strlen(buffer); + int eol = buffer[buffer_length - 1] == '\n'; + + if(eol) buffer[--buffer_length] = '\0'; + + line.append(buffer, buffer_length); + // + // Join big lines + // + if(!eol) continue; + // + // If line ends with a \ continue + // + if(line.last() == '\\') { + line.chop(1); + continue; + } + + if(!line.empty()) { + if(word.Set(line) != OK) { + fprintf(stderr, "WordList::Read: line %d : %s\n", line_number, (char*)line); + fprintf(stderr, " cannot build WordReference (ignored)\n"); + } else { + if(Insert(word) != OK) { + fprintf(stderr, "WordList::Read: line %d : %s\n", line_number, (char*)line); + fprintf(stderr, " insert failed (ignored)\n"); + } else { + inserted++; + } + if(verbose) fprintf(stderr, "WordList::Read: inserting %s\n", 
(char*)word.Get()); + } + + line.trunc(); + } + } + return inserted; +} diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordList.h b/debian/htdig/htdig-3.2.0b6/htword/WordList.h new file mode 100644 index 00000000..1aa87864 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordList.h @@ -0,0 +1,372 @@ +// +// WordList.h +// +// NAME +// +// manage and use an inverted index file. +// +// SYNOPSIS +// +// #include <mifluz.h> +// +// Configuration* config; +// WordReference wordRef; +// ... +// WordList* words = new WordList(config) +// +// delete words; +// +// DESCRIPTION +// +// WordList is the <i>mifluz</i> equivalent of a database handler. Each +// WordList object is bound to an inverted index file and implements the +// operations to create it, fill it with word occurrences and search +// for an entry matching a given criterion. +// +// CONFIGURATION +// +// wordlist_extend {true|false} (default false) +// If <b>true</b> maintain reference count of unique +// words. The <b>Noccurrence</b> method gives access to this count. +// +// wordlist_verbose <number> (default 0) +// Set the verbosity level of the WordList class. +// <br> +// 1 walk logic +// <br> +// 2 walk logic details +// <br> +// 3 walk logic lots of details +// +// wordlist_page_size <bytes> (default 8192) +// Berkeley DB page size (see Berkeley DB documentation) +// +// wordlist_cache_size <bytes> (default 500K) +// Berkeley DB cache size (see Berkeley DB documentation) +// Cache makes a huge difference in performance. It must be at least 2% +// of the expected total data size. Note that if compression is activated +// the data size is eight times larger than the actual file size. In this +// case the cache must be scaled to 2% of the data size, not 2% +// of the file size. See <b>Cache tuning</b> in the mifluz guide for +// more hints. +// +// wordlist_compress {true|false} (default false) +// Activate compression of the index. 
The resulting index is eight times +// smaller than the uncompressed index. +// +// +// END +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordList.h,v 1.10 2004/05/28 13:15:28 lha Exp $ +// + +#ifndef _WordList_h_ +#define _WordList_h_ + +#include <fcntl.h> +#include <stdio.h> + +#ifndef SWIG +#include "Dictionary.h" +#include "List.h" +#include "htString.h" +#include "WordRecord.h" +#include "WordReference.h" +#include "WordType.h" +#include "WordDB.h" +#include "WordDBCompress.h" +#include "Configuration.h" +#include "WordCursor.h" +#endif /* SWIG */ + +class List; +class WordList; +class WordDBCursor; + +// +// Inverted index interface +// +class WordList +{ +public: + //- + // Constructor. Build inverted index handling object using + // run time configuration parameters listed in the <b>CONFIGURATION</b> + // section. + // + WordList(const Configuration& config_arg); + virtual ~WordList(); + + //- + // Insert <b>wordRef</b> in index. It is an error to insert + // the same <b>wordRef</b> twice. This requires a lookup in the index + // prior to the insertion. + // Returns OK on success, NOTOK on error. + // + int Insert(const WordReference& wordRef) { return Put(wordRef, DB_NOOVERWRITE); } + //- + // Insert <b>wordRef</b> in index. If the <i>Key()</i> part of + // the <b>wordRef</b> exists in the index, override it. + // Returns OK on success, NOTOK on error. + // + int Override(const WordReference& wordRef) { return Put(wordRef, 0); } +#ifndef SWIG + int Put(const WordReference& wordRef, int flags); +#endif /* SWIG */ + + //- + // Returns OK if <b>wordRef</b> exists in the index, NOTOK otherwise. + // + int Exists(const WordReference& wordRef) { return db.Exists(wordRef) == 0 ? 
OK : NOTOK; } +#ifndef SWIG + //- + // Returns OK if <b>word</b> exists in the index, NOTOK otherwise. + // + int Exists(const String& word) { return Exists(WordReference(word)); } +#endif /* SWIG */ + + // + // Delete permanently + // + //- + // Delete all entries in the index whose key matches the + // <i>Key()</i> part of <b>wordRef</b>, using the <i>Walk</i> + // method. + // Returns the number of entries successfully deleted. + // + int WalkDelete(const WordReference& wordRef); + //- + // Delete the entry in the index that exactly matches the + // <i>Key()</i> part of <b>wordRef.</b> + // Returns OK if deletion is successfull, NOTOK otherwise. + // + int Delete(const WordReference& wordRef) { + if(db.Del(wordRef) == 0) + return Unref(wordRef); + else + return NOTOK; + } +#ifdef SWIG +%name(DeleteCursor) +#endif /* SWIG */ + //- + // Delete the inverted index entry currently pointed to by the + // <b>cursor.</b> + // Returns 0 on success, Berkeley DB error code on error. This + // is mainly useful when implementing a callback function for + // a <b>WordCursor.</b> + // + int Delete(WordDBCursor& cursor) { return cursor.Del(); } + + //- + // Open inverted index <b>filename.</b> <b>mode</b> + // may be <i>O_RDONLY</i> or <i>O_RDWR.</i> If mode is + // <i>O_RDWR</i> it can be or'ed with <i>O_TRUNC</i> to reset + // the content of an existing inverted index. + // If word_only is true, entries will compare equal if the "word" part + // of the key is equal, even if the numeric fields aren't. (What are the + // numeric fields, anyway??) + // Return OK on success, NOTOK otherwise. + // + int Open(const String& filename, int mode, int word_only=false); + //- + // Close inverted index. + // + int Close(); + + // + // These returns a list of all the WordReference * matching + // the constraint. 
+ //- + // Returns the list of word occurrences exactly matching the + // <i>Key()</i> part of <b>wordRef.</b> The <i>List</i> returned + // contains pointers to <i>WordReference</i> objects. It is + // the responsibility of the caller to free the list. See List.h + // header for usage. + // + List *Find(const WordReference& wordRef) { return (*this)[wordRef]; } + //- + // Returns the list of word occurrences exactly matching the + // <b>word.</b> The <i>List</i> returned + // contains pointers to <i>WordReference</i> objects. It is + // the responsibility of the caller to free the list. See List.h + // header for usage. + // + List *FindWord(const String& word) { return (*this)[word]; } +#ifndef SWIG + //- + // Alias to the <b>Find</b> method. + // + List *operator [] (const WordReference& wordRef); + //- + // Alias to the <b>FindWord</b> method. + // + List *operator [] (const String& word) { return (*this)[WordReference(word)]; } +#endif /* SWIG */ + //- + // Returns the list of word occurrences matching the <i>Key()</i> + // part of <b>wordRef.</b> In the <i>Key()</i>, the string + // (accessed with <i>GetWord()</i>) matches any string that begins + // with it. The <i>List</i> returned contains pointers to + // <i>WordReference</i> objects. It is the responsibility of the + // caller to free the list. + // + List *Prefix (const WordReference& prefix); +#ifndef SWIG + //- + // Returns the list of word occurrences matching the + // <b>word.</b> In the <i>Key()</i>, the string (accessed with + // <i>GetWord()</i>) matches any string that begins with it. The + // <i>List</i> returned contains pointers to <i>WordReference</i> + // objects. It is the responsibility of the caller to free the + // list. + // + List *Prefix (const String& prefix) { return this->Prefix(WordReference(prefix)); } +#endif /* SWIG */ + + // + // Iterate over the complete database. + // +#ifndef SWIG + //- + // Returns a list of all unique words contained in the inverted + // index. 
The <i>List</i> returned contains pointers to + // <i>String</i> objects. It is the responsibility of the caller + // to free the list. See List.h header for usage. + // + List *Words(); +#endif /* SWIG */ + //- + // Returns a list of all entries contained in the + // inverted index. The <i>List</i> returned contains pointers to + // <i>WordReference</i> objects. It is the responsibility of + // the caller to free the list. See List.h header for usage. + // + List *WordRefs(); + +#ifndef SWIG + //- + // Create a cursor that searches all the occurrences in the + // inverted index and call <b>ncallback</b> with + // <b>ncallback_data</b> for every match. + // + WordCursor *Cursor(wordlist_walk_callback_t callback, Object *callback_data) { return new WordCursor(this, callback, callback_data); } +#endif /* SWIG */ + //- + // Create a cursor that searches all the occurrences in the + // inverted index and that match <b>nsearchKey.</b> If + // <b>naction</b> is set to HTDIG_WORDLIST_WALKER calls + // <b>searchKey.callback</b> with <b>searchKey.callback_data</b> + // for every match. If <b>naction</b> is set to + // HTDIG_WORDLIST_COLLECT push each match in <b>searchKey.collectRes</b> + // data member as a <b>WordReference</b> object. It is the responsibility + // of the caller to free the <b>searchKey.collectRes</b> list. + // + WordCursor *Cursor(const WordKey &searchKey, int action = HTDIG_WORDLIST_WALKER) { return new WordCursor(this, searchKey, action); } +#ifndef SWIG + //- + // Create a cursor that searches all the occurrences in the + // inverted index and that match <b>nsearchKey</b> and calls + // <b>ncallback</b> with <b>ncallback_data</b> for every match. 
+ // + WordCursor *Cursor(const WordKey &searchKey, wordlist_walk_callback_t callback, Object * callback_data) { return new WordCursor(this, searchKey, callback, callback_data); } +#endif /* SWIG */ + + // + // Update/get global word statistics statistics + // + //- + // Add one to the reference count for the string contained + // in the <i>Key().GetWord()</i> part of <b>wordRef.</b> + // Returns OK on success, NOTOK otherwise. + // + int Ref(const WordReference& wordRef); + //- + // Substract one to the reference count for the string contained + // in the <i>Key().GetWord()</i> part of <b>wordRef.</b> + // Returns OK on success, NOTOK otherwise. + // + int Unref(const WordReference& wordRef); +#ifndef SWIG + //- + // Return in <b>noccurrence</b> the number of occurrences of the + // string contained in the <i>GetWord()</i> part of <b>key.</b> + // Returns OK on success, NOTOK otherwise. + // + int Noccurrence(const WordKey& key, unsigned int& noccurrence) const; + + // + // Accessors + // + // + // Get the Berkeley DB object + // + const WordType& GetWordType() const { return wtype; } +#endif /* SWIG */ + //- + // Return the <i>Configuration</i> object used to initialize + // the <i>WordList</i> object. + // + const Configuration& GetConfiguration() const { return config; } + +#ifndef SWIG + // + // Input/Output + // + //- + // Write on file descriptor <b>f</b> an ASCII description of the + // index. Each line of the file contains a <i>WordReference</i> + // ASCII description. + // Returns 0 on success, not 0 otherwise. + // + int Write(FILE* f); + // + //- + // Read <i>WordReference</i> ASCII descriptions from <b>f</b>, + // returns the number of inserted WordReference or < 0 if an error + // occurs. Invalid descriptions are ignored as well as empty + // lines. + // + int Read(FILE* f); + +#endif /* SWIG */ + // + // Retrieve WordReferences from the database. + // Backend of WordRefs, operator[], Prefix... 
+ // + List *Collect(const WordReference& word); +#ifndef SWIG + // + // Compressor object accessors + // + WordDBCompress *GetCompressor() { return compressor; } + void SetCompressor(WordDBCompress* compressor_arg) { compressor = compressor_arg; } + + const WordType wtype; + const Configuration& config; + + int isopen; + int isread; + + // + // If true enable extended functionalities of WordList such + // as per-word statistics. Read from wordlist_extended configuration + // parameter. + // + int extended; + + + WordDB db; + WordDBCompress *compressor; + int verbose; +#endif /* SWIG */ +}; + +#endif /* _WordList_h_ */ + diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordListMulti.cc b/debian/htdig/htdig-3.2.0b6/htword/WordListMulti.cc new file mode 100644 index 00000000..032cb97c --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordListMulti.cc @@ -0,0 +1,599 @@ +// +// WordListMulti.cc +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordListMulti.cc,v 1.6 2004/05/28 13:15:28 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "WordListMulti.h" +#include "WordListOne.h" +#include "myqsort.h" + +#include <stdio.h> +#include <stdlib.h> +#include <ctype.h> +#include <errno.h> +#include <sys/stat.h> + +#ifndef _MSC_VER /* _WIN32 */ +#include <unistd.h> +#endif + +class WordDBMulti : public Object +{ +public: + WordDBMulti() { words = 0; size = 0; mode = 0; } + + WordListOne *words; + String filename; + int mode; + unsigned int size; +}; + +// ***************************************************************************** +// +WordListMulti::WordListMulti(WordContext* ncontext) +{ + dbs = new List; + context = ncontext; + // The database itself hasn't been opened 
yet
+  isopen = 0;
+  Configuration& config = context->GetConfiguration();
+  extended = config.Boolean("wordlist_extend");
+  verbose = config.Value("wordlist_verbose");
+
+  // Clamp the multi-index tuning knobs to sane minima so misconfiguration
+  // cannot produce degenerate index counts.
+  file_max = config.Value("wordlist_multi_max", 50);
+  if(file_max < 4) file_max = 4;
+
+  file_min = config.Value("wordlist_multi_min", 4);
+  if(file_min < 2) file_min = 2;
+
+  if(file_max < file_min) file_max = file_min * 2;
+
+  put_max = config.Value("wordlist_multi_put_max", 1000);
+  if(put_max < 50) put_max = 50;
+
+  compressor = 0;
+  serial = 0;
+}
+
+// *****************************************************************************
+//
+WordListMulti::~WordListMulti()
+{
+  Close();
+}
+
+// *****************************************************************************
+//
+// Open the set of per-serial index files named <filename>00000000,
+// <filename>00000001, ... Existing files are registered (lazily opened);
+// in read/write mode a first index is created when none exists. Only the
+// last (active) index is actually opened here. Returns OK or NOTOK.
+//
+int WordListMulti::Open(const String& nfilename, int mode)
+{
+  filename = nfilename;
+
+  char tmp[32];
+  struct stat stat_buf;
+  int i;
+  //
+  // Open existing indexes
+  //
+  for(i = 0; i < file_max; i++) {
+    String filename_one(filename);
+    sprintf(tmp, "%08d", i);
+    filename_one << tmp;
+    if(stat((char*)filename_one, &stat_buf) == 0) {
+      WordDBMulti* db = new WordDBMulti();
+      db->words = new WordListOne(context);
+      db->filename = filename_one;
+      db->mode = mode;
+      dbs->Push(db);
+    } else {
+      break;
+    }
+  }
+  serial = i;
+  //
+  // If no indexes exists and read-only, abort
+  //
+  // NOTE(review): `flags` is not declared in this function — this looks
+  // like it was meant to be Flags() or a test on `mode`. It also mixes a
+  // Berkeley DB flag (DB_RDONLY) with O_* open bits, and O_RDONLY == 0 on
+  // POSIX so such a mask test can never fire. Confirm against the
+  // WordList base class before changing.
+  if(i == 0 && (flags & DB_RDONLY)) {
+    fprintf(stderr, "WordListMulti::Open(%s, O_RDONLY): no index found\n", (char*)filename);
+    return NOTOK;
+  }
+
+  isopen = 1;
+
+  //
+  // If no indexes exists and read/write, create the first
+  //
+  if(i == 0)
+    if(AddIndex() != OK) return NOTOK;
+
+  // Only the most recent index is kept open for writing.
+  WordDBMulti* db = (WordDBMulti*)dbs->Last();
+  if(db->words->Open(db->filename, mode) != OK)
+    return NOTOK;
+
+  return OK;
+}
+
+// *****************************************************************************
+//
+int WordListMulti::Close()
+{
+  if(isopen) {
+    WordDBMulti* db;
+    ListCursor cursor;
+    for(dbs->Start_Get(cursor); (db = 
(WordDBMulti*)dbs->Get_Next(cursor));) { + delete db->words; + } + dbs->Destroy(); + isopen = 0; + filename.trunc(); + } + return OK; +} + +// **************************************************************************** +// +unsigned int WordListMulti::Size() const +{ + unsigned int size = 0; + if(isopen) { + WordDBMulti* db; + ListCursor cursor; + for(dbs->Start_Get(cursor); (db = (WordDBMulti*)dbs->Get_Next(cursor));) { + if(!db->words->isopen) { + if(db->words->Open(db->filename, O_RDONLY) != OK) return 0; + size += db->words->Size(); + if(db->words->Close() != OK) return 0; + } else { + size += db->words->Size(); + } + } + } + return size; +} + +int WordListMulti::AddIndex() +{ + if(Flags() & O_RDONLY) return NOTOK; + + if(serial >= file_max) + Merge(); + + char tmp[32]; + + String filename_one(filename); + sprintf(tmp, "%08d", serial); + filename_one << tmp; + serial++; + + WordDBMulti* db = new WordDBMulti(); + db->words = new WordListOne(context); + db->words->extended = extended; + db->filename = filename_one; + dbs->Push(db); + + return OK; +} + +static int merge_cmp_size(WordListMulti*, WordDBMulti* a, WordDBMulti* b) +{ + return b->size - a->size; +} + +static int merge_cmp_filename(WordListMulti*, WordDBMulti* a, WordDBMulti* b) +{ + return a->filename.compare(b->filename); +} + +int WordListMulti::Merge() +{ + if(Flags() & DB_RDONLY) return NOTOK; + + Configuration& config = context->GetConfiguration(); + int use_compress = config.Boolean("wordlist_compress"); + + WordDBMulti* db = (WordDBMulti*)dbs->Last(); + if(db->words->Close() != OK) return NOTOK; + + // + // heap lists all the files in decreasing size order (biggest first) + // + WordDBMulti* heap = new WordDBMulti[serial]; + { + int i; + WordDBMulti* db; + ListCursor cursor; + for(i = 0, dbs->Start_Get(cursor); (db = (WordDBMulti*)dbs->Get_Next(cursor)); i++) { + if(db->words->Open(db->filename, O_RDONLY) != OK) return NOTOK; + db->size = db->words->Size(); + if(db->words->Close() != OK) return 
NOTOK;
+
+      heap[i] = *db;
+    }
+    dbs->Destroy();
+    // Biggest files first: each pass merges the two smallest (tail of heap).
+    myqsort((void*)heap, serial, sizeof(WordDBMulti), (myqsort_cmp)merge_cmp_size, (void*)this);
+  }
+
+  String tmpname = filename;
+  tmpname << ".tmp";
+
+  // Repeatedly merge the two smallest indexes into a temporary index until
+  // only file_min indexes remain.
+  while(serial > file_min) {
+    WordDBMulti* a = &heap[serial - 1];
+    WordDBMulti* b = &heap[serial - 2];
+
+    WordListOne tmp(context);
+    tmp.extended = 0;
+
+    if(a->words->Open(a->filename, O_RDONLY) != OK) return NOTOK;
+    if(b->words->Open(b->filename, O_RDONLY) != OK) return NOTOK;
+    if(tmp.Open(tmpname, O_RDWR) != OK) return NOTOK;
+    // NOTE(review): returning OK when CacheOff() fails looks wrong
+    // (success reported on an error path) — confirm intent.
+    if(tmp.db->CacheP() && tmp.db->CacheOff() != 0) return OK;
+
+    WordDBCursor* cursora = a->words->db->Cursor();
+    WordDBCursor* cursorb = b->words->db->Cursor();
+
+    if(cursora->Open() != 0) return NOTOK;
+    String keya;
+    String dataa;
+
+    if(cursorb->Open() != 0) return NOTOK;
+    String keyb;
+    String datab;
+
+    int reta;
+    int retb;
+
+    reta = cursora->Get(keya, dataa, DB_NEXT);
+    retb = cursorb->Get(keyb, datab, DB_NEXT);
+
+    //
+    // Merge while there are entries in both indexes
+    //
+    while(reta == 0 && retb == 0) {
+      //
+      // If keya lower than keyb
+      //
+      if(WordKey::Compare(context, keya, keyb) < 0) {
+        if(tmp.db->Put(0, keya, dataa, 0) != 0) return NOTOK;
+        reta = cursora->Get(keya, dataa, DB_NEXT);
+      } else {
+        if(tmp.db->Put(0, keyb, datab, 0) != 0) return NOTOK;
+        retb = cursorb->Get(keyb, datab, DB_NEXT);
+      }
+    }
+
+    //
+    // Sanity check
+    //
+    if((reta != 0 && reta != DB_NOTFOUND) ||
+       (retb != 0 && retb != DB_NOTFOUND))
+      return NOTOK;
+
+    //
+    // Flush the remaining entries from the index that is
+    // not yet empty.
+    //
+    if(reta != DB_NOTFOUND || retb != DB_NOTFOUND) {
+      String key = reta == 0 ? keya : keyb;
+      // Fix: was `reta == 0 ? data : datab`, initializing `data` from its
+      // own (uninitialized) value — undefined behavior that dropped the
+      // pending record from cursor a. Use dataa, symmetric with keya.
+      String data = reta == 0 ? dataa : datab;
+      WordDBCursor* cursor = reta == 0 ? 
cursora : cursorb; + int ret = 0; + while(ret == 0) { + if(tmp.db->Put(0, key, data, 0) != 0) return NOTOK; + ret = cursor->Get(key, data, DB_NEXT); + } + if(ret != DB_NOTFOUND) + return NOTOK; + } + + delete cursora; + delete cursorb; + + a->words->Close(); + b->words->Close(); + tmp.Close(); + + // + // Remove file a + // + if(unlink((char*)a->filename) != 0) { + const String message = String("WordListMulti::Merge: unlink ") + a->filename; + perror((const char*)message); + return NOTOK; + } + if(use_compress) { + if(unlink((char*)(a->filename + String("_weakcmpr"))) != 0) { + const String message = String("WordListMulti::Merge: unlink ") + a->filename + String("_weakcmpr"); + perror((const char*)message); + return NOTOK; + } + } + + // + // Remove file b + // + if(unlink((char*)b->filename) != 0) { + const String message = String("WordListMulti::Merge: unlink ") + b->filename; + perror((const char*)message); + return NOTOK; + } + if(use_compress) { + if(unlink((char*)(b->filename + String("_weakcmpr"))) != 0) { + const String message = String("WordListMulti::Merge: unlink ") + b->filename + String("_weakcmpr"); + perror((const char*)message); + return NOTOK; + } + } + + // + // Rename tmp file into file b + // + if(rename((char*)tmpname, (char*)b->filename) != 0) { + const String message = String("WordListMulti::Merge: rename ") + tmpname + String(" ") + b->filename; + perror((const char*)message); + return NOTOK; + } + if(use_compress) { + if(rename((char*)(tmpname + String("_weakcmpr")), (char*)(b->filename + String("_weakcmpr"))) != 0) { + const String message = String("WordListMulti::Merge: rename ") + tmpname + String("_weakcmpr ") + b->filename + String("_weakcmpr"); + perror((const char*)message); + return NOTOK; + } + } + + // + // Update b file size. The size need not be accurate number as long + // as it reflects the relative size of each file. 
+ // + b->size += a->size; + + // + // The 'a' index is no longer in use + // + delete a->words; + + serial--; + // + // update heap + // + myqsort((void*)heap, serial, sizeof(WordDBMulti), (myqsort_cmp)merge_cmp_size, (void*)this); + } + + // + // Rename the indexes so that they are in increasing order + // and push them in the list of active indexes. + // + myqsort((void*)heap, serial, sizeof(WordDBMulti), (myqsort_cmp)merge_cmp_filename, (void*)this); + int i; + for(i = 0; i < serial; i++) { + WordDBMulti* db = new WordDBMulti(); + *db = heap[i]; + + String newname(filename); + char tmp[32]; + sprintf(tmp, "%08d", i); + newname << tmp; + + // + // Rename if not equal + // + if(db->filename.compare(newname)) { + // + // Rename db index into newname + // + if(rename((char*)db->filename, (char*)newname) != 0) { + const String message = String("WordListMulti::Merge: rename ") + db->filename + String(" ") + newname; + perror((const char*)message); + return NOTOK; + } + if(use_compress) { + if(rename((char*)(db->filename + String("_weakcmpr")), (char*)(newname + String("_weakcmpr"))) != 0) { + const String message = String("WordListMulti::Merge: rename ") + db->filename + String("_weakcmpr ") + newname + String("_weakcmpr"); + perror((const char*)message); + return NOTOK; + } + } + + db->filename = newname; + } + + dbs->Push(db); + } + + return OK; +} + +// **************************************************************************** +// +int WordListMulti::Override(const WordReference& arg) +{ + WordDBMulti* db = (WordDBMulti*)dbs->Last(); + + if(db->words->Size() > put_max) { + if(db->words->Close() != OK) return NOTOK; + if(AddIndex() != OK) return NOTOK; + db = (WordDBMulti*)dbs->Last(); + if(db->words->Open(db->filename, db->mode) != OK) return NOTOK; + } + + return db->words->Override(arg); +} + +// ***************************************************************************** +int WordListMulti::Exists(const WordReference& ) +{ + return 0; +} + +// 
***************************************************************************** +// +List *WordListMulti::operator [] (const WordReference& ) +{ + return 0; +#if 0 + return Collect(wordRef); +#endif +} + +// ***************************************************************************** +// +List *WordListMulti::Prefix (const WordReference& ) +{ + return 0; +#if 0 + WordReference prefix2(prefix); + prefix2.Key().UndefinedWordSuffix(); + return Collect(prefix2); +#endif +} + +// ***************************************************************************** +// +List *WordListMulti::WordRefs() +{ + return 0; +#if 0 + return Collect(WordReference(context)); +#endif +} + +// ***************************************************************************** +// +List *WordListMulti::Collect(const WordReference&) +{ + return 0; +#if 0 + WordCursor *search = Cursor(wordRef.Key(), HTDIG_WORDLIST_COLLECTOR); + if(search->Walk() != OK) return 0; + List* result = search->GetResults(); + delete search; + return result; +#endif +} + +// ***************************************************************************** +// +// Delete all records matching wordRef, return the number of +// deleted records. 
+// +int WordListMulti::WalkDelete(const WordReference& ) +{ + return 0; +#if 0 + DeleteWordData data; + WordCursor *description = Cursor(wordRef.Key(), delete_word, &data); + description->Walk(); + delete description; + return data.count; +#endif +} + +int WordListMulti::Delete(const WordReference& ) +{ + return NOTOK; +} + +// ***************************************************************************** +// +// +List *WordListMulti::Words() +{ + return 0; +#if 0 + List *list = 0; + String key; + String record; + WordReference lastWord(context); + WordDBCursor* cursor = db.Cursor(); + + if(!cursor) return 0; + + // + // Move past the first word count record + // + const WordReference& last = WordStat::Last(context); + last.Pack(key, record); + if(cursor->Get(key, record, DB_SET_RANGE) != 0) + return 0; + list = new List; + do { + WordReference wordRef(context, key, record); + if(lastWord.Key().GetWord().empty() || + wordRef.Key().GetWord() != lastWord.Key().GetWord()) + { + list->Add(new String(wordRef.Key().GetWord())); + lastWord = wordRef; + } + } while (cursor->Get(key, record, DB_NEXT) == 0); + + return list; +#endif +} + +// ***************************************************************************** +// +// Returns the reference count for word in <count> arg +// +int WordListMulti::Noccurrence(const String& , unsigned int& ) const +{ + return 0; +#if 0 + noccurrence = 0; + WordStat stat(context, key.GetWord()); + int ret; + if((ret = db.Get(stat)) != 0) { + if(ret != DB_NOTFOUND) + return NOTOK; + } else { + noccurrence = stat.Noccurrence(); + } + + return OK; +#endif +} + +// ***************************************************************************** +// +// Increment reference count for wordRef +// +int WordListMulti::Ref(const WordReference& ) +{ + return NOTOK; +} + +// ***************************************************************************** +// +// Decrement reference count for wordRef +// +int WordListMulti::Unref(const WordReference& ) +{ + 
return NOTOK; +} + +// ***************************************************************************** +// +int WordListMulti::AllRef() { + if(!extended) return OK; + + Merge(); + + WordDBMulti* db; + ListCursor cursor; + for(dbs->Start_Get(cursor); (db = (WordDBMulti*)dbs->Get_Next(cursor));) { + if(!db->words->isopen) { + if(db->words->Open(db->filename, O_RDWR) != OK) return NOTOK; + if(db->words->Close() != OK) return NOTOK; + } + } + + return OK; +} diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordListMulti.h b/debian/htdig/htdig-3.2.0b6/htword/WordListMulti.h new file mode 100644 index 00000000..2aede10f --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordListMulti.h @@ -0,0 +1,252 @@ +// +// WordList.h +// +// NAME +// +// manage and use an inverted index file. +// +// SYNOPSIS +// +// #include <mifluz.h> +// +// Configuration* config; +// WordReference wordRef; +// ... +// WordList* words = new WordList(config) +// +// delete words; +// +// DESCRIPTION +// +// WordList is the <i>mifluz</i> equivalent of a database handler. Each +// WordList object is bound to an inverted index file and implements the +// operations to create it, fill it with word occurrences and search +// for an entry matching a given criterion. +// +// CONFIGURATION +// +// wordlist_extend {true|false} (default false) +// If <b>true</b> maintain reference count of unique +// words. The <b>Noccurrence</b> method gives access to this count. +// +// wordlist_verbose <number> (default 0) +// Set the verbosity level of the WordList class. +// <br> +// 1 walk logic +// <br> +// 2 walk logic details +// <br> +// 3 walk logic lots of details +// +// wordlist_page_size <bytes> (default 8192) +// Berkeley DB page size (see Berkeley DB documentation) +// +// wordlist_cache_size <bytes> (default 500K) +// Berkeley DB cache size (see Berkeley DB documentation) +// Cache makes a huge difference in performance. It must be at least 2% +// of the expected total data size. 
Note that if compression is activated +// the data size is eight times larger than the actual file size. In this +// case the cache must be scaled to 2% of the data size, not 2% +// of the file size. See <b>Cache tuning</b> in the mifluz guide for +// more hints. +// +// wordlist_compress {true|false} (default false) +// Activate compression of the index. The resulting index is eight times +// smaller than the uncompressed index. +// +// +// END +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordListMulti.h,v 1.4 2004/05/28 13:15:28 lha Exp $ +// + +#ifndef _WordListMulti_h_ +#define _WordListMulti_h_ + +#include <fcntl.h> +#include <stdio.h> + +#ifndef SWIG +#include "WordList.h" +#include "WordCursorOne.h" +//#include "WordCursorMulti.h" +#endif /* SWIG */ + +class WordContext; + +// +// Inverted index interface +// +class WordListMulti : public WordList +{ + public: + //- + // Constructor. Build inverted index handling object using + // run time configuration parameters listed in the <b>CONFIGURATION</b> + // section. + // + WordListMulti(WordContext* ncontext); + virtual ~WordListMulti(); + +#ifndef SWIG + virtual int Override(const WordReference& wordRef); +#endif /* SWIG */ + + //- + // Returns OK if <b>wordRef</b> exists in the index, NOTOK otherwise. + // + virtual int Exists(const WordReference& wordRef); + + // + // Delete permanently + // + //- + // Delete all entries in the index whose key matches the + // <i>Key()</i> part of <b>wordRef</b>, using the <i>Walk</i> + // method. + // Returns the number of entries successfully deleted. 
+ // + virtual int WalkDelete(const WordReference& wordRef); + //- + // Delete the entry in the index that exactly matches the + // <i>Key()</i> part of <b>wordRef.</b> + // Returns OK if deletion is successfull, NOTOK otherwise. + // + virtual int Delete(const WordReference& wordRef); + + //- + // Open inverted index <b>filename.</b> <b>mode</b> + // may be <i>O_RDONLY</i> or <i>O_RDWR.</i> If mode is + // <i>O_RDWR</i> it can be or'ed with <i>O_TRUNC</i> to reset + // the content of an existing inverted index. + // Return OK on success, NOTOK otherwise. + // + virtual int Open(const String& filename, int mode); + //- + // Close inverted index. + // Return OK on success, NOTOK otherwise. + // + virtual int Close(); + //- + // Return the size of the index in pages. + // + virtual unsigned int Size() const; + int AddIndex(); + int Merge(); + + //- + // Alias to the <b>Find</b> method. + // + virtual List *operator [] (const WordReference& wordRef); + //- + // Returns the list of word occurrences matching the <i>Key()</i> + // part of <b>wordRef.</b> In the <i>Key()</i>, the string + // (accessed with <i>GetWord()</i>) matches any string that begins + // with it. The <i>List</i> returned contains pointers to + // <i>WordReference</i> objects. It is the responsibility of the + // caller to free the list. + // + virtual List *Prefix (const WordReference& prefix); + + // + // Iterate over the complete database. + // +#ifndef SWIG + //- + // Returns a list of all unique words contained in the inverted + // index. The <i>List</i> returned contains pointers to + // <i>String</i> objects. It is the responsibility of the caller + // to free the list. See List.h header for usage. + // + virtual List *Words(); +#endif /* SWIG */ + //- + // Returns a list of all entries contained in the + // inverted index. The <i>List</i> returned contains pointers to + // <i>WordReference</i> objects. It is the responsibility of + // the caller to free the list. See List.h header for usage. 
+ // + virtual List *WordRefs(); + +#ifndef SWIG + //- + // Create a cursor that searches all the occurrences in the + // inverted index and call <b>ncallback</b> with + // <b>ncallback_data</b> for every match. + // + virtual inline WordCursor *Cursor(wordlist_walk_callback_t callback, Object *callback_data) { return new WordCursorOne(this, callback, callback_data); } +#endif /* SWIG */ + //- + // Create a cursor that searches all the occurrences in the + // inverted index and that match <b>nsearchKey.</b> If + // <b>naction</b> is set to HTDIG_WORDLIST_WALKER calls + // <b>searchKey.callback</b> with <b>searchKey.callback_data</b> + // for every match. If <b>naction</b> is set to + // HTDIG_WORDLIST_COLLECT push each match in <b>searchKey.collectRes</b> + // data member as a <b>WordReference</b> object. It is the responsibility + // of the caller to free the <b>searchKey.collectRes</b> list. + // + virtual inline WordCursor *Cursor(const WordKey &searchKey, int action = HTDIG_WORDLIST_WALKER) { return new WordCursorOne(this, searchKey, action); } +#ifndef SWIG + //- + // Create a cursor that searches all the occurrences in the + // inverted index and that match <b>nsearchKey</b> and calls + // <b>ncallback</b> with <b>ncallback_data</b> for every match. + // + virtual inline WordCursor *Cursor(const WordKey &searchKey, wordlist_walk_callback_t callback, Object * callback_data) { return new WordCursorOne(this, searchKey, callback, callback_data); } +#endif /* SWIG */ + + // + // Update/get global word statistics statistics + // + //- + // Add one to the reference count for the string contained + // in the <i>Key().GetWord()</i> part of <b>wordRef.</b> + // Returns OK on success, NOTOK otherwise. + // + virtual int Ref(const WordReference& wordRef); + //- + // Substract one to the reference count for the string contained + // in the <i>Key().GetWord()</i> part of <b>wordRef.</b> + // Returns OK on success, NOTOK otherwise. 
+ // + virtual int Unref(const WordReference& wordRef); + virtual int AllRef(); + +#ifndef SWIG + //- + // Return in <b>noccurrence</b> the number of occurrences of the + // string contained in the <i>GetWord()</i> part of <b>key.</b> + // Returns OK on success, NOTOK otherwise. + // + virtual int Noccurrence(const String& key, unsigned int& noccurrence) const; + virtual int Write(FILE* f) { return NOTOK; } + virtual int Read(FILE* f) { return NOTOK; } + + virtual WordKey Key(const String& bufferin) { abort(); return WordKey(0); } + + virtual WordReference Word(const String& bufferin, int exists = 0) { abort(); return WordReference(0); } + +#endif /* SWIG */ + // + // Retrieve WordReferences from the database. + // Backend of WordRefs, operator[], Prefix... + // + virtual List *Collect(const WordReference& word); +#ifndef SWIG + List* dbs; + int serial; + int file_max; + int file_min; + unsigned int put_max; +#endif /* SWIG */ +}; + +#endif /* _WordListMulti_h_ */ + diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordListOne.cc b/debian/htdig/htdig-3.2.0b6/htword/WordListOne.cc new file mode 100644 index 00000000..34e0019a --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordListOne.cc @@ -0,0 +1,485 @@ +// +// WordListOne.cc +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordListOne.cc,v 1.6 2004/05/28 13:15:28 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "WordListOne.h" +#include "WordReference.h" +#include "WordRecord.h" +#include "WordType.h" +#include "WordContext.h" +#include "Configuration.h" +#include "htString.h" +#include "HtTime.h" +#include "WordDBCompress.h" +#include "WordDBCache.h" +#include "WordDead.h" +#include "WordMeta.h" + 
+#include <stdio.h> +#include <stdlib.h> + +#ifndef _MSC_VER /* _WIN32 */ +#include <unistd.h> +#endif + +#include <ctype.h> +#include <errno.h> + +// ***************************************************************************** +// +WordListOne::WordListOne(WordContext* ncontext) +{ + context = ncontext; + db = new WordDB(ncontext->GetDBInfo()); + dict = new WordDict(); + dict->Initialize(this); + meta = new WordMeta(); + meta->Initialize(this); + dead = new WordDead(); + dead->Initialize(this); + + // The database itself hasn't been opened yet + isopen = 0; + Configuration& config = context->GetConfiguration(); + extended = config.Boolean("wordlist_extend"); + verbose = config.Value("wordlist_verbose"); + compressor = 0; + caches = 0; + flags = 0; +} + +// ***************************************************************************** +// +WordListOne::~WordListOne() +{ + BatchEnd(); + Close(); + delete dead; + delete meta; + delete dict; + delete db; +} + +static int word_db_qcmp(WordContext* context, const WordDBCacheEntry *a, const WordDBCacheEntry *b) +{ + return WordKey::Compare(context, (const unsigned char*)a->key, a->key_size, (const unsigned char*)b->key, b->key_size); +} + +// ***************************************************************************** +// +int WordListOne::Open(const String& nfilename, int mode) +{ + filename = nfilename; + + int usecompress = 0; + Configuration& config = context->GetConfiguration(); + + if(config.Boolean("wordlist_compress") == 1) { + usecompress = DB_COMPRESS; + WordDBCompress* compressor = new WordDBCompress(context); + // compressor->debug = config.Value("wordlist_compress_debug"); + SetCompressor(compressor); + + context->GetDBInfo().dbenv->mp_cmpr_info = compressor->CmprInfo(); + context->GetDBInfo().dbenv->flags |= DB_ENV_CMPR; + } + + flags = (mode & O_RDWR) ? 
DB_CREATE : DB_RDONLY; + flags |= usecompress; + if(mode & O_TRUNC) { + if(mode & O_RDWR) { + unlink((char*)filename); + } else + fprintf(stderr, "WordListOne::Open: O_TRUNC | O_RDONLY is meaningless\n"); + } + + WordLock* lock; + Meta()->Lock("open", lock); + + db->set_bt_compare(word_db_cmp, (void*)context); + + if(config.Boolean("wordlist_cache_inserts", 0)) { + int size = config.Value("wordlist_cache_size", 0); + if(size / 2 < WORD_DB_CACHE_MINIMUM) + size = 0; + else + size /= 2; + + db->CacheOn(context, size); + db->CacheCompare(word_db_qcmp); + } + + db->set_pagesize(Pagesize()); + + int ret = db->Open(filename, "index", DB_BTREE, flags, 0666, WORD_DB_INDEX) == 0 ? OK : NOTOK; + if(ret == NOTOK) return ret; + if(dict->Open() != OK) return NOTOK; + if(meta->Open() != OK) return NOTOK; + if(dead->Open() != OK) return NOTOK; + + isopen = 1; + + Meta()->Unlock("open", lock); + + return ret; +} + +// ***************************************************************************** +// +int WordListOne::Close() +{ + if(isopen) { + if(db->Close() != 0) return NOTOK; + if(dict->Close() != 0) return NOTOK; + if(meta->Close() != 0) return NOTOK; + if(dead->Close() != 0) return NOTOK; + isopen = 0; + } + + { + WordDBCompress* compressor = GetCompressor(); + if(compressor) { + delete compressor; + SetCompressor(0); + } + delete context->GetDBInfo().dbenv->mp_cmpr_info; + context->GetDBInfo().dbenv->mp_cmpr_info = 0; + context->GetDBInfo().dbenv->flags &= ~DB_ENV_CMPR; + } + + return OK; +} + +// **************************************************************************** +// +unsigned int WordListOne::Size() const +{ + return db->Size(); +} + +// **************************************************************************** +// +int WordListOne::Override(const WordReference& arg) +{ + if (arg.GetWord().length() == 0) { + fprintf(stderr, "WordListOne::Override(%s) word is zero length\n", (char*)arg.Get()); + return NOTOK; + } + if (!arg.Key().Filled()) { + fprintf(stderr, 
"WordListOne::Override(%s) key is not fully defined\n", (char*)arg.Get()); + return NOTOK; + } + + WordType& wtype = context->GetType(); + WordReference wordRef(arg); + String word = wordRef.GetWord(); + if(wtype.Normalize(word) & WORD_NORMALIZE_NOTOK) + return NOTOK; + wordRef.SetWord(word); + unsigned int wordid = 0; + if(dict->SerialRef(word, wordid) != OK) return NOTOK; + wordRef.Key().Set(WORD_KEY_WORD, wordid); + + int ret = NOTOK; + + if(caches) { + String key; + String record; + if(wordRef.Pack(key, record) != OK) + return NOTOK; + ret = caches->Add(key.get(), key.length(), record.get(), record.length()) == 0 ? OK : NOTOK; + if(caches->Full()) caches->Merge(*db); + } else { + ret = db->Put(wordRef, 0) == 0 ? OK : NOTOK; + } + + return ret; +} + + +// ***************************************************************************** +// +List *WordListOne::operator [] (const WordReference& wordRef) +{ + return Collect(wordRef); +} + +// ***************************************************************************** +// +List *WordListOne::Prefix (const WordReference& prefix) +{ + List* result = new List(); + WordDictCursor* cursor = Dict()->CursorPrefix(prefix.GetWord()); + String word; + WordDictRecord record; + WordReference prefix2(prefix); + while(Dict()->NextPrefix(cursor, word, record) == 0) { + prefix2.Key().Set(WORD_KEY_WORD, record.Id()); + List* tmp_result = Collect(prefix2); + while(tmp_result->Count() > 0) { + WordReference* entry = (WordReference*)tmp_result->Shift(LIST_REMOVE_RELEASE); + entry->SetWord(word); + result->Push(entry); + } + delete tmp_result; + } + return result; +} + +// ***************************************************************************** +// +List *WordListOne::WordRefs() +{ + return Collect(WordReference(context)); +} + +// ***************************************************************************** +// +List *WordListOne::Collect(const WordReference& wordRef) +{ + WordCursor *search = Cursor(wordRef.Key(), 
HTDIG_WORDLIST_COLLECTOR); + if(search->Walk() != OK) return 0; + List* result = search->GetResults(); + delete search; + return result; +} + +// ***************************************************************************** +// +int +WordListOne::Read(FILE* f) +{ + WordReference wordRef(context); +#define WORD_BUFFER_SIZE 1024 + char buffer[WORD_BUFFER_SIZE + 1]; + String line; + int line_number = 0; + int inserted = 0; + + BatchStart(); + + String key; + String record; + + while(fgets(buffer, WORD_BUFFER_SIZE, f)) { + line_number++; + int buffer_length = strlen(buffer); + int eol = buffer[buffer_length - 1] == '\n'; + + if(eol) buffer[--buffer_length] = '\0'; + + line.append(buffer, buffer_length); + // + // Join big lines + // + if(!eol) continue; + // + // If line ends with a \ continue + // + if(line.last() == '\\') { + line.chop(1); + continue; + } + + if(!line.empty()) { + StringList fields(line, "\t "); + + // + // Convert the word to a wordid + // + String* word = (String*)fields.Get_First(); + unsigned int wordid; + if(dict->SerialRef(*word, wordid) != OK) return NOTOK; + word->trunc(); + (*word) << wordid; + + if(wordRef.SetList(fields) != OK) { + fprintf(stderr, "WordList::Read: line %d : %s\n", line_number, (char*)line); + fprintf(stderr, " cannot build WordReference (ignored)\n"); + } else { + if(wordRef.Pack(key, record) != OK) { + fprintf(stderr, "WordList::Read: line %d : %s\n", line_number, (char*)line); + fprintf(stderr, " pack failed (ignored)\n"); + } else { + caches->Add(key.get(), key.length(), record.get(), record.length()); + inserted++; + } + if(verbose && (inserted % 10000 == 0)) fprintf(stderr, "WordList::Read: inserted %d entries\n", inserted); + if(verbose > 1) fprintf(stderr, "WordList::Read: inserting %s\n", (char*)wordRef.Get()); + } + + line.trunc(); + } + } + + BatchEnd(); + + return inserted; +} + +// ***************************************************************************** +// +// streaming operators for ascii dumping and 
reading a list +class FileOutData : public Object +{ +public: + FILE* f; + String word; + FileOutData(FILE* f_arg) : f(f_arg) { } +}; + +// ***************************************************************************** +// +static int +wordlist_walk_callback_file_out(WordList *, WordDBCursor& , const WordReference *wordRef, Object &ndata) +{ + FileOutData& data = (FileOutData&)ndata; + ((WordReference*)wordRef)->SetWord(data.word); + fprintf(data.f, "%s\n", (char*)wordRef->Get()); + return OK; +} + +int WordListOne::Write(FILE* f) +{ + FileOutData data(f); + WordDictCursor* cursor = dict->Cursor(); + int ret; + String word; + WordDictRecord wordinfo; + while((ret = dict->Next(cursor, word, wordinfo)) == 0) { + WordKey key(context); + key.Set(WORD_KEY_WORD, wordinfo.Id()); + data.word = word; + WordCursor *search = Cursor(key, wordlist_walk_callback_file_out, (Object *)&data); + search->Walk(); + delete search; + } + return ret == DB_NOTFOUND ? OK : NOTOK; +} + + +// ***************************************************************************** +// +// Callback data dedicated to Dump and dump_word communication +// +class DeleteWordData : public Object +{ +public: + DeleteWordData() { count = 0; } + + int count; +}; + +// ***************************************************************************** +// +// +static int delete_word(WordList *words, WordDBCursor &cursor, const WordReference *word, Object &data) +{ + WordListOne *words_one = (WordListOne*)words; + if(words_one->DeleteCursor(cursor) == 0) { + ((DeleteWordData&)data).count++; + return OK; + } else { + fprintf(stderr, "WordList delete_word: deleting %s failed\n", (char*)word->Get()); + return NOTOK; + } +} + +// ***************************************************************************** +// +// Delete all records matching wordRef, return the number of +// deleted records. 
+// +int WordListOne::WalkDelete(const WordReference& wordRef) +{ + DeleteWordData data; + WordKey key = wordRef.Key(); + + if(key.IsDefined(WORD_KEY_WORD)) { + WordCursor *description = Cursor(key, delete_word, &data); + description->Walk(); + delete description; + dict->Decr(wordRef.GetWord(), data.count); + } else { + WordDictCursor* cursor = dict->Cursor(); + int ret; + String word; + WordDictRecord wordinfo; + int total = 0; + while((ret = dict->Next(cursor, word, wordinfo)) == 0) { + key.Set(WORD_KEY_WORD, wordinfo.Id()); + WordCursor *search = Cursor(key, delete_word, &data); + search->Walk(); + delete search; + dict->Decr(word, data.count); + total += data.count; + data.count = 0; + } + data.count = total; + } + return data.count; +} + +// ***************************************************************************** +// +// Returns the reference count for word in <count> arg +// +int WordListOne::Noccurrence(const String& word, unsigned int& noccurrence) const +{ + return dict->Noccurrence(word, noccurrence); +} + +WordKey WordListOne::Key(const String& bufferin) +{ + WordKey key(context); + StringList fields(bufferin, "\t "); + String* field = (String*)fields.Get_First(); + unsigned int wordid; + Dict()->Serial(*field, wordid); + field->trunc(); + (*field) << wordid; + key.SetList(fields); + return key; +} + +WordReference WordListOne::Word(const String& bufferin, int exists /* = 1 */) +{ + WordReference wordRef(context); + StringList fields(bufferin, "\t "); + String* field = (String*)fields.Get_First(); + if(context->GetType().Normalize(*field) & WORD_NORMALIZE_NOTOK) { + fprintf(stderr, "WordListOne::Word: cannot normalize word\n"); + } + String word = *field; + unsigned int wordid; + if(exists) + Dict()->SerialExists(word, wordid); + else + Dict()->Serial(word, wordid); + field->trunc(); + (*field) << wordid; + wordRef.SetList(fields); + wordRef.SetWord(word); + return wordRef; +} + +void +WordListOne::BatchEnd() +{ + if(caches) { + caches->Merge(*db); 
+ WordList::BatchEnd(); + } +} diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordListOne.h b/debian/htdig/htdig-3.2.0b6/htword/WordListOne.h new file mode 100644 index 00000000..4d51fc81 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordListOne.h @@ -0,0 +1,142 @@ +// +// WordListOne.h +// +// NAME +// +// manage and use an inverted index file. +// +// SYNOPSIS +// +// #include <mifluz.h> +// +// WordContext context; +// +// WordList* words = context->List(); +// WordList* words = WordListOne(&context); +// +// DESCRIPTION +// +// WordList is the <i>mifluz</i> equivalent of a database handler. Each +// WordList object is bound to an inverted index file and implements the +// operations to create it, fill it with word occurrences and search +// for an entry matching a given criterion. +// +// The general behavious of WordListOne is described in the WordList +// manual page. It is prefered to create a WordListOne instance by +// setting the <i>wordlist_multi</i> configuration parameter to false +// and calling the <b>WordContext::List</b> method. +// +// Only the methods that differ from WordList are listed here. +// All the methods of WordList are implemented by WordListOne and +// you should refer to the manual page for more information. +// +// The <b>Cursor</b> methods all return a WordCursorOne instance +// cast to a WordCursor object. 
+// +// END +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordListOne.h,v 1.4 2004/05/28 13:15:28 lha Exp $ +// + +#ifndef _WordListOne_h_ +#define _WordListOne_h_ + +#include <fcntl.h> +#include <stdio.h> + +#include "WordList.h" +#include "WordCursorOne.h" +#include "WordDict.h" +#include "WordMeta.h" +#include "WordDead.h" + +class WordContext; + +// +// Inverted index interface +// +class WordListOne : public WordList +{ + public: + //- + // Constructor. Build inverted index handling object using + // run time configuration parameters listed in the <b>CONFIGURATION</b> + // section of the <b>WordList</b> manual page. + // + WordListOne(WordContext* ncontext); + virtual ~WordListOne(); + + virtual int Override(const WordReference& wordRef); + + virtual inline int Exists(const WordReference& wordRef) { + return (!Dead()->Exists(wordRef.Key()) && db->Exists(wordRef) == 0) ? OK : NOTOK; } + + virtual int WalkDelete(const WordReference& wordRef); + virtual inline int Delete(const WordReference& wordRef) { + if(db->Del(wordRef) == 0) + return dict->Unref(wordRef.GetWord()); + else + return NOTOK; + } + //- + // Delete the inverted index entry currently pointed to by the + // <b>cursor.</b> + // Returns 0 on success, Berkeley DB error code on error. 
This + // is mainly useful when implementing a callback function for + // a <b>WordCursor.</b> + // + int DeleteCursor(WordDBCursor& cursor) { return cursor.Del(); } + + virtual int Open(const String& filename, int mode); + virtual int Close(); + virtual unsigned int Size() const; + virtual int Pagesize() const { + Configuration& config = context->GetConfiguration(); + + return config.Value("wordlist_page_size", 0); + } + + virtual inline WordDict *Dict() { return dict; } + virtual inline WordMeta *Meta() { return meta; } + virtual inline WordDead *Dead() { return dead; } + + virtual List *operator [] (const WordReference& wordRef); + virtual List *Prefix (const WordReference& prefix); + + virtual List *Words() { return dict->Words(); } + virtual List *WordRefs(); + + virtual inline WordCursor *Cursor(wordlist_walk_callback_t callback, Object *callback_data) { return new WordCursorOne(this, callback, callback_data); } + virtual inline WordCursor *Cursor(const WordKey &searchKey, int action = HTDIG_WORDLIST_WALKER) { return new WordCursorOne(this, searchKey, action); } + virtual inline WordCursor *Cursor(const WordKey &searchKey, wordlist_walk_callback_t callback, Object * callback_data) { return new WordCursorOne(this, searchKey, callback, callback_data); } + + virtual WordKey Key(const String& bufferin); + + virtual WordReference Word(const String& bufferin, int exists = 0); + + virtual void BatchEnd(); + + virtual int Noccurrence(const String& key, unsigned int& noccurrence) const; + + virtual int Write(FILE* f); + + virtual inline int WriteDict(FILE* f) { return dict->Write(f); } + + virtual int Read(FILE* f); + + virtual List *Collect(const WordReference& word); + + WordDB *db; + WordDict *dict; + WordMeta *meta; + WordDead *dead; +}; + +#endif /* _WordListOne_h_ */ + diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordMeta.cc b/debian/htdig/htdig-3.2.0b6/htword/WordMeta.cc new file mode 100644 index 00000000..66741a4e --- /dev/null +++ 
b/debian/htdig/htdig-3.2.0b6/htword/WordMeta.cc @@ -0,0 +1,182 @@ +// +// WordMeta.cc +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordMeta.cc,v 1.4 2004/05/28 13:15:28 lha Exp $ +// +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <fcntl.h> + +extern "C" { +#include "db_int.h" +#include "db_page.h" +#include "db_shash.h" +#include "lock.h" +#include "mp.h" +} + +#include "WordMeta.h" +#include "WordListOne.h" + +#define WORD_META_SERIAL_SIZE (WORD_META_SERIAL_FILE + 1) + +class WordLock { +public: + WordLock() { lock.off = LOCK_INVALID; } + + DB_LOCK lock; +}; + +// +// Total size of structure must *NOT* be over 256 bytes. +// +typedef struct _WordMetaInfo { + DBMETA meta; + unsigned int serials[WORD_META_SERIAL_SIZE]; +} WordMetaInfo; + +class WordMetaImp +{ +public: + WordMetaImp() { + mpf = 0; + pgno = PGNO_INVALID; + info = 0; + } + + DB_MPOOLFILE *mpf; + db_pgno_t pgno; + WordMetaInfo *info; +}; + +WordMeta::~WordMeta() +{ + delete imp; + delete db; +} + +int WordMeta::Initialize(WordList* nwords) +{ + words = nwords; + db = new WordDB(nwords->GetContext()->GetDBInfo()); + imp = new WordMetaImp(); + return OK; +} + +int WordMeta::Open() +{ + const String& filename = words->Filename(); + int flags = words->Flags(); + + db->set_pagesize(words->Pagesize()); + + if(db->Open(filename, "meta", DB_BTREE, flags, 0666, WORD_DB_DICT) != 0) + return NOTOK; + + imp->mpf = db->db->mpf; + + int ret; + String kpgno("pgno"); + + if((ret = db->Get(0, kpgno, imp->pgno, 0)) != 0 && ret != DB_NOTFOUND) + return NOTOK; + + /* + * First time thru, create the meta page and initialize it. 
+ */ + if(ret == DB_NOTFOUND) { + if(CDB_memp_fget(imp->mpf, &imp->pgno, DB_MPOOL_NEW, (void**)&imp->info) != 0) + return NOTOK; + memset((char*)imp->info, '\0', sizeof(WordMetaInfo)); + imp->info->meta.type = P_INVALID; + imp->info->meta.pgno = imp->pgno; + if(CDB_memp_fput(imp->mpf, (void*)imp->info, DB_MPOOL_DIRTY) != 0) + return NOTOK; + + if(db->Put(0, kpgno, imp->pgno, 0) != 0) + return NOTOK; + } + + return OK; +} + +int WordMeta::Close() +{ + return db->Close() == 0 ? OK : NOTOK; +} + +int WordMeta::Serial(int what, unsigned int& serial) +{ + serial = WORD_META_SERIAL_INVALID; + if(CDB_memp_fget(imp->mpf, &imp->pgno, 0, (void**)&imp->info) != 0) + return NOTOK; + serial = ++imp->info->serials[what]; + if(CDB_memp_fput(imp->mpf, (void*)imp->info, DB_MPOOL_DIRTY) != 0) + return NOTOK; + + return OK; +} + +int WordMeta::GetSerial(int what, unsigned int& serial) +{ + serial = WORD_META_SERIAL_INVALID; + if(CDB_memp_fget(imp->mpf, &imp->pgno, 0, (void**)&imp->info) != 0) + return NOTOK; + serial = imp->info->serials[what]; + if(CDB_memp_fput(imp->mpf, (void*)imp->info, 0) != 0) + return NOTOK; + + return OK; +} + +int WordMeta::SetSerial(int what, unsigned int serial) +{ + if(CDB_memp_fget(imp->mpf, &imp->pgno, 0, (void**)&imp->info) != 0) + return NOTOK; + imp->info->serials[what] = serial; + if(CDB_memp_fput(imp->mpf, (void*)imp->info, DB_MPOOL_DIRTY) != 0) + return NOTOK; + + return OK; +} + +int WordMeta::Lock(const String& resource, WordLock*& lock) +{ + lock = new WordLock; + DB_ENV* dbenv = words->GetContext()->GetDBInfo().dbenv; + u_int32_t id; + if(CDB_lock_id(dbenv, &id) != 0) { + delete lock; + lock = 0; + return NOTOK; + } + DBT obj; + obj.size = resource.length(); + obj.data = (void*)resource.get(); + if(CDB_lock_get(dbenv, id, 0, &obj, DB_LOCK_WRITE, &lock->lock) != 0) { + delete lock; + lock = 0; + return NOTOK; + } + return OK; +} + +int WordMeta::Unlock(const String& resource, WordLock*& lock) +{ + DB_ENV* dbenv = 
words->GetContext()->GetDBInfo().dbenv; + + int ret = CDB_lock_put(dbenv, &lock->lock); + + delete lock; + lock = 0; + + return ret == 0 ? OK : NOTOK; +} diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordMeta.h b/debian/htdig/htdig-3.2.0b6/htword/WordMeta.h new file mode 100644 index 00000000..5bcc7f48 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordMeta.h @@ -0,0 +1,87 @@ +// +// WordMeta.h +// +// NAME +// +// abstract class to manage and use an inverted index file. +// +// SYNOPSIS +// +// #include <mifluz.h> +// +// WordContext context; +// +// WordMeta* words = context->Meta(); +// +// delete words; +// +// DESCRIPTION +// +// WordMeta is the <i>mifluz</i> equivalent of a database handler. Each +// WordMeta object is bound to an inverted index file and implements the +// operations to create it, fill it with word occurrences and search +// for an entry matching a given criterion. +// +// WordMeta is an abstract class and cannot be instanciated. +// The <b>Meta</b> method of the class WordContext will create +// an instance using the appropriate derived class, either WordMetaOne +// or WordMetaMulti. Refer to the corresponding manual pages for +// more information on their specific semantic. 
+// +// +// +// END +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordMeta.h,v 1.4 2004/05/28 13:15:28 lha Exp $ +// + +#ifndef _WordMeta_h_ +#define _WordMeta_h_ + +#include <stdio.h> + +#include "htString.h" +#include "WordDB.h" + +class WordContext; +class WordLock; +class WordMetaImp; + +// +// Serial number range [1-2^32] +// +#define WORD_META_SERIAL_INVALID 0 + +#define WORD_META_SERIAL_WORD 0 +#define WORD_META_SERIAL_FILE 1 + +class WordMeta +{ + public: + WordMeta() { words = 0; db = 0; imp = 0; } + ~WordMeta(); + + int Initialize(WordList* words); + + int Open(); + int Close(); + + int Serial(int what, unsigned int& serial); + int GetSerial(int what, unsigned int& serial); + int SetSerial(int what, unsigned int serial); + + int Lock(const String& resource, WordLock*& lock); + int Unlock(const String& resource, WordLock*& lock); + + private: + WordList *words; + WordDB *db; + WordMetaImp *imp; +}; +#endif /* _WordMeta_h_ */ diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordMonitor.cc b/debian/htdig/htdig-3.2.0b6/htword/WordMonitor.cc new file mode 100644 index 00000000..d5f342fd --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordMonitor.cc @@ -0,0 +1,272 @@ +// +// WordMonitor.cc +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordMonitor.cc,v 1.7 2004/05/28 13:15:28 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> +#include <signal.h> + +#ifndef _MSC_VER /* _WIN32 */ +#include 
<unistd.h> +#endif + +#include "StringList.h" +#include "WordMonitor.h" + +#define WORD_MONITOR_RRD 1 +#define WORD_MONITOR_READABLE 2 + +WordMonitor* WordMonitor::instance = 0; + +char* WordMonitor::values_names[WORD_MONITOR_VALUES_SIZE] = { + "", + "C.Write", + "C.Read", + "C.Compress 1/1", + "C.Compress 1/2", + "C.Compress 1/3", + "C.Compress 1/4", + "C.Compress 1/5", + "C.Compress 1/6", + "C.Compress 1/7", + "C.Compress 1/8", + "C.Compress 1/9", + "C.Compress 1/10", + "C.Compress 1/>10", + "C.P_IBTREE", + "C.P_LBTREE", + "C.P_UNKNOWN", + "C.Put", + "C.Get (0)", + "C.Get (NEXT)", + "C.Get (SET_RANGE)", + "C.Get (Other)", + "G.LEVEL", + "G.PGNO", + "C.CMP", + 0 +}; + +WordMonitor::WordMonitor(const Configuration &config) +{ + memset((char*)values, '\0', sizeof(unsigned int) * WORD_MONITOR_VALUES_SIZE); + memset((char*)old_values, '\0', sizeof(unsigned int) * WORD_MONITOR_VALUES_SIZE); + started = elapsed = time(0); + output_style = WORD_MONITOR_READABLE; + if((period = config.Value("wordlist_monitor_period"))) { + const String& desc = config.Find("wordlist_monitor_output"); + StringList fields(desc, ','); + + if(fields.Count() > 0) { + char* filename = fields[0]; + if(filename[0] == '\0') + output = stderr; + else { + output = fopen(filename, "a"); + if(!output) { + fprintf(stderr, "WordMonitor::WordMonitor: cannot open %s for writing ", filename); + perror(""); + output = stderr; + return; + } + } + if(fields.Count() > 1) { + char* style = fields[1]; + if(!mystrcasecmp(style, "rrd")) + output_style = WORD_MONITOR_RRD; + else + output_style = WORD_MONITOR_READABLE; + } + } + TimerStart(); + } +} + +WordMonitor::~WordMonitor() +{ + TimerStop(); + if(output != stderr) + fclose(output); +} + +void +WordMonitor::Initialize(const Configuration &config_arg) +{ + if(instance != 0) + delete instance; + instance = new WordMonitor(config_arg); +} + +const String +WordMonitor::Report() const +{ + String output; + int i; + time_t now = time(0); + + if(output_style == 
WORD_MONITOR_RRD) + output << (int)now << ":"; + + for(i = 0; i < WORD_MONITOR_VALUES_SIZE; i++) { + if(!values_names[i]) break; + if(values_names[i][0]) { + if(output_style == WORD_MONITOR_READABLE) { + output << values_names[i] << ": " << values[i]; + if((now - elapsed) > 0) { + output << ", per sec : " << (int)(values[i] / (now - started)); + output << ", delta : " << (values[i] - old_values[i]); + output << ", per sec : " << (int)((values[i] - old_values[i]) / (now - elapsed)); + } + output << "|"; + } else if(output_style == WORD_MONITOR_RRD) { + output << values[i] << ":"; + } + } + } + memcpy((char*)old_values, (char*)values, sizeof(unsigned int) * WORD_MONITOR_VALUES_SIZE); + return output; +} + +static void handler_alarm(int signal) +{ + WordMonitor* monitor = WordMonitor::Instance(); + if(!monitor) { + fprintf(stderr, "WordMonitor::handler_alarm: no instance\n"); + return; + } + monitor->TimerClick(signal); +} + +void +WordMonitor::TimerStart() +{ + if(period < 5) { + fprintf(stderr, "WordMonitor::TimerStart: wordlist_monitor_period must be > 5 (currently %d) otherwise monitoring is not accurate\n", period); + return; + } + +#ifndef _MSC_VER /* _WIN32 */ + struct sigaction action; + struct sigaction old_action; + memset((char*)&action, '\0', sizeof(struct sigaction)); + memset((char*)&old_action, '\0', sizeof(struct sigaction)); + action.sa_handler = handler_alarm; + if(sigaction(SIGALRM, &action, &old_action) != 0) { + fprintf(stderr, "WordMonitor::TimerStart: installing SIGALRM "); + perror(""); + } + + if(old_action.sa_handler != SIG_DFL) { + fprintf(stderr, "WordMonitor::TimerStart: found an installed action while installing SIGALRM, restoring old action\n"); + if(sigaction(SIGALRM, &old_action, NULL) != 0) { + fprintf(stderr, "WordMonitor::TimerStart: installing old SIGALRM "); + perror(""); + } + return; + } +#endif + + fprintf(output, "----------------- WordMonitor starting -------------------\n"); + if(output_style == WORD_MONITOR_RRD) { + 
fprintf(output, "Started:%ld\n", started); + fprintf(output, "Period:%d\n", period); + fprintf(output, "Time:"); + int i; + for(i = 0; i < WORD_MONITOR_VALUES_SIZE; i++) { + if(!values_names[i]) break; + if(values_names[i][0]) + fprintf(output, "%s:", values_names[i]); + } + fprintf(output, "\n"); + } + fflush(output); + TimerClick(0); +} + +void +WordMonitor::TimerClick(int signal) +{ + if(signal) { + // + // Do not report if less than <period> since last report. + // + if(time(0) - elapsed >= period) { + fprintf(output, "%s\n", (const char*)Report()); + elapsed = time(0); + fflush(output); + } + } +#ifndef _MSC_VER /* _WIN32 */ + alarm(period); +#endif +} + +void +WordMonitor::TimerStop() +{ + if(period > 0) { +#ifndef _MSC_VER /* _WIN32 */ + alarm(0); + struct sigaction action; + memset((char*)&action, '\0', sizeof(struct sigaction)); + action.sa_handler = SIG_DFL; + if(sigaction(SIGALRM, &action, NULL) != 0) { + fprintf(stderr, "WordMonitor::TimerStart: resetting SIGALRM to SIG_DFL "); + perror(""); + } + + // Make sure last report is at least one second older than the previous one. 
+ // + if(time(0) - elapsed < 1) + sleep(2); + fprintf(output, "%s\n", (const char*)Report()); + fprintf(output, "----------------- WordMonitor finished -------------------\n"); +#endif + } +} + +// +// C interface to WordMonitor instance +// + +extern "C" { + void word_monitor_click() + { + WordMonitor* monitor = WordMonitor::Instance(); +#ifndef _MSC_VER /* _WIN32 */ + if(monitor) + monitor->TimerClick(SIGALRM); +#endif + } + void word_monitor_add(int index, unsigned int value) + { + WordMonitor* monitor = WordMonitor::Instance(); + if(monitor) + monitor->Add(index, value); + } + void word_monitor_set(int index, unsigned int value) + { + WordMonitor* monitor = WordMonitor::Instance(); + if(monitor) + monitor->Set(index, value); + } + unsigned int word_monitor_get(int index) + { + WordMonitor* monitor = WordMonitor::Instance(); + if(monitor) + return monitor->Get(index); + else + return 0; + } +} diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordMonitor.h b/debian/htdig/htdig-3.2.0b6/htword/WordMonitor.h new file mode 100644 index 00000000..c1ce3c7e --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordMonitor.h @@ -0,0 +1,141 @@ +// +// WordMonitor.h +// +// NAME +// monitoring classes activity. +// +// SYNOPSIS +// +// Only called thru WordContext::Initialize() +// +// DESCRIPTION +// +// The test directory contains a <i>benchmark-report</i> script used to generate +// and archive graphs from the output of <i>WordMonitor</i>. +// +// CONFIGURATION +// +// wordlist_monitor_period <sec> (default 0) +// If the value <b>sec</b> is a positive integer, set a timer to +// print reports every <b>sec</b> seconds. The timer is set using +// the ALRM signal and will fail if the calling application already +// has a handler on that signal. +// +// wordlist_monitor_output <file>[,{rrd,readable] (default stderr) +// Print reports on <b>file</b> instead of the default <b>stderr</b>. 
+// If <b>type</b> is set to <b>rrd</b> the output is fit for the +// <i>benchmark-report</b> script. Otherwise it a (hardly :-) readable +// string. +// +// +// END +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordMonitor.h,v 1.5 2004/05/28 13:15:28 lha Exp $ +// +#ifndef _WordMonitor_h_ +#define _WordMonitor_h_ + +#include <stdio.h> +#if TIME_WITH_SYS_TIME +#include <sys/time.h> +#include <time.h> +#else +# if HAVE_SYS_TIME_H +# include <sys/time.h> +# else +# include <time.h> +# endif +#endif + +#define WORD_MONITOR_WRITE 1 +#define WORD_MONITOR_READ 2 +#define WORD_MONITOR_COMPRESS_01 3 +#define WORD_MONITOR_COMPRESS_02 4 +#define WORD_MONITOR_COMPRESS_03 5 +#define WORD_MONITOR_COMPRESS_04 6 +#define WORD_MONITOR_COMPRESS_05 7 +#define WORD_MONITOR_COMPRESS_06 8 +#define WORD_MONITOR_COMPRESS_07 9 +#define WORD_MONITOR_COMPRESS_08 10 +#define WORD_MONITOR_COMPRESS_09 11 +#define WORD_MONITOR_COMPRESS_10 12 +#define WORD_MONITOR_COMPRESS_MORE 13 +#define WORD_MONITOR_PAGE_IBTREE 14 +#define WORD_MONITOR_PAGE_LBTREE 15 +#define WORD_MONITOR_PAGE_UNKNOWN 16 +#define WORD_MONITOR_PUT 17 +#define WORD_MONITOR_GET 18 +#define WORD_MONITOR_GET_NEXT 19 +#define WORD_MONITOR_GET_SET_RANGE 20 +#define WORD_MONITOR_GET_OTHER 21 +#define WORD_MONITOR_LEVEL 22 +#define WORD_MONITOR_PGNO 23 +#define WORD_MONITOR_CMP 24 + +#define WORD_MONITOR_VALUES_SIZE 50 + +#ifdef __cplusplus +extern "C" { +#endif + + void word_monitor_click(); + void word_monitor_add(int index, unsigned int value); + void word_monitor_set(int index, unsigned int value); + unsigned int word_monitor_get(int index); + +#ifdef __cplusplus +} +#endif + +#ifdef __cplusplus + +#include "Configuration.h" +#include "htString.h" + +class WordMonitor { + 
public: + WordMonitor(const Configuration &config); + ~WordMonitor(); + + // + // Unique instance handlers + // + static void Initialize(const Configuration& config); + static WordMonitor* Instance() { return instance; } + + void Add(int index, unsigned int value) { values[index] += value; } + void Set(int index, unsigned int value) { values[index] = value; } + unsigned int Get(int index) { return values[index]; } + + const String Report() const; + + void TimerStart(); + void TimerClick(int signal); + void TimerStop(); + + private: + unsigned int values[WORD_MONITOR_VALUES_SIZE]; + unsigned int old_values[WORD_MONITOR_VALUES_SIZE]; + time_t started; + time_t elapsed; + int period; + FILE* output; + int output_style; + static char* values_names[WORD_MONITOR_VALUES_SIZE]; + + // + // Unique instance pointer + // + static WordMonitor* instance; +}; + +#endif /* __cplusplus */ + +#endif /* _WordMonitor_h_ */ + diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordRecord.cc b/debian/htdig/htdig-3.2.0b6/htword/WordRecord.cc new file mode 100644 index 00000000..6f5ea443 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordRecord.cc @@ -0,0 +1,144 @@ +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// +// WordRecord.cc +// +// WordRecord: data portion of the inverted index database +// +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> + +#include "WordRecord.h" + +// +// WordRecord implementation +// + +// +// Convert the whole structure to an ascii string description +// +int +WordRecord::Get(String& buffer) const +{ + buffer.trunc(); + + switch(type) { + + case WORD_RECORD_DATA: + buffer << info.data; + break; + + case WORD_RECORD_STATS: + buffer << info.stats.noccurrence << 
"\t"; + buffer << info.stats.ndoc; + break; + + case WORD_RECORD_NONE: + break; + + default: + fprintf(stderr, "WordRecord::Get: unknown type %d\n", type); + return NOTOK; + break; + } + + return OK; +} + +String +WordRecord::Get() const +{ + String tmp; + Get(tmp); + return tmp; +} + +// +// Set a record from an ascii representation +// +int +WordRecord::Set(const String& buffer) +{ + StringList fields(buffer, "\t "); + return SetList(fields); +} + +int +WordRecord::SetList(StringList& fields) +{ + int i = 0; + + switch(type) + { + + case WORD_RECORD_DATA: + { + String* field = (String*)fields.Get_First(); + + if(field == 0) { + fprintf(stderr, "WordRecord::Set: failed to retrieve field %d\n", i); + return NOTOK; + } + info.data = (unsigned int)atoi(field->get()); + fields.Remove(field); + i++; + } + break; + + case WORD_RECORD_STATS: + { + String* field = (String*)fields.Get_First(); + + if(field == 0) { + fprintf(stderr, "WordRecord::Set: failed to retrieve field %d\n", i); + return NOTOK; + } + info.stats.noccurrence = (unsigned int)atoi(field->get()); + fields.Remove(field); + i++; + + field = (String*)fields.Get_First(); + + if(field == 0) { + fprintf(stderr, "WordRecord::Set: failed to retrieve field %d\n", i); + return NOTOK; + } + info.stats.ndoc = (unsigned int)atoi(field->get()); + fields.Remove(field); + i++; + } + break; + + case WORD_RECORD_NONE: + break; + + default: + fprintf(stderr, "WordRecord::Set: unknown type %d\n", type); + break; + } + + return OK; +} + +int +WordRecord::Write(FILE* f) const +{ + String tmp; + Get(tmp); + fprintf(f, "%s", (char*)tmp); + return 0; +} + +void +WordRecord::Print() const +{ + Write(stderr); +} diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordRecord.h b/debian/htdig/htdig-3.2.0b6/htword/WordRecord.h new file mode 100644 index 00000000..feeff089 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordRecord.h @@ -0,0 +1,198 @@ +// +// WordRecord.h +// +// NAME +// inverted index record. 
+// +// SYNOPSIS +// +// #include <WordRecord.h> +// +// WordRecord record(); +// if(record.DefaultType() == WORD_RECORD_DATA) { +// record.info.data = ... +// } +// +// DESCRIPTION +// +// The record can only contain one integer, if the default record +// type (see CONFIGURATION in <i>WordKeyInfo</i>) is set to <i>DATA.</i> +// If the default type is set to <i>NONE</i> the record does not contain +// any usable information. +// +// ASCII FORMAT +// +// If default type is <i>DATA</i> it is the decimal representation of +// an integer. If default type is <i>NONE</i> it is the empty string. +// +// END +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordRecord.h,v 1.10 2004/05/28 13:15:28 lha Exp $ +// + +#ifndef _WordRecord_h_ +#define _WordRecord_h_ + +#ifndef SWIG +#include "HtPack.h" +#include "StringList.h" +#include "Configuration.h" +#include "WordRecordInfo.h" +#endif /* SWIG */ + +/* And this is how we will compress this structure, for disk + storage. See HtPack.h (If there's a portable method by + which this format string does not have to be specified at + all, it should be preferred. For now, at least it is kept + here, together with the actual struct declaration.) + + Since none of the values are non-zero, we want to use + unsigned chars and unsigned short ints when possible. */ + +#ifndef SWIG +#define WORD_RECORD_DATA_FORMAT "u" +#define WORD_RECORD_STATS_FORMAT "u2" +#endif /* SWIG */ + +// +// Statistical information on a word +// +class WordRecordStat { + public: + unsigned int noccurrence; + unsigned int ndoc; +}; + +// +// The data members of WordRecord. Should really be a union but +// is quite difficult to handle properly for scripting language +// interfaces. 
+// +class WordRecordStorage { + public: + // + // Arbitrary data + // + unsigned int data; + // + // Statistical data used by WordStat + // + WordRecordStat stats; +}; + +// +// Describe the data associated with a key (WordKey) +// +// If type is: +// WORD_RECORD_DATA info.data is valid +// WORD_RECORD_STATS info.stats is valid +// WORD_RECORD_NONE nothing valid +// +class WordRecord +{ + public: + WordRecord() { Clear(); } + + void Clear() { memset((char*)&info, '\0', sizeof(info)); type = DefaultType(); } + +#ifndef SWIG + // + // Convenience functions to access key structure information (see WordKeyInfo.h) + // + static inline const WordRecordInfo* Info() { return WordRecordInfo::Instance(); } +#endif /* SWIG */ + static inline int DefaultType() { return Info()->default_type; } + +#ifndef SWIG + int Pack(String& packed) const { + switch(type) { + + case WORD_RECORD_DATA: + packed = htPack(WORD_RECORD_DATA_FORMAT, (char *)&info.data); + break; + + case WORD_RECORD_STATS: + packed = htPack(WORD_RECORD_STATS_FORMAT, (char *)&info.stats); + break; + + case WORD_RECORD_NONE: + packed.trunc(); + break; + + default: + fprintf(stderr, "WordRecord::Pack: unknown type %d\n", type); + return NOTOK; + break; + } + return OK; + } + + int Unpack(const String& packed) { + String decompressed; + + switch(type) { + + case WORD_RECORD_DATA: + decompressed = htUnpack(WORD_RECORD_DATA_FORMAT, packed); + if(decompressed.length() != sizeof(info.data)) { + fprintf(stderr, "WordRecord::Unpack: decoding mismatch\n"); + return NOTOK; + } + memcpy((char*)&info.data, (char*)decompressed, sizeof(info.data)); + break; + + case WORD_RECORD_STATS: + decompressed = htUnpack(WORD_RECORD_STATS_FORMAT, packed); + if(decompressed.length() != sizeof(info.stats)) { + fprintf(stderr, "WordRecord::Unpack: decoding mismatch\n"); + return NOTOK; + } + memcpy((char*)&info.stats, (char*)decompressed, sizeof(info.stats)); + break; + + case WORD_RECORD_NONE: + break; + + default: + fprintf(stderr, 
"WordRecord::Pack: unknown type %d\n", (int)type); + return NOTOK; + break; + } + + return OK; + } +#endif /* SWIG */ + +#ifndef SWIG + // + // Set the whole structure from ASCII string description + // + int Set(const String& bufferin); + int SetList(StringList& fields); + // + // Convert the whole structure to an ASCII string description + // + int Get(String& bufferout) const; + String Get() const; +#endif /* SWIG */ + +#ifndef SWIG + // + // Print object in ASCII form on FILE (uses Get) + // + int Write(FILE* f) const; +#endif /* SWIG */ + void Print() const; + + unsigned char type; + WordRecordStorage info; +}; + +#endif /* _WordRecord_h_ */ + diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordRecordInfo.cc b/debian/htdig/htdig-3.2.0b6/htword/WordRecordInfo.cc new file mode 100644 index 00000000..a9a25385 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordRecordInfo.cc @@ -0,0 +1,51 @@ +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// +// WordRecordInfo.cc +// +// WordRecord: data portion of the inverted index database +// +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "Configuration.h" +#include "WordRecordInfo.h" + +WordRecordInfo* WordRecordInfo::instance = 0; + +// +// WordRecordInfo implementation +// +void +WordRecordInfo::Initialize(const Configuration &config) +{ + if(instance != 0) + delete instance; + instance = new WordRecordInfo(config); +} + +WordRecordInfo::WordRecordInfo(const Configuration& config) +{ + default_type = WORD_RECORD_INVALID; + const String &recorddesc = config["wordlist_wordrecord_description"]; + if(!recorddesc.nocase_compare("data")) + { + default_type = WORD_RECORD_DATA; + } + else + if(!recorddesc.nocase_compare("none") || 
recorddesc.empty()) + { + default_type = WORD_RECORD_NONE; + } + else + { + fprintf(stderr, "WordRecordInfo::WordRecordInfo: invalid wordlist_wordrecord_description: %s\n", (const char*)recorddesc); + } +} + diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordRecordInfo.h b/debian/htdig/htdig-3.2.0b6/htword/WordRecordInfo.h new file mode 100644 index 00000000..7f4f59ff --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordRecordInfo.h @@ -0,0 +1,83 @@ +// +// WordRecord.h +// +// NAME +// information on the record structure of the inverted index. +// +// SYNOPSIS +// +// Only called thru WordContext::Initialize() +// +// DESCRIPTION +// +// The structure of a record is very limited. It can contain +// at most two integer (int) values. +// +// CONFIGURATION +// +// wordlist_wordrecord_description {NONE|DATA} (no default) +// NONE: the record is empty +// <br> +// DATA: the record contains two integers (int) +// +// +// END +// +// WordRecord: Record for storing word information in the word database +// Each word occurrence is stored as a separate key/record pair. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordRecordInfo.h,v 1.4 2004/05/28 13:15:28 lha Exp $ +// + +#ifndef _WordRecordInfo_h_ +#define _WordRecordInfo_h_ + +// +// Possible values of the type data field +// +#define WORD_RECORD_INVALID 0 +#define WORD_RECORD_DATA 1 +#define WORD_RECORD_STATS 2 +#define WORD_RECORD_NONE 3 + +#ifndef SWIG +// +// Meta information about WordRecord +// +// wordlist_wordrecord_description: DATA +// use WordRecordStorage::data for each word occurent +// wordlist_wordrecord_description: NONE +// or +// wordlist_wordrecord_description not specified +// the data associated with each word occurrence is empty +// +class WordRecordInfo +{ + public: + WordRecordInfo(const Configuration& config); + // + // Unique instance handlers + // + static void Initialize(const Configuration& config); + static WordRecordInfo* Instance() { + if(instance) return instance; + fprintf(stderr, "WordRecordInfo::Instance: no instance\n"); + return 0; + } + + int default_type; + + // + // Unique instance pointer + // + static WordRecordInfo* instance; +}; +#endif /* SWIG */ + +#endif /* _WordRecordInfo_h_ */ diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordReference.cc b/debian/htdig/htdig-3.2.0b6/htword/WordReference.cc new file mode 100644 index 00000000..320ff418 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordReference.cc @@ -0,0 +1,88 @@ +// +// WordReference.cc +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordReference.cc,v 1.8 
2004/05/28 13:15:28 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "WordReference.h" + +int WordReference::Merge(const WordReference& other) +{ + int ret = key.Merge(other.Key()); + record = other.record; + + return ret; +} + +// +// Set the structure from an ascii representation +// +int +WordReference::Set(const String& buffer) +{ + StringList fields(buffer, "\t "); + return SetList(fields); +} + +// +// Set the structure from list of fields +// +int +WordReference::SetList(StringList& fields) +{ + Clear(); + if(key.SetList(fields) != OK || + record.SetList(fields) != OK) + return NOTOK; + else + return OK; +} + +// +// Convert the whole structure to an ascii string description +// +int +WordReference::Get(String& buffer) const +{ + String tmp; + buffer.trunc(); + + if(key.Get(tmp) != OK) return NOTOK; + buffer.append(tmp); + + if(record.Get(tmp) != OK) return NOTOK; + buffer.append(tmp); + + return OK; +} + +String +WordReference::Get() const +{ + String tmp; + key.Get(tmp); + return tmp; +} + +int WordReference::Write(FILE* f) const +{ + String tmp; + key.Get(tmp); + fprintf(f, "%s", (char*)tmp); + return 0; +} + +void WordReference::Print() const +{ + Write(stderr); +} diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordReference.h b/debian/htdig/htdig-3.2.0b6/htword/WordReference.h new file mode 100644 index 00000000..b6e1215d --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordReference.h @@ -0,0 +1,263 @@ +// +// WordReference.h +// +// NAME +// inverted index occurrence. +// +// SYNOPSIS +// +// #include <WordReference.h> +// +// WordReference wordRef("word"); +// WordReference wordRef(); +// WordReference wordRef(WordKey("key <DEF> 1 2"), WordRecord()); +// +// WordKey& key = wordRef.Key(); +// WordKey& record = wordRef.Record(); +// +// wordRef.Clear(); +// +// DESCRIPTION +// +// A <i>WordReference</i> object is an agregate of a <i>WordKey</i> object +// and a <i>WordRecord</i> object. 
+// +// ASCII FORMAT +// +// The ASCII description is a string with fields separated by tabs or +// white space. It is made of the ASCII description of a +// <i>WordKey</i> object immediately followed by the ASCII +// description of a <i>WordRecord</i> object. See the corresponding +// manual pages for more information. +// +// END +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordReference.h,v 1.7 2004/05/28 13:15:28 lha Exp $ +// +#ifndef _WordReference_h_ +#define _WordReference_h_ + +#ifndef SWIG +#include "htString.h" +#include "WordRecord.h" +#include "WordKey.h" +#endif /* SWIG */ + +// +// Describe the WordKey/WordRecord pair +// +class WordReference : public Object +{ + public: + // + // Construction/Destruction + //- + // Constructor. Build an object with empty key and empty record. + // + WordReference() {} +#ifndef SWIG + //- + // Constructor. Build an object from disk representation of <b>key</b> + // and <b>record</b>. + // + WordReference(const String& key0, const String& record0) { + Unpack(key0, record0); + } + //- + // Constructor. Build an object with key word set to <b>word</b> + // and otherwise empty and empty record. + // + WordReference(const String& word) { + Clear(); + key.SetWord(word); + } +#endif /* SWIG */ + ~WordReference() {} + + //- + // Reset to empty key and record + // + void Clear() { key.Clear(); record.Clear(); } + + // + // Accessors + //- + // Return the key object. + // + WordKey& Key() { return key; } +#ifndef SWIG + //- + // Return the key object as const. + // + const WordKey& Key() const { return key; } +#endif /* SWIG */ + //- + // Return the record object. + // + WordRecord& Record() { return record; } +#ifndef SWIG + //- + // Return the record object as const. 
+ // + const WordRecord& Record() const { return record; } +#endif /* SWIG */ + + // + // Conversion + // +#ifdef SWIG +%name(SetKey) +#endif /* SWIG */ + //- + // Copy <b>arg</b> in the key part of the object. + // + void Key(const WordKey& arg) { key = arg; } +#ifndef SWIG + //- + // Set key structure from disk storage format as found in + // <b>packed</b> string. + // Return OK if successfull, NOTOK otherwise. + // + int KeyUnpack(const String& packed) { return key.Unpack(packed); } + // + //- + // Convert key object into disk storage format as found in + // return the resulting string. + // + String KeyPack() const { String tmp; key.Pack(tmp); return tmp; } + //- + // Convert key object into disk storage format as found in + // and place the result in <b>packed</b> string. + // Return OK if successfull, NOTOK otherwise. + // + int KeyPack(String& packed) const { return key.Pack(packed); } +#endif /* SWIG */ + +#ifdef SWIG +%name(SetRecord) +#endif /* SWIG */ + //- + // Copy <b>arg</b> in the record part of the object. + // + void Record(const WordRecord& arg) { record = arg; } +#ifndef SWIG + //- + // Set record structure from disk storage format as found in + // <b>packed</b> string. + // Return OK if successfull, NOTOK otherwise. + // + int RecordUnpack(const String& packed) { return record.Unpack(packed); } + //- + // Convert record object into disk storage format as found in + // return the resulting string. + // + String RecordPack() const { String tmp; record.Pack(tmp); return tmp; } + //- + // Convert record object into disk storage format as found in + // and place the result in <b>packed</b> string. + // Return OK if successfull, NOTOK otherwise. + // + int RecordPack(String& packed) const { return record.Pack(packed); } + + //- + // Short hand for KeyPack(<b>ckey</b>) RecordPack(<b>crecord</b>). 
+ // + inline int Pack(String& ckey, String& crecord) const { + if(key.Pack(ckey) == NOTOK) return NOTOK; + if(record.Pack(crecord) == NOTOK) return NOTOK; + return OK; + } + //- + // Short hand for KeyUnpack(<b>ckey</b>) RecordUnpack(<b>crecord</b>). + // + int Unpack(const String& ckey, const String& crecord) { + if(key.Unpack(ckey) == NOTOK) return NOTOK; + if(record.Unpack(crecord) == NOTOK) return NOTOK; + return OK; + } +#endif /* SWIG */ + + // + // Transformations + // + //- + // Merge key with other.Key() using the <i>WordKey::Merge</i> method: + // key.Merge(other.Key()). + // See the corresponding manual page for details. Copy other.record + // into the record part of the object. + // + int Merge(const WordReference& other); +#ifndef SWIG + //- + // Copy <b>master</b> before merging with <b>master.</b>Merge(<b>slave</b>) + // and return the copy. Prevents alteration of <b>master</b>. + // + static WordReference Merge(const WordReference& master, const WordReference& slave) { + WordReference tmp(master); + tmp.Merge(slave); + return tmp; + } +#endif /* SWIG */ + +#ifndef SWIG + int compare(Object *to) { String word(((WordReference *) to)->key.GetWord()); return key.GetWord().nocase_compare(word); } +#endif /* SWIG */ + +#ifndef SWIG + // + // Set the whole structure from ASCII string description + // + //- + // Set the whole structure from ASCII string in <b>bufferin</b>. + // See <i>ASCII FORMAT</i> section. + // Return OK if successfull, NOTOK otherwise. + // + int Set(const String& bufferin); + int SetList(StringList& fields); + //- + // Convert the whole structure to an ASCII string description + // in <b>bufferout.</b> + // See <i>ASCII FORMAT</i> section. + // Return OK if successfull, NOTOK otherwise. + // + int Get(String& bufferout) const; + //- + // Convert the whole structure to an ASCII string description + // and return it. + // See <i>ASCII FORMAT</i> section. 
+ // + String Get() const; +#endif /* SWIG */ + + // + // Debuging + // +#ifndef SWIG + //- + // Print object in ASCII form on <b>f</b> (uses <i>Get</i> method). + // See <i>ASCII FORMAT</i> section. + // + int Write(FILE* f) const; +#endif /* SWIG */ + //- + // Print object in ASCII form on <b>stdout</b> (uses <i>Get</i> method). + // See <i>ASCII FORMAT</i> section. + // + void Print() const; + + protected: + +#ifndef SWIG + WordKey key; + WordRecord record; +#endif /* SWIG */ +}; + +#endif /* _WordReference_h */ + diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordStat.cc b/debian/htdig/htdig-3.2.0b6/htword/WordStat.cc new file mode 100644 index 00000000..cd9cb358 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordStat.cc @@ -0,0 +1,19 @@ +// +// WordStat.cc +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordStat.cc,v 1.5 2004/05/28 13:15:28 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "WordStat.h" + +WordReference* WordStat::word_stat_last = 0; diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordStat.h b/debian/htdig/htdig-3.2.0b6/htword/WordStat.h new file mode 100644 index 00000000..b2889687 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordStat.h @@ -0,0 +1,60 @@ +// +// WordStat.h +// +// WordStat: Kind of record that holds statistics about each distinct word +// in the database. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordStat.h,v 1.5 2004/05/28 13:15:28 lha Exp $ +// +#ifndef _WordStat_h_ +#define _WordStat_h_ + +#include "WordReference.h" + +class WordStat : public WordReference +{ + public: + // + // Construction/Destruction + // + WordStat() { record.type = WORD_RECORD_STATS; } + WordStat(const String& key_arg, const String& record_arg) : WordReference(key_arg, record_arg) { + record.type = WORD_RECORD_STATS; + } + WordStat(const String& word) { + Clear(); + key.SetWord(String("\001") + word); + record.type = WORD_RECORD_STATS; + } + + ~WordStat() {} + + // + // Accessors + // + unsigned int Noccurrence() const { return record.info.stats.noccurrence; } + unsigned int &Noccurrence() { return record.info.stats.noccurrence; } + + // + // Return upper boundary key of reference count records + // + static inline const WordReference& Last() { + if(!word_stat_last) + word_stat_last = new WordReference("\002"); + return *word_stat_last; + } + + protected: + + static WordReference* word_stat_last; +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordType.cc b/debian/htdig/htdig-3.2.0b6/htword/WordType.cc new file mode 100644 index 00000000..355f1380 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordType.cc @@ -0,0 +1,219 @@ +// +// WordType.cc +// +// WordType: Wrap some attributes to make is...() type +// functions and other common functions without having to manage +// the attributes or the exact attribute combination semantics. 
+// Configuration parameter used: +// valid_punctuation,extra_word_characters,minimum_word_length, +// maximum_word_length,allow_numbers,bad_word_list +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordType.cc,v 1.9 2004/05/28 13:15:28 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <ctype.h> +#include <stdio.h> + +#include "WordType.h" + +WordType* WordType::instance = 0; + +void +WordType::Initialize(const Configuration &config_arg) +{ + if(instance != 0) + delete instance; + instance = new WordType(config_arg); +} + +WordType::WordType(const Configuration &config) +{ + const String valid_punct = config["valid_punctuation"]; + const String extra_word_chars = config["extra_word_characters"]; + + minimum_length = config.Value("minimum_word_length", 3); + maximum_length = config.Value("maximum_word_length", 12); + allow_numbers = config.Boolean("allow_numbers", 0); + + extra_word_characters = extra_word_chars; + valid_punctuation = valid_punct; + other_chars_in_word = extra_word_chars; + other_chars_in_word.append(valid_punct); + + chrtypes[0] = 0; + for (int i = 1; i < 256; i++) + { + chrtypes[i] = 0; + if (isalpha(i)) + chrtypes[i] |= WORD_TYPE_ALPHA; + if (isdigit(i)) + chrtypes[i] |= WORD_TYPE_DIGIT; + if (iscntrl(i)) + chrtypes[i] |= WORD_TYPE_CONTROL; + if (strchr(extra_word_chars, i)) + chrtypes[i] |= WORD_TYPE_EXTRA; + if (strchr(valid_punct, i)) + chrtypes[i] |= WORD_TYPE_VALIDPUNCT; + } + + { + const String filename = config["bad_word_list"]; + FILE *fl = fopen(filename, "r"); + char buffer[1000]; + char *word; + String new_word; + + // Read in the badwords file (it's just a text file) + while (fl && fgets(buffer, sizeof(buffer), fl)) + { + word = 
strtok(buffer, "\r\n \t"); + if (word && *word) + { + int flags; + new_word = word; + if((flags = Normalize(new_word)) & WORD_NORMALIZE_NOTOK) { + fprintf(stderr, "WordType::WordType: reading bad words from %s found %s, ignored because %s\n", (const char*)filename, word, (char*)NormalizeStatus(flags & WORD_NORMALIZE_NOTOK)); + } else { + badwords.Add(new_word, 0); + } + } + } + + if (fl) + fclose(fl); + } +} + +WordType::~WordType() +{ +} + +// +// Normalize a word according to configuration specifications and +// builting transformations. +// *EVERY* word inserted in the inverted index goes thru this. If +// a word is rejected by Normalize there is 0% chance to find it +// in the word database. +// +int +WordType::Normalize(String& word) const +{ + int status = WORD_NORMALIZE_GOOD; + + // + // Reject empty strings, always + // + if(word.empty()) + return status | WORD_NORMALIZE_NULL; + + // + // Always convert to lowercase + // + if(word.lowercase()) + status |= WORD_NORMALIZE_CAPITAL; + + // + // Remove punctuation characters according to configuration + // + if(StripPunctuation(word)) + status |= WORD_NORMALIZE_PUNCTUATION; + + // + // Truncate words too long according to configuration + // + if(word.length() > maximum_length) { + word.chop(word.length() - maximum_length); + status |= WORD_NORMALIZE_TOOLONG; + } + + // + // Reject words too short according to configuration + // + if(word.length() < minimum_length) + return status | WORD_NORMALIZE_TOOSHORT; + + // + // Reject if contains control characters + // + int alpha = 0; + for(const unsigned char *p = (const unsigned char*)(const char*)(char *)word; *p; p++) { + if(IsStrictChar(*p) && (allow_numbers || !IsDigit(*p))) { + alpha = 1; + } else if(IsControl(*p)) { + return status | WORD_NORMALIZE_CONTROL; + } + } + + // + // Reject if contains no alpha characters (according to configuration) + // + if(!alpha) return status | WORD_NORMALIZE_NOALPHA; + + // + // Reject if listed in config[bad_word_list] + // + 
if(badwords.Exists(word)) + return status | WORD_NORMALIZE_BAD; + + // + // Accept and report the transformations that occured + // + return status; +} + +// +// Convert the integer status into a readable string +// +String +WordType::NormalizeStatus(int flags) +{ + String tmp; + + if(flags & WORD_NORMALIZE_TOOLONG) tmp << "TOOLONG "; + if(flags & WORD_NORMALIZE_TOOSHORT) tmp << "TOOSHORT "; + if(flags & WORD_NORMALIZE_CAPITAL) tmp << "CAPITAL "; + if(flags & WORD_NORMALIZE_NUMBER) tmp << "NUMBER "; + if(flags & WORD_NORMALIZE_CONTROL) tmp << "CONTROL "; + if(flags & WORD_NORMALIZE_BAD) tmp << "BAD "; + if(flags & WORD_NORMALIZE_NULL) tmp << "NULL "; + if(flags & WORD_NORMALIZE_PUNCTUATION) tmp << "PUNCTUATION "; + if(flags & WORD_NORMALIZE_NOALPHA) tmp << "NOALPHA "; + + if(tmp.empty()) tmp << "GOOD"; + + return tmp; +} + +// +// Non-destructive tokenizer using external int as pointer into String +// does word separation by our rules (so it can be subclassed too) +// +String +WordType::WordToken(const String tokens, int ¤t) const +{ + unsigned char text = tokens[current]; + String ret; + + while (text && !IsStrictChar(text)) + text = tokens[++current]; + + if (text) + { + while (text && IsChar(text)) + { + ret << text; + text = tokens[++current]; + } + } + return ret; +} diff --git a/debian/htdig/htdig-3.2.0b6/htword/WordType.h b/debian/htdig/htdig-3.2.0b6/htword/WordType.h new file mode 100644 index 00000000..8406104e --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/WordType.h @@ -0,0 +1,157 @@ +// +// WordType.h +// +// WordType: Wrap some attributes to make is...() type +// functions and other common functions without having to manage +// the attributes or the exact attribute combination semantics. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordType.h,v 1.5 2004/05/28 13:15:28 lha Exp $ +// + +#ifndef _WordType_h +#define _WordType_h + +#include "htString.h" +#include "Configuration.h" +// +// Return values of Normalize, to get them in string form use NormalizeStatus +// +#define WORD_NORMALIZE_GOOD 0x0000 +#define WORD_NORMALIZE_TOOLONG 0x0001 +#define WORD_NORMALIZE_TOOSHORT 0x0002 +#define WORD_NORMALIZE_CAPITAL 0x0004 +#define WORD_NORMALIZE_NUMBER 0x0008 +#define WORD_NORMALIZE_CONTROL 0x0010 +#define WORD_NORMALIZE_BAD 0x0020 +#define WORD_NORMALIZE_NULL 0x0040 +#define WORD_NORMALIZE_PUNCTUATION 0x0080 +#define WORD_NORMALIZE_NOALPHA 0x0100 + +// +// Under these conditions the word is said to be invalid. +// Some conditions (NUMBER,TOOSHORT and BAD) depends on the configuration +// parameters. 
+// +#define WORD_NORMALIZE_NOTOK (WORD_NORMALIZE_TOOSHORT| \ + WORD_NORMALIZE_NUMBER| \ + WORD_NORMALIZE_CONTROL| \ + WORD_NORMALIZE_BAD| \ + WORD_NORMALIZE_NULL| \ + WORD_NORMALIZE_NOALPHA) + +class WordType +{ +public: + // + // Constructors + // + WordType(const Configuration& config); + + // + // Destructor + // + virtual ~WordType(); + + // + // Unique instance handlers + // + static void Initialize(const Configuration& config); + static WordType* Instance() { + if(instance) return instance; + fprintf(stderr, "WordType::Instance: no instance\n"); + return 0; + } + + // + // Predicates + // + virtual int IsChar(int c) const; + virtual int IsStrictChar(int c) const; + virtual int IsDigit(int c) const; + virtual int IsControl(int c) const; + + // + // Transformations + // + virtual int StripPunctuation(String &s) const; + virtual int Normalize(String &s) const; + + // + // Splitting + // + virtual String WordToken(const String s, int &pointer) const; + + // + // Error handling + // + static String NormalizeStatus(int flags); + +private: + + String valid_punctuation; // The same as the attribute. + String extra_word_characters; // Likewise. + String other_chars_in_word; // Attribute "valid_punctuation" plus + // "extra_word_characters". + char chrtypes[256]; // quick lookup table for types + int minimum_length; // Minimum word length + int maximum_length; // Maximum word length + int allow_numbers; // True if a word may contain numbers + Dictionary badwords; // List of excluded words + + // + // Unique instance pointer + // + static WordType* instance; +}; + +// Bits to set in chrtypes[]: +#define WORD_TYPE_ALPHA 0x01 +#define WORD_TYPE_DIGIT 0x02 +#define WORD_TYPE_EXTRA 0x04 +#define WORD_TYPE_VALIDPUNCT 0x08 +#define WORD_TYPE_CONTROL 0x10 + +// One for characters that when put together are a word +// (including punctuation). 
+inline int +WordType::IsChar(int c) const +{ + return (chrtypes[(unsigned char)c] & (WORD_TYPE_ALPHA|WORD_TYPE_DIGIT|WORD_TYPE_EXTRA|WORD_TYPE_VALIDPUNCT)) != 0; +} + +// Similar, but no punctuation characters. +inline int +WordType::IsStrictChar(int c) const +{ + return (chrtypes[(unsigned char)c] & (WORD_TYPE_ALPHA|WORD_TYPE_DIGIT|WORD_TYPE_EXTRA)) != 0; +} + +// Reimplementation of isdigit() using the lookup table chrtypes[] +inline int +WordType::IsDigit(int c) const +{ + return (chrtypes[(unsigned char)c] & WORD_TYPE_DIGIT) != 0; +} + +// Similar to IsDigit, but for iscntrl() +inline int +WordType::IsControl(int c) const +{ + return (chrtypes[(unsigned char)c] & WORD_TYPE_CONTROL) != 0; +} + +// Let caller get rid of getting and holding a configuration parameter. +inline int +WordType::StripPunctuation(String &s) const +{ + return s.remove(valid_punctuation); +} + + +#endif /* __WordType_h */ diff --git a/debian/htdig/htdig-3.2.0b6/htword/word.desc b/debian/htdig/htdig-3.2.0b6/htword/word.desc new file mode 100644 index 00000000..7de66973 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htword/word.desc @@ -0,0 +1,15 @@ +# +# Structure of a key +# + +nfields 4 + +#NAME SIZE SORTPOSITION + +Location 16 3 + +Flags 8 2 + +DocID 32 1 + +Word 0 0 |