diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/contrib')
59 files changed, 7957 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/contrib/README b/debian/htdig/htdig-3.2.0b6/contrib/README new file mode 100644 index 00000000..d7c57ea3 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/README @@ -0,0 +1,34 @@ +ht://Dig contributed scripts + +This directory tree contains perl and shell programs that attempt to +do things with the generated databases. Most of these were written +for a very specific purpose for the specific version of ht://Dig that +was current at that point. This means that some of these programs +will be severely broken! Do not expect them to work; use them only as +examples of the types of things you can do with the ht://Dig +databases. + +More contributed work is available on the ht://Dig website: +<http://www.htdig.org/contrib/> + +What's here: + +acroconv.pl An external converter script that uses acroread to parse PDFs +autorun An example of automating the database building +changehost A script to change hostnames of URLs in the databases +conv_doc.pl A sample script to use the conversion features of external_parsers +doclist List the information in the doc db (or after a certain date) +ewswrap Two sample htsearch wrappers to emulate Excite for Web + Servers (EWS) and to simplify queries +handler.pl A sample external_protocols script to handle HTTP/HTTPS using curl +htparsedoc A sample shell script to parse Word documents +multidig A set of scripts to simplify updating multiple databases +parse_doc.pl A general external parser script that handles MS Word documents + (among others) +run-robot.sh Another example of automating the database building +scriptname An example of using htsearch within dynamic SSI pages +status.pl Build a status page of last 5 runs and top 10 + servers (by # URLs) +urlindex Build an index of all the URLs in the database +whatsnew Build a "what's new" page with custom header and footer +wordfreq Build a list of words and frequency in the database diff --git a/debian/htdig/htdig-3.2.0b6/contrib/acroconv.pl 
b/debian/htdig/htdig-3.2.0b6/contrib/acroconv.pl new file mode 100755 index 00000000..ad7d4d79 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/acroconv.pl @@ -0,0 +1,93 @@ +#!/usr/local/bin/perl +# +# Sample external converter for htdig 3.1.4 or later, to convert PDFs +# using Adobe Acrobat 3's acroread -toPostScript option on UNIX systems. +# (Use it in place of conv_doc.pl if you have acroread but not pdftotext.) +# Written by Gilles Detillieux. +# +# Usage: (in htdig.conf) +# +# external_parsers: application/pdf->text/html /usr/local/bin/acroconv.pl +# +# This is a pretty quick and dirty implementation, but it does seem to +# give functionality equivalent to the now defunct htdig/PDF.cc parser. +# I'm not a Perl expert by any stretch of the imagination, so the code +# could probably use a lot of optimization to make it work better. +# + +$watch = 0; +$bigspace = 0; +$putspace = 0; +$putbody = 1; + +system("ln $ARGV[0] $ARGV[0].pdf; acroread -toPostScript $ARGV[0].pdf"); +open(INP, "< $ARGV[0].ps") || die "Can't open $ARGV[0].ps\n"; + +print "<HTML>\n<head>\n"; +while (<INP>) { + if (/^%%Title: / && $putbody) { + s/^%%Title: \((.*)\).*\n/$1/; + s/\\222/'/g; + s/\\267/*/g; + s/\\336/fi/g; + s/\\([0-7]{3})/pack(C, oct($1))/eig; + s/\\([0-7]{2})/pack(C, oct($1))/eig; + s/\\([0-7])/pack(C, oct($1))/eig; + s/\\[nrtbf]/ /g; + s/\\(.)/$1/g; + s/&/\&\;/g; + s/</\<\;/g; + s/>/\>\;/g; + print "<title>$_</title>\n"; + print "</head>\n<body>\n"; + $putbody = 0; + } elsif (/^BT/) { + $watch = 1; + } elsif (/^ET/) { + $watch = 0; + if ($putspace) { + print "\n"; + $putspace = 0; + } + } elsif ($watch) { + if (/T[Jj]$/) { + s/\)[^(]*\(//g; + s/^[^(]*\((.*)\).*\n/$1/; + s/\\222/'/g; + s/\\267/*/g; + s/\\336/fi/g; + s/\\([0-7]{3})/pack(C, oct($1))/eig; + s/\\([0-7]{2})/pack(C, oct($1))/eig; + s/\\([0-7])/pack(C, oct($1))/eig; + s/\\[nrtbf]/ /g; + s/\\(.)/$1/g; + if ($bigspace) { + s/(.)/$1 /g; + } + s/&/\&\;/g; + s/</\<\;/g; + s/>/\>\;/g; + if ($putbody) { + print 
"</head>\n<body>\n"; + $putbody = 0; + } + print "$_"; + $putspace = 1; + } elsif (/T[Ddm*]$/ && $putspace) { + print "\n"; + $putspace = 0; + } elsif (/Tc$/) { + $bigspace = 0; + if (/^([3-9]|[1-9][0-9]+)\..*Tc$/) { + $bigspace = 1; + } + } + } +} +if ($putbody) { + print "</head>\n<body>\n"; +} +print "</body>\n</HTML>\n"; + +close(INP); +system("rm -f $ARGV[0].pdf $ARGV[0].ps"); diff --git a/debian/htdig/htdig-3.2.0b6/contrib/autorun/README b/debian/htdig/htdig-3.2.0b6/contrib/autorun/README new file mode 100644 index 00000000..44686879 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/autorun/README @@ -0,0 +1,16 @@ +README for autorun. + +The autorun program is an attempt at automatic the steps +needed to build a complete search database. + +If the search domain is not too big, this can be run on a +daily (nightly) basis. + + +Usage: + autorun + +Configuration: + Edit the autorun script and change things to your + liking... + diff --git a/debian/htdig/htdig-3.2.0b6/contrib/autorun/autorun b/debian/htdig/htdig-3.2.0b6/contrib/autorun/autorun new file mode 100755 index 00000000..6014073a --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/autorun/autorun @@ -0,0 +1,46 @@ +#!/bin/sh + +## +## Configurable variables +## + + +## +## Specify the location of the htdig and htmerge binaries +## +htbin=/opt/www/bin + +## +## Specify the configuration file to use for digging and merging +## +conffile=/opt/www/htdig/sdsu.conf + +## +## Specify the location where the temporary database is +## +source=/tmp + +## +## Specify the location of the target search database +## +target=/gopher/www/htdig + +## +## Specify the host of the target search database +## +search_host=athena + +## +## Specify how to copy the new database to the location +## where the search engine can get at it. 
+## +docopy() { + rcp $source/*.docdb $source/*.docs.index $source/*.words.gdbm ${search_host}:$target +} + + +$htbin/htdig -i -c $conffile +$htbin/htmerge -c $conffile +$htbin/htnotify -vv -c $conffile + +docopy diff --git a/debian/htdig/htdig-3.2.0b6/contrib/changehost/changehost.pl b/debian/htdig/htdig-3.2.0b6/contrib/changehost/changehost.pl new file mode 100755 index 00000000..3bd6c44d --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/changehost/changehost.pl @@ -0,0 +1,298 @@ +#!/usr/local/bin/perl + +## +## changehost.pl (C) 1995 Andrew Scherpbier +## +## This program will change hostnames of URLs in the document database and index. +## +## usage: +## changehost.pl database_base from to +## +## example: +## changehost.pl /opt/www/htdig/sdsu www.sdsu.edu www.northpole.net +## +## Two new database will be created with a base of '/tmp/new'. +## These databases can then be used by htsearch. +## + +use GDBM_File; + +$base = $ARGV[0]; +$from = $ARGV[1]; +$to = $ARGV[2]; + +$dbfile = "$base.docdb"; +$newfile = "/tmp/new.docdb"; + +## +## Convert the document database first. 
+## +tie(%newdb, GDBM_File, $newfile, GDBM_NEWDB, 0644) || die "$newfile: '$!'"; +tie(%docdb, GDBM_File, $dbfile, GDBM_READER, 0) || die "$dbfile: $!"; + + +while (($key, $value) = each %docdb) +{ + if ($key =~ /http:\/\/$from/i) + { + %record = parse_ref_record($value); + $key =~ s/http:\/\/$from/http:\/\/$to/i; + print "$key\n"; + $t = $record{"URL"}; + $t =~ s/http:\/\/$from/http:\/\/$to/i; + $record{"URL"} = $t; + + $value = create_ref_record(%record); + } + + $newdb{$key} = $value; +} + +untie %newdb; +untie %docdb; + +## +## Now create the document index +## +$newfile = "/tmp/new.docs.index"; +$dbfile = "$base.docs.index"; + +tie(%newdb, GDBM_File, $newfile, GDBM_NEWDB, 0644) || die "$newfile: '$!'"; +tie(%docdb, GDBM_File, $dbfile, GDBM_READER, 0) || die "$dbfile: $!"; + +while (($key, $value) = each %docdb) +{ + if ($value =~ /http:\/\/$from/i) + { + $value =~ s/http:\/\/$from/http:\/\/$to/i; + } + $newdb{$key} = $value; +} + +untie %newdb; +untie %docdb; + +###################################################################### +sub create_ref_record +{ + local(%rec) = @_; + local($s); + + if (exists $rec{"ID"}) + { + $s .= pack("Ci", 0, $rec{"ID"}); + } + if (exists $rec{"TIME"}) + { + $s .= pack("Ci", 1, $rec{"TIME"}); + } + if (exists $rec{"ACCESSED"}) + { + $s .= pack("Ci", 2, $rec{"ACCESSED"}); + } + if (exists $rec{"STATE"}) + { + $s .= pack("Ci", 3, $rec{"STATE"}); + } + if (exists $rec{"SIZE"}) + { + $s .= pack("Ci", 4, $rec{"SIZE"}); + } + if (exists $rec{"LINKS"}) + { + $s .= pack("Ci", 5, $rec{"LINKS"}); + } + if (exists $rec{"IMAGESIZE"}) + { + $s .= pack("Ci", 6, $rec{"IMAGESIZE"}); + } + if (exists $rec{"HOPCOUNT"}) + { + $s .= pack("Ci", 7, $rec{"HOPCOUNT"}); + } + if (exists $rec{"URL"}) + { + $s .= pack("Ci", 8, length($rec{"URL"})); + $s .= $rec{"URL"}; + } + if (exists $rec{"HEAD"}) + { + $s .= pack("Ci", 9, length($rec{"HEAD"})); + $s .= $rec{"HEAD"}; + } + if (exists $rec{"TITLE"}) + { + $s .= pack("Ci", 10, length($rec{"TITLE"})); + $s 
.= $rec{"TITLE"}; + } + if (exists $rec{"DESCRIPTIONS"}) + { + @v = split('', $rec{"DESCRIPTIONS"}); + $s .= pack("Ci", 11, $#v - 1); + foreach (@v) + { + $s .= pack("i", length($_)); + $s .= $_; + } + } + if (exists $rec{"ANCHORS"}) + { + @v = split('', $rec{"ANCHORS"}); + $s .= pack("Ci", 12, $#v - 1); + foreach (@v) + { + $s .= pack("i", length($_)); + $s .= $_; + } + } + if (exists $rec{"EMAIL"}) + { + $s .= pack("Ci", 13, length($rec{"EMAIL"})); + $s .= $rec{"EMAIL"}; + } + if (exists $rec{"NOTIFICATION"}) + { + $s .= pack("Ci", 14, length($rec{"NOTIFICATION"})); + $s .= $rec{"NOTIFICATION"}; + } + if (exists $rec{"SUBJECT"}) + { + $s .= pack("Ci", 15, length($rec{"SUBJECT"})); + $s .= $rec{"SUBJECT"}; + } + + return $s; +} + +sub parse_ref_record +{ + local($value) = @_; + local(%rec, $length, $count, $result); + + while (length($value) > 0) + { + $what = unpack("C", $value); + $value = substr($value, 1); + if ($what == 0) + { + # ID + $rec{"ID"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 1) + { + # TIME + $rec{"TIME"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 2) + { + # ACCESSED + $rec{"ACCESSED"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 3) + { + # STATE + $rec{"STATE"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 4) + { + # SIZE + $rec{"SIZE"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 5) + { + # LINKS + $rec{"LINKS"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 6) + { + # IMAGESIZE + $rec{"IMAGESIZE"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 7) + { + # HOPCOUNT + $rec{"HOPCOUNT"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 8) + { + # URL + $length = unpack("i", $value); + $rec{"URL"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 9) + { + # HEAD + $length = 
unpack("i", $value); + $rec{"HEAD"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 10) + { + # TITLE + $length = unpack("i", $value); + $rec{"TITLE"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 11) + { + # DESCRIPTIONS + $count = unpack("i", $value); + $value = substr($value, 4); + $result = ""; + foreach (1 .. $count) + { + $length = unpack("i", $value); + $result = $result . unpack("x4 A$length", $value) . ""; + $value = substr($value, 4 + $length); + } + chop $result; + $rec{"DESCRIPTIONS"} = $result; + } + elsif ($what == 12) + { + # ANCHORS + $count = unpack("i", $value); + $value = substr($value, 4); + $result = ""; + foreach (1 .. $count) + { + $length = unpack("i", $value); + $result = $result . unpack("x4 A$length", $value) . ""; + $value = substr($value, 4 + $length); + } + chop $result; + $rec{"ANCHORS"} = $result; + } + elsif ($what == 13) + { + # EMAIL + $length = unpack("i", $value); + $rec{"EMAIL"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 14) + { + # NOTIFICATION + $length = unpack("i", $value); + $rec{"NOTIFICATION"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 15) + { + # SUBJECT + $length = unpack("i", $value); + $rec{"SUBJECT"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + } + return %rec; +} diff --git a/debian/htdig/htdig-3.2.0b6/contrib/conv_doc.pl b/debian/htdig/htdig-3.2.0b6/contrib/conv_doc.pl new file mode 100755 index 00000000..78d8a985 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/conv_doc.pl @@ -0,0 +1,214 @@ +#!/usr/local/bin/perl + +# +# Sample external converter for htdig 3.1.4 or later. 
+# Usage: (in htdig.conf) +# +# external_parsers: application/msword->text/html /usr/local/bin/conv_doc.pl \ +# application/postscript->text/html /usr/local/bin/conv_doc.pl \ +# application/pdf->text/html /usr/local/bin/conv_doc.pl +# +# Written by Gilles Detillieux <[email protected]>. +# Based in part on the parse_word_doc.pl script, written by +# Jesse op den Brouw <[email protected]> but heavily revised. +# +# 1998/12/11 +# Added: catdoc test (is catdoc runnable?) <[email protected]> +# 1999/02/09 +# Added: uses ps2ascii to handle PS files <[email protected]> +# 1999/02/15 +# Added: check for some file formats <[email protected]> +# 1999/02/25 +# Added: uses pdftotext to handle PDF files <[email protected]> +# 1999/03/01 +# Added: extra checks for file "wrappers" <[email protected]> +# & check for MS Word signature (no longer defaults to catdoc) +# 1999/03/05 +# Changed: rejoin hyphenated words across lines <[email protected]> +# (in PDFs) +# 1999/08/12 +# Changed: adapted for xpdf 0.90 release <[email protected]> +# Added: uses pdfinfo to handle PDF titles <[email protected]> +# Changed: change dashes to hyphens <[email protected]> +# 1999/09/09 +# Changed: fix to handle empty PDF title right <[email protected]> +# 1999/12/01 +# Changed: rewritten as external converter <[email protected]> +# stripped out all parser-related code +# Added: test to silently ignore wrapped EPS files < " > +# Added: test for null device on Win32 env. 
<[email protected]> +# 2000/01/12 +# Changed: "break" to "last" (no break in Perl) <[email protected]> +# 2001/07/12 +# Changed: fix "last" handling in dehyphenation <[email protected]> +# Added: handle %xx codes in title from URL <[email protected]> +######################################### +# +# set this to your MS Word to text converter +# get it from: http://www.fe.msk.ru/~vitus/catdoc/ +# +$CATDOC = "/usr/local/bin/catdoc"; +# +# set this to your WordPerfect to text converter, or /bin/true if none available +# this nabs WP documents with .doc suffix, so catdoc doesn't see them +# +$CATWP = "/bin/true"; +# +# set this to your RTF to text converter, or /bin/true if none available +# this nabs RTF documents with .doc suffix, so catdoc doesn't see them +# +$CATRTF = "/bin/true"; +# +# set this to your PostScript to text converter +# get it from the ghostscript 3.33 (or later) package +# +$CATPS = "/usr/bin/ps2ascii"; +# +# set this to your PDF to text converter, and pdfinfo tool +# get it from the xpdf 0.90 package at http://www.foolabs.com/xpdf/ +# +$CATPDF = "/usr/bin/pdftotext"; +$PDFINFO = "/usr/bin/pdfinfo"; +#$CATPDF = "/usr/local/bin/pdftotext"; +#$PDFINFO = "/usr/local/bin/pdfinfo"; + +######################################### +# +# need some var's +$dehyphenate = 0; # set if we must dehyphenate text output +$ishtml = 0; # set if converter produces HTML +$null = ""; +$magic = ""; +$type = ""; +$cvtr = ""; +$cvtcmd = ""; +$title = ""; +@parts = (); + +# make portable to win32 platform or unix +$null = "/dev/null"; +if ($^O eq "MSWin32") {$null = "nul";} + + +######################################### +# +# Read first bytes of file to check for file type (like file(1) does) +open(FILE, "< $ARGV[0]") || die "Can't open file $ARGV[0]: $!\n"; +read FILE,$magic,8; +close FILE; + +if ($magic =~ /^\0\n/) { # possible MacBinary header + open(FILE, "< $ARGV[0]") || die "Can't open file $ARGV[0]: $!\n"; + read FILE,$magic,136; # let's hope converters can handle them! 
+ close FILE; +} + +if ($magic =~ /%!|^\033%-12345/) { # it's PostScript (or HP print job) + $cvtr = $CATPS; # gs 3.33 leaves _temp_.??? files in . +# keep quiet even if PS gives errors... + $cvtcmd = "(cd /tmp; $cvtr; rm -f _temp_.???) < $ARGV[0] 2>$null"; +# allow PS interpreter to give error messages... +# $cvtcmd = "(cd /tmp; $cvtr; rm -f _temp_.???) < $ARGV[0]"; + $type = "PostScript"; + $dehyphenate = 0; # ps2ascii already does this + if ($magic =~ /^\033%-12345/) { # HP print job + open(FILE, "< $ARGV[0]") || die "Can't open file $ARGV[0]: $!\n"; + read FILE,$magic,256; + close FILE; + exit unless $magic =~ /^\033%-12345X\@PJL.*\n*.*\n*.*ENTER\s*LANGUAGE\s*=\s*POSTSCRIPT.*\n*.*\n*.*\n%!/ + } +} elsif ($magic =~ /\305\320\323\306\036/) { # it's a wrapped EPS - ignore + exit +} elsif ($magic =~ /%PDF-/) { # it's PDF (Acrobat) + $cvtr = $CATPDF; + $cvtcmd = "$cvtr -raw $ARGV[0] -"; +# to handle single-column, strangely laid out PDFs, use coalescing feature... +# $cvtcmd = "$cvtr $ARGV[0] -"; + $type = "PDF"; + $dehyphenate = 1; # PDFs often have hyphenated lines + if (open(INFO, "$PDFINFO $ARGV[0] 2>$null |")) { + while (<INFO>) { + if (/^Title:/) { + s/^Title:\s+//; + s/\s+$//; + s/\s+/ /g; + s/&/\&\;/g; + s/</\<\;/g; + s/>/\>\;/g; + $title = $_; + last; + } + } + close INFO; + } +# to use coalescing feature conditionally... 
+# if ($title =~ /...Title of Corel DRAW output.../) { +# $cvtcmd = "$cvtr $ARGV[0] -"; +# } +} elsif ($magic =~ /WPC/) { # it's WordPerfect + $cvtr = $CATWP; + $cvtcmd = "$cvtr $ARGV[0]"; + $type = "WordPerfect"; + $dehyphenate = 0; # WP documents not likely hyphenated +} elsif ($magic =~ /^{\\rtf/) { # it's Richtext + $cvtr = $CATRTF; + $cvtcmd = "$cvtr $ARGV[0]"; + $type = "RTF"; + $dehyphenate = 0; # RTF documents not likely hyphenated +} elsif ($magic =~ /\320\317\021\340/) { # it's MS Word + $cvtr = $CATDOC; + $cvtcmd = "$cvtr -a -w $ARGV[0]"; + $type = "Word"; + $dehyphenate = 0; # Word documents not likely hyphenated +} else { + die "Can't determine type of file $ARGV[0]; content-type: $ARGV[1]; URL: $ARGV[2]\n"; +} + +die "$cvtr is absent or unwilling to execute.\n" unless -x $cvtr; + +############################################# +# +# Start output. + +# if running as a converter for "user-defined" output type... +#print "Content-Type: text/html\n\n"; + +if ($ishtml) { + # converter will give its own HTML output + system("$cvtcmd") || die "$cvtr doesn't want to be run from shell.\n"; + exit; +} + +# Produce HTML output from converter's text output, so we can add title. +print "<HTML>\n<head>\n"; + +# print out the title, if it's set, and not just a file name, or make one up +if ($title eq "" || $title =~ /^[A-G]:[^\s]+\.[Pp][Dd][Ff]$/) { + @parts = split(/\//, $ARGV[2]); # get the file basename + $parts[-1] =~ s/%([A-F0-9][A-F0-9])/pack("C", hex($1))/gie; + $title = "$type Document $parts[-1]"; # use it in title +} +print "<title>$title</title>\n"; + +print "</head>\n<body>\n"; + +# Open file via selected converter, output its text. 
+open(CAT, "$cvtcmd |") || die "$cvtr doesn't want to be opened using pipe.\n"; +while (<CAT>) { + while (/[A-Za-z\300-\377]-\s*$/ && $dehyphenate) { + $_ .= <CAT>; + last if eof; + s/([A-Za-z\300-\377])-\s*\n\s*([A-Za-z\300-\377])/$1$2/s + } + s/[\255]/-/g; # replace dashes with hyphens + s/\f/\n/g; # replace form feed + s/&/\&\;/g; # HTMLify text + s/</\<\;/g; + s/>/\>\;/g; + print; +} + +print "</body>\n</HTML>\n"; + +close CAT; + diff --git a/debian/htdig/htdig-3.2.0b6/contrib/doc2html/DETAILS b/debian/htdig/htdig-3.2.0b6/contrib/doc2html/DETAILS new file mode 100644 index 00000000..35300c03 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/doc2html/DETAILS @@ -0,0 +1,399 @@ +INTRODUCTION +============ + +This DETAILS file accompanies doc2html version 3.0.1. + +Read this file for instructions on the installation and use of the +doc2html scripts. + +The set of files is: + + DETAILS - this file + doc2html.pl - the main Perl script + doc2html.cfg - configuration file for use with wp2html + doc2html.sty - style file for use with wp2html + pdf2html.pl - Perl script for converting PDF files to HTML + swf2html.pl - Perl script for extracting links from Shockwave flash files. + README - brief description + +doc2html.pl is a Perl5 script for use as an external converter with +htdig 3.1.4 or later. It takes as input the name of a file containing a +document in a number of possible formats and its MIME type. It uses +the appropriate conversion utility to convert it to HTML on standard +output. + +doc2html.pl was designed to be easily adapted to use whatever conversion +utilities are available, and although it has been written around the +"wp2html" utility, it does not require wp2html to function. + +NOTE: version 3.0.1 has only been tested on Unix. + +pdf2html.pl is a Perl script which uses a pair of utilities (pdfinfo and +pdf2text) to extract information and text from an Adobe PDF file and +write HTML output. 
It can be called directly from htdig, but you are +recommended to call it via doc2html.pl. + +swf2html.pl is a Perl script which calls a utility (swfparse) and +outputs HTML containing links to the URL's found in a Shockwave flash +file. It can be called directly from htdig, but you are recommended to +call it via doc2html.pl. + + +ABOUT DOC2HTML.PL +================= + +doc2html.pl is essentially a wrapper script, and is itself only capable +of reading plain text files. It requires the utility programs described +below to work properly. + +doc2html.pl was written by David Adams <[email protected]>, it is +based on conv_doc.pl written by Gilles Detillieux <[email protected]>. +This in turn was based on the parse_word_doc.pl script, written by +Jesse op den Brouw <[email protected]>. + +doc2html.pl makes up to three attempts to read a file. It first tries +utilities which convert directly into HTML. If one is not found, or no +output is produced, it then tries utilities which output plain text. If +none is found, and the file is not of a type known to be unconvertable, +then doc2html.pl attempts to read the file itself, stripping out any +control characters. + +doc2html.pl is written to be flexible and easy to adapt to whatever +conversion utilites are available. New conversion utilities may be +added simply by making additions to routine 'store_methods', with no +other changes being necessary. The existing lines in store_methods +should provide sufficient examples on how to add more converters. Note +that converters which produce HTML are entered differently to those that +produce plain text. + +htdig provides three arguments which are read by doc2html.pl: + +1) the name of a temporary file containing a copy of the + document to be converted. + +2) the MIME type of the document. + +3) the URL of the document (which is used in generating the + title in the output). 
+ +The test for document type uses both the MIME-type passed as second +argument and the "Magic number" of the file. + + +INSTALLATION +============ + +Installation requires that you acquire, compile and install the utilities +you need to do the conversions. Those already setup in the Perl scripts are +described below. + +If you don't have Perl module Sys::AlarmCall installed, then consider +installing it, see section "TIMEOUT" below. + +You may need to change the first line of each script to the location of +Perl on your system. + +Edit doc2html.pl to include the full pathname of each utility you have +installed. For example: + +my $WP2HTML = '/opt/local/wp2html-3.2/bin/wp2html'; + +If you don't have a particular utility then leave its location as a null +string. + +Then place doc2html.pl and the other scripts where htdig can access them. + +If you are going to convert PDF files then you will need to edit pdf2html.pl +and include its full path name in doc2html.pl. + +If you are going to extract links from Shockwave flash files then you will +need to edit swf2html.pl and include its full path name in doc2html.pl. 
+ +Edit the htdig.conf configuration file to use the script, as in this example: + +external_parsers: application/rtf->text/html /usr/local/scripts/doc2html.pl \ + text/rtf->text/html /usr/local/scripts/doc2html.pl \ + application/pdf->text/html /usr/local/scripts/doc2html.pl \ + application/postscript->text/html /usr/local/scripts/doc2html.pl \ + application/msword->text/html /usr/local/scripts/doc2html.pl \ + application/Wordperfect5.1->text/html /usr/local/scripts/doc2html.pl \ + application/msexcel->text/html /usr/local/scripts/doc2html.pl \ + application/vnd.ms-excel->text/html /usr/local/scripts/doc2html.pl \ + application/vnd.ms-powerpoint->text/html /usr/local/scripts/doc2html.pl \ + application/x-shockwave-flash->text/html /usr/local/scripts/doc2html.pl \ + application/x-shockwave-flash2-preview->text/html /usr/local/scripts/doc2html.pl + +If you are using wp2html then place the files doc2html.cfg and doc2html.sty in the +wp2html library directory. + + +UTILITY WP2HTML +=============== + +Obtain wp2html from http://www.res.bbsrc.ac.uk/wp2html/ + +Note that wp2html is not free; its author charges a small fee for +"registration". Various pre-compiled versions and the source code are +available, together with extensive documentation. Upgrades are +available at no further charge. + +wp2html converts WordPerfect documents (5.1 and later) to HTML. +Versions 3.2 and later will also convert Word7 and Word97 documents to +HTML. A feature of wp2html which doc2html.pl exploits is that the -q +option will result in either good HTML or no output at all. + +wp2html is very flexible in the output it creates. The two files, +doc2html.cfg and doc2html.sty, should be placed in the wp2html library +directory along with the .cfg and .sty files supplied with wp2html. + +Edit the line in doc2html.pl: + +my $WP2HTML = ''; + +to set $WP2HTML to the full pathname of wp2html. 
+ +wp2html will look for the title in a document, and if it is found then +output it in <TITLE>....</TITLE> markup. If a title is not found +then it defaults to the file name in square brackets. + +If wp2html is unable to convert a document, or is not installed, +then doc2html.pl can use the "catdoc" or "catwpd" utilities instead. + + +UTILITY CATDOC +============== + +Obtain catdoc from http://www.ice.ru/~vitus/catdoc/, it is available +under the terms of the Gnu Public License. + +Edit the line in doc2html.pl: + +my $CATDOC = ''; + +to set the variables to the full pathname of catdoc. You might want +to use a different version of catdoc for Word2 documents or for MAC Word +files. + +catdoc converts MS Word6, Word7, etc., documents to plain text. The +latest beta version is also able to convert Word2 documents. catdoc +also produces a certaint amount of "garbage" as well as the text of the +document. The -b option improves the likelihood that catdoc will +extract all the text from the document, but at the expense of increasing +the garbage as well. doc2html.pl removes some non-printing characters +to minimise the garbage. If a later version of catdoc than 0.91.4 is +obtained then the use of the -b option should be reviewed. + + +UTILITY CATWPD +============== + +Obtain catwpd from the contribs section of the Ht://Dig web site where +you obtained doc2html. It extracts words from some versions of WordPerfect +files. You won't need it if you buy the superior wp2html. + +If you do use it, then edit the line in doc2html.pl: + +my $CATWPD = ''; + +to set the variables to the full pathname of catwpd. + + +UTILITY PPTHTML +=============== + +obtain ppthtml from http://www.xlhtml.org, where it is bundled in with +xlhtml. + +In doc2html.pl, edit the line: + +my $PPT2HTML = ''; + +to set $PPT2HTML to the full pathname of ppthtml. + +ppthtml converts Microsoft Powerpoint files into HTML. It uses the input +filename as the title. 
doc2html.pl replaces this with the original +filename from the URL in square brackets. + + +UTILITY XLHTML +============== + +Obtain xlhtml from http://www.xlhtml.org + +In doc2html.pl, edit the line: + +my $XLS2HTML = ''; + +to set $XLS2HTML to the full pathname of xlhtml. + +xlhtml converts Microsoft Excel spreadsheets into HTML. It uses the input +filename as the title. doc2html.pl replaces this with the original +filename from the URL in square brackets. + +The present version of xlHtml (0.4) writes HTML output, but does not +mark up hyperlinks in .xls files as links in its output. + +An alternative to xlHtml is xls2csv, see below. + + +UTILITY RTF2HTML +================ + +Obtain rtf2html from http://www.ice.ru/~vitus/catdoc/ + +In doc2html.pl, edit the line: + +my $RTF2HTML = ''; + +to set $RTF2HTML to the full pathname of rtf2html. + +rtf2html converts Rich Text Font documents into HTML. It uses the input +filename as the title, doc2html.pl replaces this with the original +filename from the URL within square brackets. + + +UTILITY PS2ASCII +================ + +Ps2ascii is a PostScript to text converter. + +In doc2html.pl, edit the line: + +my $CATPS = ''; + +to the correct full pathname of ps2ascii. + +ps2ascii comes with ghostscript 3.33 (or later) package, which is +pre-installed on many Unix systems. Commonly, it is a Bourne-shell +script which invokes "gs", the Ghostscript binary. doc2html.pl has +provision for adding the location of gs to the search path. + + +UTILITY PDFTOTEXT +================= + +pdftotext converts Adobe PDF files to text. pdfinfo is a tool which +displays information about the document, and is used to obtain its +title, etc. Get them from the xpdf package at +http://www.foolabs.com/xpdf/ + +In script pdf2html.pl, change the lines: + +my $PDFTOTEXT = "/... .../pdftotext"; +my $PDFINFO = "/... .../pdfinfo"; + +to the correct full pathnames. + +Edit doc2html.pl to include the full pathname of the pdf2html.pl script. 
+ +pdf2text may fail to convert PDF documents which have been truncated +because htdig has max_doc_size set to smaller than the documents full +size. Some PDF documents do not allow text to be extracted. + + +UTILITY CATXLS +============== + +The Excel to .csv converter, xls2csv, is included with recent versions of +catdoc. This is an alternative to xlhtml (see above). + +Edit the line: + +my $CATXLS = ''; + +to the full pathname of xls2csv. + +Xls2csv translates Excel spread sheets into comma-separated data. + + +UTILITY SWFPARSE +================ + +swfparse (aka swfdump) extracts information from Shockwave flash files, +and can be obtained from the contribs section of the Ht://Dig web site, +where you obtained doc2html. + +Perl script swf2html.pl calls swfparse and writes HTML output containing +links to the URLs found in the Shockwave file. It does NOT extract text +from the file. + +In script swf2html.pl, change the line: + +my $SWFPARSE = "/... .../swfdump"; + +to the full pathname. + +Edit doc2html.pl to include the full pathname of the swf2html.pl script. + + +LOGGING +======= + +Output of logging information and error messages is controlled by the +environmental variable DOC2HTML_LOG, which may be set in the rundig +script. If it is not set then only error messages output by doc2html.pl +and by the conversion utilities it calls are returned to htdig and will +appear in its STDOUT. If DOC2HTML_LOG is set to a filename, then +doc2html.pl appends logging information and any error messages to the +file. If DOC2HTML_LOG is set but blank, or the file cannot be opened +for writing, logging information and error messages are passed back to +htdig and will appear its STDOUT. + +In doc2html.pl, the variables $Emark and $EEmark, set in subroutine init, +are used to highlight error messages. + +The number of lines of STDERR output from a utility which is logged or +passed back to htdig is controlled by the variable $Maxerr set in +routine "init" of doc2html.pl. 
This is provided in order to curb the +large number of error messages which some utilities can produce from +processing a single file. + + +TIMEOUT +======= + +If possible, install Perl module Sys::AlarmCall, obtainable from CPAN if +you don't already have it. This module is used by doc2html.pl to +terminate a utility if it takes too long to finish. The line in +doc2html.pl: + + $Time = 60; # allow 60 seconds for external utility to complete + +may be altered to suit. + + +LIMITING INPUT AND OUTPUT +========================= + +The environmental variable DOC2HTML_IP_LIMIT may be set in the rundig +script to limit the size of the file which doc2html.pl will attempt to +convert. The default value is 20000000. Doc2html.pl will return no +output to htdig if the file size is equal to or greater than this size. + +You are recommended to set DOC2HTML_IP_LIMIT to the same as the +"max_doc_size" parameter in the htdig configuration file. Then no +attempt will be made to extract text from files which have been truncated +by htdig. It is not possible to extract any text from .PDF files, for +example, if they have been truncated. + +The environmental variable DOC2HTML_OP_LIMIT may be set in the rundig +script to limit the output sent back to htdig by a single call to +doc2html.pl. The default value is 10000000. Doc2html.pl will stop +returning output to htdig once the DOC2HTML_OP_LIMIT has been reached. +This is a precaution against the unlikely event of a conversion utility +returning disproportionately large amounts of data.
+ + +CONTACT +======= + +Any queries regarding doc2html are best sent to the mailing list + +The author can be emailed at [email protected] + +David Adams +Information Systems Services +University of Southampton + +27-November-2002 diff --git a/debian/htdig/htdig-3.2.0b6/contrib/doc2html/README b/debian/htdig/htdig-3.2.0b6/contrib/doc2html/README new file mode 100644 index 00000000..427eb8ce --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/doc2html/README @@ -0,0 +1,25 @@ +Readme for doc2html + +External converter scripts for ht://Dig (version 3.1.4 and later), that +convert Microsoft Word, Excel and Powerpoint files, and PDF, +PostScript, RTF, and WordPerfect files to text (in HTML form) so they +can be indexed. Uses a variety of conversion programs: + + wp2html - to convert Wordperfect and Word7 & 97 documents to HTML + catdoc - to extract text from Word documents + catwpd - to extract text from WordPerfect documents [alternative to wp2html] + rtf2html - to convert RTF documents to HTML + pdftotext - to extract text from Adobe PDFs + ps2ascii - to extract text from PostScript + pptHtml - to convert Powerpoint files to HTML + xlHtml - to convert Excel spreadsheets to HTML + xls2csv - to extract data from Excel spreadsheets [alternative to xlHtml] + swfparse - to extract links from Shockwave flash files. + +The main script, doc2html.pl, is easily edited to include the available +utilities, and new utilities are easily incorporated. + +Written by David Adams (University of Southampton), and based on the +conv_doc.pl script by Gilles Detillieux. + +For more information see the DETAILS file. diff --git a/debian/htdig/htdig-3.2.0b6/contrib/doc2html/doc2html.cfg b/debian/htdig/htdig-3.2.0b6/contrib/doc2html/doc2html.cfg new file mode 100644 index 00000000..0bff981a --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/doc2html/doc2html.cfg @@ -0,0 +1,413 @@ +# Configuration file for use with doc2html.pl, which is used +# to index Word, WordPerfect , etc.
files using Ht://dig. +# +# Based on wp2html.cfg file supplied with wp2html version 3.0 + +# The special token "typeout" simply outputs the given text +# and can be used to inform users of versions, configuration changes etc. +typeout="" + +#------------------- Single character translations --------------- +# Protect HTML magic symbols. +'<'="<" +'>'=">" +'&'="&" +'"'=""" + +#------------------- WP code translations --------------- +# File header. BEGIN is called before any text is output +# BEGIN is passed three strings being the +# Input Directory, Input file name and Input file type. +# Do what you like with them! + +BEGIN="<HTML> +<HEAD> +<Title>%X<XDocSubject></Title> +<META name=\"keywords\" content=\"%X<XDocKeywords>\"> +<META name=\"description\" content=\"%X<XDocAbstract>\"> +</HEAD> +<BODY> +<p> +%xH +" +# Beginning of a subpart. This is called for each file. +begin="<html> +<head> +<title>%X<XDocSubject - %O</title> +</head> +<body>\n" + +# File end. END is called at the end of the document +# You may wish to insert signatures etc. +END="<p> +%xf +%xF +%X<XDocAuthor> +</BODY> +</HTML>\n" + +# End of a subpart. This is called for each sub part of a file except last. +end="\n%xf\n</body>\n</html>\n" + +# End of the last subpart. This is only for the final sub-part +# which may wish to have a different ending to the others (like +# perhaps not refering to the NEXT chapter?) +End="\n%xf\n</body>\n</html>\n" + +# Message output by wp2html into output file but not to be displayed +# or for "hidden" WP text +Message="<!-- " +message="-->" + +# PageNo="\\folio{}" # insert page number +# RomanPage="%\n%% Set roman page number %1\n" # set roman numerals +# ArabicPage="%\n%% Set arabic page number %1\n" # set arabic numerals + +HSpace=" " # Unbreakable (Hard) space + +# Tabs in Netscape (before HTML3) cannot be done properly +# We fudge them here with non breaking space until Netscape 1.2? 
+Tab=" " + +# Hard page looks quite good as a line +HPg="\n" +# Soft page is usually ignored +SPg=" " + +CondEOP=" " + +HRt="<br>\n" # Single Hard return just a break +HRt2="<p>\n" # Two or more terminates paragraph +SRt="\n" # Soft return is a newline +DSRt="\n" # Deletable return at end of line +DSPg="\n" # Deletable return at end of page + +softHyphen="" # Hyphens are ignored, since wrapping is up the clients. +softHyphenEOL="" # same for hyphens at end of a line +hardHyphen="-" # Nonbreaking hyphen, must put in +autoHyphen="" # auto Hyphens will be ignored too +NoHyphWord="" # Inhibit hyphenation of this word + +# Margins are left as comments until HTML3 arrives +Marg=" " +TopMarg=" " +PageLength=" " + +# Line spacing changes are ignored + +SS="" +1.5S="" +DS="" +TS="" +LS="" +LPI="" + +# Font changes mapped to Netscape font size defn +ExtraLarge="<H1>" +extralarge="</h1>" +VeryLarge="<h2>" +verylarge="</h2>" +Large="<h3>" +large="</h3>" +Small="<h5>" +small="</h5>" +Fine="</h6>" +fine="<h6>" + +FontColour="\n<font color=\"#%s\">" +fontColour="\n</font>" + +Font=" " +font=" " + +Bold="<b>" # Boldface +bold="</b>" +Und="<u>" +und="</u>" +Red="" # Redlining +red="" +Strike="" # Strikeout XXX +strike="" +Italics="<i>" +italics="</i>" +Rev="" # Reverse video XXX +rev="" +Over="" # overprinting not supported XXX +over="" +# Netscape 2 and after can use +Sup="<sup>" +sup="</sup>" +Sub="<sub>" +sub="</sub>" + +# UpHalfLine, DownHalfLine, AdvanceToHalf -- undefined + +# Indent mapped to unordered lists, good for blocked indents +#Indent="\n<ul>\n" +#indent="\n</ul>\n" +#DIndent="\n<ul>\n" +#dindent="\n</ul>\n" + +# Indents as space, use if indents are like TABS for you +Indent=" " +indent="" +DIndent=" " +dindent="" + +# Margin release is passed one parameter, the number of characters. 
+MarginRelease=" " + +Center="<Div align=\"center\">" # centering, Netscape way +center="</Div>\n" + +Left="<Div align=\"left\">" # Netscape left justify +left="</Div>" + +Right="<Div align=\"right\">" # Netscape right justify +right="</Div>" + +Full="<Div align=\"full\">" # Netscape full justify +full="</Div>" + +# Can use also +# Left +# left +# Full +# full + + +# Math, math, MathCalc, MathCalcColumn, SubTtl, IsSubTtl, Ttl, IsTtl, GrandTtl +# -- undefined +Column="<MULTICOL COLS=%1>" +column="</MULTICOL>" + +Header="\n" +header="\n" +Footer="\n" +footer="\n" + +Footnote=" <Font size=2> +<a href=\"#Footnote%1\" name=\"Footref%1\"> +</Font> " +footnote="</a>" +FootnoteFormat="<a href=\"#Footref%1\" name=\"Footnote%1\">%1</i></a><i>" +footnoteFormat="</i>" + +# Displays for various automatic numbers +# uncomment these to "eat" the autonumbers inserted by WP6 +FootnoteDisplay="%e" +footnoteDisplay="%f" + +#EndnoteDisplay="%e" +#endnoteDisplay="%f" +#ParanumDisplay="%e" +#paranumDisplay="%f" +#LinenumDisplay="%e" +#linenumDisplay="%f" + +BeginTabs="" +SetTab="" +SetTabCenter="" +SetTabRight="" +SetTabDecimal="" +EndTabs="" + +Hyph="" # Allow hyphenation +hyph="" # Disable hyphenation +Wid="" # Widow protection +wid="" # Allow widows + +# HZone, DAlign -- undefined + +Supp=" " +CtrPg=" " +SetFont=" " +SetBin=" " + +# True table definitions, these are Netscape style (HTML3) +# Start of a table +Table="\n<Table>" +# end of a table +table="\n</Table>\n" + +# New row +Row="\n<Tr>" +# End row +row="\n</Tr>" + +# New cell. Is passed the col and row spans integers and align flags +#Cell="\n<Td ColSpan=%1 RowSpan=%2 Align=%u Valign=%v>\n" +Cell="\n<Td>\n" +# End cell +cell="\n</Td>" + +# Table header cells. 
+HeadCell="\n<Th ColSpan=%1 RowSpan=%2 Align=%u Valign=%v>\n" +HeadCell="\n<Th>\n" +# End header cell +headCell="\n</Th>" + +# Ordinary WP comment anywhere in the document, passed comment text +Comment="\n<!-- WP Comment " +comment=" -->\n" + +# default Style operation for styles user has not names below +defaultStyleOn=" " +defaultStyleOff=" " +defaultStyleOpen=" " +defaultStyleEnd=" " + +# Set defaults for TOC markers here +ToC=" " +toC=" " +ToC1=" " +toC1=" " +ToC2=" " +toC2=" " +ToC3=" " +toC3=" " +ToC4=" " +toC4=" " +ToC5=" " +toC5=" " + +# Detect start and end of index in document +Index=" " +index=" " + +# Set defaults for List markers here +List=" " +list=" " +List1=" " +list1=" " +List2=" " +list2=" " +List3=" " +list3=" " +List4=" " +list4=" " +List5=" " +list5=" " + +ToA=" " +toA=" " +ToAMark=" " + +XrefMark=" " +xrefMark=" " +XrefTarget=" " + +# Figure inside WP. +# Right now we have a confusing 3 options, that is we have +# A WPG image inside Wordperfect, +# 1. With no GIF/JPEG conversion available +# 2. Of unknown size but a GIF/JPEG conversion exists +# 3. 
With known (set) size and with GIF/JPEG conversion +# Depending on which of the above we find we call one of the next +# three tags, Figure, Image, ImageSized +# NO GIF/JPEG +Figure=" " + +# GIF/JPEG available, but sizes and alignment unknown +# Image="<a href=\"%s.%t\"><img src=\"%s.%u\"></a> +# Click thumbnail picture to see full size version" +# You can also do thumbnails too, like this +#Image="<a href=\"%s.%t\" target=\"Graphics\"><img src=\"%s.%u\"></a> +#Click thumbnail picture to see full size in separate window" +Image="<img src=\"%s.%t\"></a>" + + +# GIF/JPEG available, and sizes and alignment known +# If this is NOT given, Image will be used instead +#ImageSized="<img src=\"%s.%t\" height=%3 width=%4 align=%w>" + +# Boxes, Table, Text and User +TableBox="\n" +tableBox="\n" +TextBox="\n" +textBox="\n" +UserBox="\n" +userBox="\n" + +# Equations are rendered as rough text right now, wait for HTML3 +# and we can switch this on properly +Equation=" " +equation=" " + +# Captions for all boxes +Caption="<br><i>[" +caption="]</i><br>\n" + +HLine="<hr width=%1 size=%2 align=%u>" + +ParaNum1="%s " +ParaNum2="%s " +ParaNum3="%s " +ParaNum4="%s " +ParaNum5="%s " +ParaNum6="%s " +ParaNum7="%s " +ParaNum8="%s " +ParaNumEnd=" " + +PN0=" " +PN1=" " +PN2=" " +PN3=" " +PN4=" " +PN5=" " +PN6=" " +PN7=" " +PN8=" " + +#********************************************************************* +# End of required parameters -- start of optional entries + +# Here is the complete list of fields which can optionally be +# extracted from the extended document summary area of the file +# If they are not defined (to be anything) they will not be extracted +#!XDocName="Filename" +#!XDocType="Type" +XDocAuthor="" +XDocAbstract="" +#!XDocAccount="Account" +XDocKeywords="" +XDocSubject="%X<Title>" +#!XDocTypist="Typist" + +# These tags allow you to convert WP6 hypertext refs into HTML hypertext +# But you should be careful how you use them as documents NOT designed +# to be HTML (perhaps 
linking to other documents in unreachable +# directories) would generate confusing links. +#!HyperBegin="<a href=\"%s.htm#%t\">%e" +#!hyperBegin="%f" +#!HyperEnd="</a>%e" +#!hyperEnd="%f" + +# Or you can use WP hypertext refs as direct hypertext +# but make sure you insert the URL in the BOOKMARK field +HyperBegin="<a href=\"%t\">%e" +hyperBegin="%f" +HyperEnd="</a>%e" +hyperEnd="%f" + +# These tags allow you to convert WP6 bookmarks into appropriate +# HTML anchors which are needed if you use the Hyper tags above. +BookMark="<a name=\"%s\">" +bookMark="</a>" + +# Hypertext references in Word Documents (real URL) +Href="<a href=\"%s\">" +href="</a>" + +# These tags allow you to find the file name of included files +#SubDoc="<hr>Start Included file %s<hr>\n" +#subDoc="<hr>End Included file %s<hr>\n" +SubDoc="\n" +subDoc="\n" + +# These tags are trigger by WP Small Caps attributes (On/Off) +#!SmallCaps="" +#!smallCaps="" + +# End of main configuration file diff --git a/debian/htdig/htdig-3.2.0b6/contrib/doc2html/doc2html.pl b/debian/htdig/htdig-3.2.0b6/contrib/doc2html/doc2html.pl new file mode 100755 index 00000000..c69f00cc --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/doc2html/doc2html.pl @@ -0,0 +1,676 @@ +#!/usr/bin/perl +use strict; +# +# Version 3.0.1 19-September-2002 +# +# External converter for htdig 3.1.4 or later (Perl5 or later) +# Usage: (in htdig.conf) +# +#external_parsers: application/rtf->text/html /opt/local/htdig-3.1.6/scripts/doc2html.pl \ +# text/rtf->text/html /opt/local/htdig-3.1.6/scripts/doc2html.pl \ +# application/pdf->text/html /opt/local/htdig-3.1.6/scripts/doc2html.pl \ +# application/postscript->text/html /opt/local/htdig-3.1.6/scripts/doc2html.pl \ +# application/msword->text/html /opt/local/htdig-3.1.6/scripts/doc2html.pl \ +# application/wordperfect5.1->text/html /opt/local/htdig-3.1.6/scripts/doc2html.pl \ +# application/wordperfect6.0->text/html /opt/local/htdig-3.1.6/scripts/doc2html.pl \ +# 
application/msexcel->text/html /opt/local/htdig-3.1.6/scripts/doc2html.pl \ +# application/vnd.ms-excel->text/html /opt/local/htdig-3.1.6/scripts/doc2html.pl \ +# application/vnd.ms-powerpoint->text/html /opt/local/htdig-3.1.6/scripts/doc2html.pl +# application/x-shockwave-flash->text/html /opt/local/htdig-3.1.6/scripts/doc2html.pl \ +# application/x-shockwave-flash2-preview->text/html /opt/local/htdig-3.1.6/scripts/doc2html.pl +# +# Uses wp2html to convert Word and WordPerfect documents into HTML, and +# falls back to using Catdoc for Word and Catwpd for WordPerfect if +# Wp2html is unavailable or unable to convert. +# +# Uses range of other converters as available. +# +# If all else fails, attempts to read file without conversion. +# +######################################################################################## +# Written by David Adams <[email protected]>. +# Based on conv_doc.pl written by Gilles Detillieux <[email protected]>, +# which in turn was based on the parse_word_doc.pl script, written by +# Jesse op den Brouw <[email protected]>. 
+######################################################################################## + +# Install Sys::AlarmCall if you can +eval "use Sys::AlarmCall"; + +######## Full paths of conversion utilities ########## +######## YOU MUST SET THESE ########## +######## (leave null those you don't have) ########## + +# Wp2html converts Word & Wordperfect to HTML +# (get it from: http://www.res.bbsrc.ac.uk/wp2html/): +my $WP2HTML = ''; + +#Catwpd for WordPerfect to text conversion +# (you don't need this if you have wp2html) +# (get it from htdig site) +my $CATWPD = ''; + +# rtf2html converts Rich Text Font documents to HTML +# (get it from http://www.ice.ru/~vitus/catdoc/): +my $RTF2HTML = ''; + +# Catdoc converts Word (MicroSoft) to plain text +# (get it from: http://www.ice.ru/~vitus/catdoc/): + +#version of catdoc for Word6, Word7 & Word97 files: +my $CATDOC = ''; + +#version of catdoc for Word2 files: +my $CATDOC2 = $CATDOC; + +#version of catdoc for Word 5.1 for MAC: +my $CATDOCM = $CATDOC; + +# PostScript to text converter +# (get it from the ghostscript 3.33 (or later) package): +my $CATPS = ''; + +# add to search path the directory which contains gs: +#$ENV{PATH} .= ":/usr/freeware/bin"; + +# PDF to HTML conversion script: +my $PDF2HTML = ''; # full pathname of pdf2html/pl script + +# Excel (MicroSoft) to HTML converter +# (get it from www.xlhtml.org) +my $XLS2HTML = ''; + +# Excel (MicroSoft) to .CSV converter +# (you don't need this if you have xlhtml) +# (if you do want it, you can get it with catdoc) +my $CATXLS = ''; + +# Powerpoint (MicroSoft) to HTML converter +# (get it from www.xlhtml.org) +my $PPT2HTML = ''; + +# Shockwave Flash +# (extracts links from file) +my $SWF2HTML = ''; # full pathname of swf2html.pl script + +# OpenOffice.org files +#my $OpenOffice2XML = '/usr/bin/unzip'; +my $OpenOffice2XML = ''; +# (remove multi-byte unicode from XML in OOo documents) +#my $strip_unicode = '| /usr/bin/iconv -c -s -f UTF-8 -t ISO-8859-1'; +my $strip_unicode = 
''; + + +######################################################################## + +# Other Global Variables +my ($Success, $LOG, $Verbose, $CORE_MESS, $TMP, $RM, $ED, $Magic, $Time, + $Count, $Prog, $Input, $MIME_type, $URL, $Name, $Efile, $Maxerr, + $Redir, $Emark, $EEmark, $Method, $OP_Limit, $IP_Limit); +my (%HTML_Method, %TEXT_Method, %BAD_type); + + +&init; # initialise +my $size = -s $Input; +&quit("Input file size of $size at or above $IP_Limit limit" ) if $size >= $IP_Limit; +&store_methods; # +&read_magic; # Magic reveals type +&error_setup; # re-route standard error o/p from utilities + +# see if a document -> HTML converter will work: +&run('&try_html'); +if ($Success) { &quit(0) } + +# try a document -> text converter: +&run('&try_text'); +if ($Success) { &quit(0) } + +# see if a known problem +my $fail = &cannot_do; +if ($fail) { &quit($fail) } + +# last-ditch attempt, try copying document +&try_plain; +if ($Success) {&quit(0)} + +&quit("UNABLE to convert"); + +#------------------------------------------------------------------------------ + +sub init { + + # Doc2html log file + $LOG = $ENV{'DOC2HTML_LOG'} || ''; + # + if ($LOG) { + open(STDERR,">>$LOG"); # ignore possible failure to open + } # else O/P really does go to STDERR + + # Set to 1 for O/P to STDERR or Log file + $Verbose = exists($ENV{'DOC2HTML_LOG'}) ? 1 : 0; + + # Limiting size of file doc2html.pl will try to process (default 20Mbyte) + $IP_Limit = $ENV{'DOC2HTML_IP_LIMIT'} || 20000000; + + # Limit for O/P returned to htdig (default 10Mbyte) + $OP_Limit = $ENV{'DOC2HTML_OP_LIMIT'} || 10000000; + + # Mark error message produced within doc2html script + $Emark = "!\t"; + # Mark error message produced by conversion utility + $EEmark = "!!\t"; + + # Message to STDERR if core dump detected + $CORE_MESS = "CORE DUMPED"; + + # Directory for temporary files + $TMP = "/tmp/htdig"; + if (! 
-d $TMP) { + mkdir($TMP,0700) or die "Unable to create directory \"$TMP\": $!"; + } + # Current directory during run of script: + chdir $TMP or warn "Cannot change directory to $TMP\n"; + + # File for error output from utility + $Efile = 'doc_err.' . $$; + + # Max. number of lines of error output from utility copied + $Maxerr = 10; + + # System command to delete a file + $RM = "/bin/rm -f"; + + # Line editor to do substitution + $ED = "/bin/sed -e"; + if ($^O eq "MSWin32") {$ED = "$^X -pe"} + + $Time = 60; # allow 60 seconds for external utility to complete + + $Success = 0; + $Count = 0; + $Method = ''; + $Prog = $0; + $Prog =~ s#^.*/##; + $Prog =~ s/\..*?$//; + + $Input = $ARGV[0] or die "No filename given\n"; + $MIME_type = $ARGV[1] or die "No MIME-type given"; + $URL = $ARGV[2] || '?'; + $Name = $URL; + $Name =~ s#^.*/##; + $Name =~ s/%([A-F0-9][A-F0-9])/pack("C", hex($1))/gie; + + if ($Verbose and not $LOG) { print STDERR "\n$Prog: [$MIME_type] " } + if ($LOG) { print STDERR "$URL [$MIME_type] " } + +} + +#------------------------------------------------------------------------------ + +sub store_methods { +# The method of dealing with each file type is set up here. 
+# Edit as necessary + + my ($mime_type,$magic,$cmd,$cmdl,$type,$description); + + my $name = quotemeta($Name); + + ####Document -> HTML converters#### + + # WordPerfect documents + if ($WP2HTML) { + $mime_type = "application/wordperfect|application/msword"; + $cmd = $WP2HTML; + $cmdl = "($cmd -q -DTitle=\"[$name]\" -c doc2html.cfg -s doc2html.sty -i $Input -O; $RM CmdLine.ovr)"; + $magic = '\377WPC'; + &store_html_method('WordPerfect (wp2html)',$cmd,$cmdl,$mime_type,$magic); + } + + # Word documents + if ($WP2HTML) { + $mime_type = "application/msword"; + $cmd = $WP2HTML; + $cmdl = "($cmd -q -DTitle=\"[$name]\" -c doc2html.cfg -s doc2html.sty -i $Input -O; $RM CmdLine.ovr)"; + $magic = '^\320\317\021\340'; + &store_html_method('Word (wp2html)',$cmd,$cmdl,$mime_type,$magic); + } + + # RTF documents + if ($RTF2HTML) { + $mime_type = "application/msword|application/rtf|text/rtf"; + $cmd = $RTF2HTML; + # Rtf2html uses filename as title, change this: + $cmdl = "$cmd $Input | $ED \"s#^<TITLE>$Input</TITLE>#<TITLE>[$name]</TITLE>#\""; + $magic = '^{\134rtf'; + &store_html_method('RTF (rtf2html)',$cmd,$cmdl,$mime_type,$magic); + } + + # Microsoft Excel spreadsheet + if ($XLS2HTML) { + $mime_type = "application/msexcel|application/vnd.ms-excel"; + $cmd = $XLS2HTML; + # xlHtml uses filename as title, change this: + $cmdl = "$cmd -fw $Input | $ED \"s#<TITLE>$Input</TITLE>#<TITLE>[$name]</TITLE>#\""; + $magic = '^\320\317\021\340'; + &store_html_method('Excel (xlHtml)',$cmd,$cmdl,$mime_type,$magic); + } + + # Microsoft Powerpoint Presentation + if ($PPT2HTML) { + $mime_type = "application/vnd.ms-powerpoint"; + $cmd = $PPT2HTML; + # xlHtml uses filename as title, change this: + $cmdl = "$cmd $Input | $ED \"s#<TITLE>$Input</TITLE>#<TITLE>[$name]</TITLE>#\""; + $magic = '^\320\317\021\340'; + &store_html_method('Powerpoint (pptHtml)',$cmd,$cmdl,$mime_type,$magic); + } + + # Adobe PDF file using Perl script + if ($PDF2HTML) { + $mime_type = "application/pdf"; + $cmd = $PDF2HTML; 
+ # Replace default title (if used) with filename: + $cmdl = "$cmd $Input $mime_type $name"; + $magic = '%PDF-|\0PDF CARO\001\000\377'; + &store_html_method('PDF (pdf2html)',$cmd,$cmdl,$mime_type,$magic); + } + + # Shockwave Flash file using Perl script + if ($SWF2HTML) { + $mime_type = "application/x-shockwave-flash"; + $cmd = $SWF2HTML; + $cmdl = "$cmd $Input"; + $magic = '^FWS[\001-\010]'; # versions 1 to 5, perhaps some later versions + &store_html_method('Shockwave-Flash (swf2html)',$cmd,$cmdl,$mime_type,$magic); + } + + # OpenOffice Documents + if ($OpenOffice2XML) { + $mime_type = "application/vnd.sun.xml.writer|application/vnd.sun.xml.impress|application/vnd.sun.xml.calc|application/vnd.sun.xml.draw|application/vnd.sun.xml.math"; + $cmd = $OpenOffice2XML; + $cmdl = "$cmd -p -qq $Input content.xml | /bin/sed -r 's/<[^>]*>/ /gi' $strip_unicode"; + $magic = 'PK'; + &store_html_method('OpenOffice XML (oo2xml)',$cmd,$cmdl,$mime_type,$magic); + } + + ####Document -> Text converters#### + + # Word6, Word7 & Word97 documents + if ($CATDOC) { + $mime_type = "application/msword"; + $cmd = $CATDOC; + # -b option increases chance of success: + $cmdl = "$cmd -a -b -w $Input"; + $magic = '^\320\317\021\340'; + &store_text_method('Word (catdoc)',$cmd,$cmdl,$mime_type,$magic); + } + + # Word2 documents + if ($CATDOC2) { + $mime_type = "application/msword"; + $cmd = $CATDOC2; + $cmdl = "$cmd -a -b -w $Input"; + $magic = '^\333\245-\000'; + &store_text_method('Word2 (catdoc)',$cmd,$cmdl,$mime_type,$magic); + } + + # Word 5.1 for MAC documents + if ($CATDOCM) { + $mime_type = "application/msword"; + $cmd = $CATDOCM; + $cmdl = "$cmd -a -b -w $Input"; + $magic = '^\3767\000#\000\000\000\000'; + &store_text_method('MACWord (catdoc)',$cmd,$cmdl,$mime_type,$magic); + } + + # PostScript files + if ($CATPS) { + $mime_type = "application/postscript"; + $cmd = $CATPS; + # allow PS interpreter to give error messages + $cmdl = "($cmd; $RM _temp_.???) 
< $Input"; + $magic = '^.{0,20}?%!|^\033%-12345.*\n%!'; + &store_text_method('PostScript (ps2ascii)',$cmd,$cmdl,$mime_type,$magic); + } + + # Microsoft Excel file + if ($CATXLS) { + $mime_type = "application/vnd.ms-excel"; + $cmd = $CATXLS; + $cmdl = "$cmd $Input"; + $magic = '^\320\317\021\340'; + &store_text_method('MS Excel (xls2csv)',$cmd,$cmdl,$mime_type,$magic); + } + + # WordPerfect document + if ($CATWPD) { + $mime_type = "application/wordperfect|application/msword"; + $cmd = $CATWPD; + $cmdl = "$cmd $Input"; + $magic = '\377WPC'; + &store_text_method('WordPerfect (catwpd)',$cmd,$cmdl,$mime_type,$magic); + } + + + ####Documents that cannot be converted#### + + # wrapped encapsulated Postscript + $type = "EPS"; + $magic = '^\305\320\323\306 \0'; + $description = 'wrapped Encapsulated Postscript'; + &store_cannot_do($type,$magic,$description); + + # Shockwave Flash version 6 + $type = "SWF6"; + $description = 'Shockwave-Flash Version 6'; + $magic = '^CWS\006'; + &store_cannot_do($type,$magic,$description); + +#### Binary (data or whatever) +###$type = "BIN"; +###$magic = '[\000-\007\016-\037\177]'; # rather crude test! 
+###$description = 'apparently binary'; +###&store_cannot_do($type,$magic,$description); + + return; +} + +#------------------------------------------------------------------------------ + +sub read_magic { + + # Read first bytes of file to check for file type + open(FILE, "< $Input") || die "Can't open file $Input\n"; + read FILE,$Magic,256; + close FILE; + + return; +} + +#------------------------------------------------------------------------------ + +sub error_setup { + + if ($Efile) { + open SAVERR, ">&STDERR"; + if (open STDERR, "> $Efile") { + print SAVERR " Overwriting $Efile\n" if (-s $Efile); + $Redir = 1; + } else { close SAVERR } + } + +} + +#------------------------------------------------------------------------------ + +sub run { + + my $routine = shift; + my $return; + + if (defined &alarm_call) { + $return = alarm_call($Time, $routine); + } else { + eval $routine; + $return = $@ if $@; + } + + if ($return) { &quit($return) } + +} + +#------------------------------------------------------------------------------ + +sub try_html { + + my($set,$cmnd,$type); + + $Success = 0; + foreach $type (keys %HTML_Method) { + $set = $HTML_Method{$type}; + if (($MIME_type =~ m/$set->{'mime'}/i) and + ($Magic =~ m/$set->{'magic'}/s)) { # found the method to use + $Method = $type; + my $cmnd = $set->{'cmnd'}; + if (! 
-x $cmnd) { + warn "Unable to execute $cmnd for $type document\n"; + return; + } + if (not open(CAT, "$set->{'command'} |")) { + warn "$cmnd doesn't want to be opened using pipe\n"; + return; + } + while (<CAT>) { + # getting something, so it is working + $Success = 1; + if ($_ !~ m/^<!--/) { # skip comment lines inserted by converter + print; + $Count += length; + if ($Count > $OP_Limit) { last } + } + } + close CAT; + last; + } + } + return; +} + +#------------------------------------------------------------------------------ + +sub try_text { + + my($set,$cmnd,$type); + + $Success = 0; + foreach $type (keys %TEXT_Method) { + $set = $TEXT_Method{$type}; + if (($MIME_type =~ m/$set->{'mime'}/i) and + ($Magic =~ m/$set->{'magic'}/s)) { # found the method to use + $Method = $type; + my $cmnd = $set->{'cmnd'}; + if (! -x $cmnd) { die "Unable to execute $cmnd for $type document\n" } + + # Open file via selected converter, output head, then its text: + open(CAT, "$set->{'command'} |") or + die "$cmnd doesn't want to be opened using pipe\n"; + &head; + print "<BODY>\n<PRE>\n"; + $Success = 1; + while (<CAT>) { + s/\255/-/g; # replace dashes with hyphens + # replace bell, backspace, tab. etc. 
with single space: + s/[\000-\040]+/ /g; + if (length > 1) { # if not just a single character, eg space + print &HTML($_), "\n"; + $Count += length; + if ($Count > $OP_Limit) { last } + } + } + close CAT; + + print "</PRE>\n</BODY>\n</HTML>\n"; + last; + } + + } + + return; +} + +#------------------------------------------------------------------------------ + +sub cannot_do { + + my ($type,$set); + + # see if known, unconvertable type + $Method = ''; + foreach $type (keys %BAD_type) { + $set = $BAD_type{$type}; + if ($Magic =~ m/$set->{'magic'}/s) { # known problem + return "CANNOT DO $set->{'desc'} "; + } + } + + return 0; +} + +#------------------------------------------------------------------------------ + +sub try_plain { + + $Success = 0; + ####### if ($Magic !~ m/^[\000-\007\016-\037\177]) { + if (-T $Input) { # Looks like text, so go for it: + $Method = 'Plain Text'; + open(FILE, "<$Input") || die "Error reading $Input\n"; + $Success = 1; + $Method = 'Plain Text'; + &head; + print "<BODY>\n<PRE>\n"; + + while (<FILE>) { + # replace bell, backspace, tab. etc. with single space: + s/[\000-\040\177]+/ /g; + if (length > 1) { + print &HTML($_), "\n"; + $Count += length; + if ($Count > $OP_Limit) { last } + } + } + close FILE; + print "</PRE>\n</BODY>\n</HTML>\n"; + + } else { $Method = '' } + + return; +} + +#------------------------------------------------------------------------------ + +sub HTML { + + my $text = shift; + + $text =~ s/\f/\n/gs; # replace form feed + $text =~ s/\s+/ /g; # replace multiple spaces, etc. 
with a single space + $text =~ s/\s+$//gm; # remove trailing spaces + $text =~ s/&/&/g; + $text =~ s/</</g; + $text =~ s/>/>/g; + + return $text; +} + +#------------------------------------------------------------------------------ + +sub store_html_method { + + my $type = shift; + my $cmnd = shift; + my $cline = shift; + my $mime = shift; + my $magic = shift; + + $HTML_Method{$type} = { + 'mime' => $mime, + 'magic' => $magic, + 'cmnd' => $cmnd, + 'command' => $cline, + }; + + return; +} + +#------------------------------------------------------------------------------ + +sub store_text_method { + + my $type = shift; + my $cmnd = shift; + my $cline = shift; + my $mime = shift; + my $magic = shift; + + $TEXT_Method{$type} = { + 'mime' => $mime, + 'magic' => $magic, + 'cmnd' => $cmnd, + 'command' => $cline, + }; + + return; +} + +#------------------------------------------------------------------------------ + +sub store_cannot_do { + + my $type = shift; + my $magic = shift; + my $desc = shift; + + $BAD_type{$type} = { + 'magic' => $magic, + 'desc' => $desc, + }; + + return; + +} + +#------------------------------------------------------------------------------ + +sub head { + + print "<HTML>\n<HEAD>\n"; + print "<TITLE>[" . $Name . 
"]</TITLE>\n"; + print "</HEAD>\n"; + +} + +#------------------------------------------------------------------------------ + +sub quit { + + if ($Redir) { # end redirection of STDERR to temporary file + close STDERR; + open STDERR, ">&SAVERR"; + } + + if ($Verbose) { + print STDERR "$Method $Count" if ($Success); + print STDERR "\n"; + } + + if ($Count > $OP_Limit) { + print STDERR $Emark, "Output truncated after limit $OP_Limit reached\n"; + } + + my $return = shift; + if ($return) { + print STDERR $Emark, $return, "\n"; + $return = 1; + } + + chdir $TMP; + if ($Efile && -s $Efile) { + open EFILE, "<$Efile"; + my $c = 0; + while (<EFILE>) { + $c++; + if ($c <= $Maxerr) { + print STDERR $EEmark, $_; + } + } + close EFILE; + print STDERR $Emark, " ... (total of $c lines of error messages)\n" if ($c > $Maxerr); + } + unlink $Efile if ($Efile && -e $Efile); + + if (-e "core" && (-M "core" < 0)) { + print STDERR $Emark, "$CORE_MESS\n"; + } + exit $return; +} diff --git a/debian/htdig/htdig-3.2.0b6/contrib/doc2html/doc2html.sty b/debian/htdig/htdig-3.2.0b6/contrib/doc2html/doc2html.sty new file mode 100644 index 00000000..fccfb8ee --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/doc2html/doc2html.sty @@ -0,0 +1,40 @@ +Any Font 8 On ="<H6>" +Any Font 8 Off ="</H6>" +Any Font 9 On =" " +Any Font 9 Off =" " +Any Font 10 On =" " +Any Font 10 Off =" " +Any Font 11 On =" " +Any Font 11 Off =" " +Any Font 12 On ="<H4>" +Any Font 12 Off ="</H4>" +Any Font 14 On ="<H3>" +Any Font 14 Off ="</H3>" +Any Font 18 On ="<H3>" +Any Font 18 Off ="</H3>" +Any Font 24 On ="<H2>" +Any Font 24 Off ="</H2>" +Any Font 28 On ="<H2>" +Any Font 28 Off ="</H2>" +Any Font 32 On ="<H1>" +Any Font 32 Off ="</H1>" +Any Font 36 On ="<H1>" +Any Font 36 Off ="</H1>" + +# Now the really specific stuff for WWW Urls +# This one decodes the special Url macro which puts the URL reference +# inside a WP Comment (so it is hidden but editable) and makes the +# link text blue and underline +# If we find a 
comment inside an Url style pair defined by the user +# we can be pretty sure it was deliberate ( done by MACRO), so we +# have this special translation just for Comments inside Url Styles +# Course, if not defined (UrlComment) it will default to standard +# +UrlOn="%e" # Eat style codes +UrlOnEnd="%f" # Style end for UrlOn, restart output +# the comment text, passed as parameter 2 text, is the URL +UrlCommentOn="<a href=\"" +UrlCommentOff="\">" # URL link +UrlOff="</a>%e" # Anchor Off and eat style codes +UrlOffEnd="%f" # Style end for UrlOff, restart output + diff --git a/debian/htdig/htdig-3.2.0b6/contrib/doc2html/pdf2html.pl b/debian/htdig/htdig-3.2.0b6/contrib/doc2html/pdf2html.pl new file mode 100755 index 00000000..fee93282 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/doc2html/pdf2html.pl @@ -0,0 +1,161 @@ +#!/usr/bin/perl -w +use strict; +# +# Version 1.0.1 12-Feb-2002 +# Written by David Adams <[email protected]> +# +# Uses pdftotext & pdfinfo utilities from the xpdf package +# to read an Adobe Acrobat file and produce HTML output. +# +# Can be called directly from htdig as an external converter, +# or may be called by doc2html.pl converter script. +# + +####--- Configuration ---#### +# Full paths of pdtotext and pdfinfo +# (get them from the xpdf package at http://www.foolabs.com/xpdf/): + +#### YOU MUST SET THESE #### + +my $PDFTOTEXT = "/... .../pdftotext"; +my $PDFINFO = "/... .../pdfinfo"; +# +# De-hyphenation option (only affects end-of-line hyphens): +my $Dehyphenate = 1; +# +# Set title to be used when none is found: +my $Default_title = "Adobe Acrobat Document"; +# +# make portable to win32 platform or unix: +my $null = "/dev/null"; +if ($^O eq "MSWin32") {$null = "nul";} +####--- End of configuration ---### + +if (! 
-x $PDFTOTEXT) { die "Unable to execute pdftotext" } + +my $Input = $ARGV[0] || die "Usage: pdf2html.pl filename [mime-type] [URL]"; +my $MIME_type = $ARGV[1] || ''; +if ($MIME_type and ($MIME_type !~ m#^application/pdf#i)) { + die "MIME/type $MIME_type wrong"; +} + +my $Name = $ARGV[2] || ''; +$Name =~ s#^(.*/)##; +# decode if 2nd argument was a URL +$Name =~ s/%([A-F0-9][A-F0-9])/pack("C", hex($1))/gie if $1; + +&pdf_head; +&pdf_body; +exit; + +#------------------------------------------------------------------------------ + +sub pdf_head { +# +# Contributed by Greg Holmes and Michael Fuller +# (any errors by David Adams) +# + my $title = ''; + my $subject = ''; + my $keywords = ''; + if (open(INFO, "$PDFINFO '$Input' 2>$null |")) { + while (<INFO>) { + if (m/^title:/i) { + s/^title:\s+//i; + $title = &clean_pdf($_); + } elsif (m/^subject:/i) { + s/^subject:\s+//i; + $subject = &clean_pdf($_); + } elsif (m/^keywords:/i) { + s/^keywords:\s+//i; + $keywords = &clean_pdf($_); + } + + } + close INFO; + } else { warn "cannot execute pdfinfo" } + if (not length $title) { + if ($Name) { + $title = '[' . $Name . ']'; + } else { + $title = $Default_title; + } + } + + print "<HTML>\n<HEAD>\n"; + print "<TITLE>$title</TITLE>\n"; + if (length $subject) { + print '<META NAME="DESCRIPTION" CONTENT="' . $subject. "\">\n"; + } + if (length $keywords) { + print '<META NAME="KEYWORDS" CONTENT="' . $keywords . 
"\">\n"; + } + print "</HEAD>\n"; + +###print STDERR "\n$Name:\n"; +###print STDERR "\tTitle:\t$title\n"; +###print STDERR "\tDescription:\t$subject\n"; +###print STDERR "\tKeywords:\t$keywords\n"; + +} + +#------------------------------------------------------------------------------ + +sub pdf_body { + + my $bline = ''; + open(CAT, "$PDFTOTEXT -raw '$Input' - |") || + die "$PDFTOTEXT doesn't want to be opened using pipe\n"; + print "<BODY>\n"; + while (<CAT>) { + while ( m/[A-Za-z\300-\377]-\s*$/ && $Dehyphenate) { + $_ .= <CAT>; + last if eof; + s/([A-Za-z\300-\377])-\s*\n\s*([A-Za-z\300-\377])/$1$2/s; + } + s/\255/-/g; # replace dashes with hyphens + # replace bell, backspace, tab. etc. with single space: + s/[\000-\040]+/ /g; + $_ = &HTML($_); + if (length) { + print $bline, $_, "\n"; + $bline = "<br>\n"; + } else { + $bline = "<p>\n"; + } + } + close CAT; + + print "</BODY>\n</HTML>\n"; + return; +} + +#------------------------------------------------------------------------------ + +sub HTML { + + my $text = shift; + + $text =~ s/\f/\n/gs; # replace form feed + $text =~ s/\s+/ /g; # replace multiple spaces, etc. 
with a single space + $text =~ s/\s+$//gm; # remove trailing space + $text =~ s/&/&/g; + $text =~ s/</</g; + $text =~ s/>/>/g; + chomp $text; + + return $text; +} + +#------------------------------------------------------------------------------ + +sub clean_pdf { +# removes odd pair of characters that may be in pdfinfo output +# Any double quotes are replaced with single + + my $text = shift; + chomp $text; + $text =~ s/\376\377//g; + $text =~ s/\"/\'/g; + return $text; +} diff --git a/debian/htdig/htdig-3.2.0b6/contrib/doc2html/swf2html.pl b/debian/htdig/htdig-3.2.0b6/contrib/doc2html/swf2html.pl new file mode 100755 index 00000000..5f0cdb07 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/doc2html/swf2html.pl @@ -0,0 +1,67 @@ +#!/usr/bin/perl -w +use strict; +# +# Version 1.1 17-May-2002 +# Written by David Adams <[email protected]> +# +# Uses swfparse utlity to extract URL's from Shockwave flash files +# +# Can be called directly from htdig as an external converter, +# or may be called by doc2html.pl converter script. +# + +####--- Configuration ---#### +# Full path of swfparse +# (get it from http:/www.htdig.org/files/contrib/contrib/parsers/) + +##### YOU MUST SET THIS #### + +my $SWFPARSE = "/.. .../swfdump"; + +####--- End of configuration ---### + +if (! 
-x $SWFPARSE) { die "Unable to execute swfparse" } + +my $Input = $ARGV[0] || die "Usage: swf2html.pl filename [mime-type] [URL]"; +my $MIME_type = $ARGV[1] || ''; +if ($MIME_type and ($MIME_type !~ m#^application/x-shockwave-flash#i)) { + die "MIME/type $MIME_type wrong"; +} + +my $Name = $ARGV[2] || ''; +$Name =~ s#^(.*/)##; +# decode if 2nd argument was a URL +$Name =~ s/%([A-F0-9][A-F0-9])/pack("C", hex($1))/gie if $1; + +print <<"HEAD"; +<HTML> +<HEAD> +<TITLE>SWF $Name</TITLE> +<META NAME="robots" CONTENT="follow, noindex"> +</HEAD> +HEAD + +open(CAT, "$SWFPARSE -t '$Input'|") || + die "$SWFPARSE doesn't want to be opened using pipe\n"; + +print "<BODY>\n"; +my $c = 0; +while (<CAT>) { +### if ($_ !~ m/\s+getUrl\s+(.*?)\s+.*$/) { next } + if ($_ !~ m/\s+getUrl\s+(.*)$/) { next } + my $link = $1 . ' '; + if ($link =~ m/^FSCommand:/) { next } + if ($link =~ m/\s+target\s+/) { + $link =~ s/^(.*)\s+target\s+.*$/$1/; + } else { + $link =~ s/^(.*?)\s+.*$/$1/; + } + print '<A href="', $link, '"> </a>', "\n"; + $c++; +} +close CAT; + +print "</BODY>\n</HTML>\n"; +print STDERR "No links extracted\n" if ($c == 0); + +exit; diff --git a/debian/htdig/htdig-3.2.0b6/contrib/doclist/doclist.pl b/debian/htdig/htdig-3.2.0b6/contrib/doclist/doclist.pl new file mode 100755 index 00000000..ef933de8 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/doclist/doclist.pl @@ -0,0 +1,183 @@ +#!/usr/local/bin/perl + +## +## doclist.pl (C) 1995 Andrew Scherpbier +## +## This program will list the information in the documentdb generated by htdig. 
+## + +use GDBM_File; + +$dbfile = $ARGV[0]; + +tie(%docdb, GDBM_File, $dbfile, GDBM_READER, 0) || die "Unable to open $dbfile: $!"; + + +while (($key, $value) = each %docdb) +{ + next if $key =~ /^nextDocID/; + %record = parse_ref_record($value); + print "Title: $record{'TITLE'}\n"; + print "Descriptions: $record{'DESCRIPTIONS'}\n"; + print "URL: $record{'URL'}\n"; + print "\n"; +} + +sub parse_ref_record +{ + local($value) = @_; + local(%rec, $length, $count, $result); + + while (length($value) > 0) + { + $what = unpack("C", $value); + $value = substr($value, 1); + if ($what == 0) + { + # ID + $rec{"ID"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 1) + { + # TIME + $rec{"TIME"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 2) + { + # ACCESSED + $rec{"ACCESSED"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 3) + { + # STATE + $rec{"STATE"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 4) + { + # SIZE + $rec{"SIZE"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 5) + { + # LINKS + $rec{"LINKS"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 6) + { + # IMAGESIZE + $rec{"IMAGESIZE"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 7) + { + # HOPCOUNT + $rec{"HOPCOUNT"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 8) + { + # URL + $length = unpack("i", $value); + $rec{"URL"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 9) + { + # HEAD + $length = unpack("i", $value); + $rec{"HEAD"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 10) + { + # TITLE + $length = unpack("i", $value); + $rec{"TITLE"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 11) + { + # DESCRIPTIONS + $count = unpack("i", $value); + 
$value = substr($value, 4); + $result = ""; + foreach (1 .. $count) + { + $length = unpack("i", $value); + $result = $result . unpack("x4 A$length", $value) . ""; + $value = substr($value, 4 + $length); + } + chop $result; + $rec{"DESCRIPTIONS"} = $result; + } + elsif ($what == 12) + { + # ANCHORS + $count = unpack("i", $value); + $value = substr($value, 4); + $result = ""; + foreach (1 .. $count) + { + $length = unpack("i", $value); + $result = $result . unpack("x4 A$length", $value) . ""; + $value = substr($value, 4 + $length); + } + chop $result; + $rec{"ANCHORS"} = $result; + } + elsif ($what == 13) + { + # EMAIL + $length = unpack("i", $value); + $rec{"EMAIL"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 14) + { + # NOTIFICATION + $length = unpack("i", $value); + $rec{"NOTIFICATION"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 15) + { + # SUBJECT + $length = unpack("i", $value); + $rec{"SUBJECT"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 16) + { + # STRING (ignore, but unpack) + $length = unpack("i", $value); + $rec{"STRING"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 17) + { + # METADSC + $length = unpack("i", $value); + $rec{"METADSC"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 18) + { + # BACKLINKS + $rec{"BACKLINKS"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 19) + { + # SIGNATURE + $rec{"SIG"} = unpack("i", $value); + $value = substr($value, 4); + } + } + return %rec; +} diff --git a/debian/htdig/htdig-3.2.0b6/contrib/doclist/listafter.pl b/debian/htdig/htdig-3.2.0b6/contrib/doclist/listafter.pl new file mode 100755 index 00000000..976cf333 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/doclist/listafter.pl @@ -0,0 +1,201 @@ +#!/usr/local/bin/perl + +## +## 
listafter.pl (C) 1996 Andrew Scherpbier +## +## This program will list all URLs which were modified after a specified date. +## For each URL, the following fields are displayed: +## Title +## Descriptions +## URL +## Last modification date (in ctime format) +## +## The date is specified as mm/dd/yyyy +## +## Example usage: +## listafter.pl 1/1/1996 /opt/www/htdig/sdsu.docdb +## + +use GDBM_File; +require('timelocal.pl'); + +$t = $ARGV[0]; +$t =~ m,([0-9]+)/([0-9]+)/([0-9]+),; +$when = timelocal(0, 0, 0, $2, $1 - 1, $3 - 1900); +$dbfile = $ARGV[1]; + +tie(%docdb, GDBM_File, $dbfile, GDBM_READER, 0) || die "Unable to open $dbfile: $!"; + +while (($key, $value) = each %docdb) +{ + next if $key =~ /^nextDocID/; + %record = parse_ref_record($value); + if ($record{'TIME'} >= $when) + { + print "Title: $record{'TITLE'}\n"; + print "Descriptions: $record{'DESCRIPTIONS'}\n"; + print "URL: $record{'URL'}\n"; + $w = localtime($record{'TIME'} * 1); + print "Modified: $w\n"; + print "\n"; + } +} + +sub parse_ref_record +{ + local($value) = @_; + local(%rec, $length, $count, $result); + + while (length($value) > 0) + { + $what = unpack("C", $value); + $value = substr($value, 1); + if ($what == 0) + { + # ID + $rec{"ID"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 1) + { + # TIME + $rec{"TIME"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 2) + { + # ACCESSED + $rec{"ACCESSED"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 3) + { + # STATE + $rec{"STATE"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 4) + { + # SIZE + $rec{"SIZE"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 5) + { + # LINKS + $rec{"LINKS"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 6) + { + # IMAGESIZE + $rec{"IMAGESIZE"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 7) + { + # HOPCOUNT + $rec{"HOPCOUNT"} = 
unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 8) + { + # URL + $length = unpack("i", $value); + $rec{"URL"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 9) + { + # HEAD + $length = unpack("i", $value); + $rec{"HEAD"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 10) + { + # TITLE + $length = unpack("i", $value); + $rec{"TITLE"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 11) + { + # DESCRIPTIONS + $count = unpack("i", $value); + $value = substr($value, 4); + $result = ""; + foreach (1 .. $count) + { + $length = unpack("i", $value); + $result = $result . unpack("x4 A$length", $value) . ""; + $value = substr($value, 4 + $length); + } + chop $result; + $rec{"DESCRIPTIONS"} = $result; + } + elsif ($what == 12) + { + # ANCHORS + $count = unpack("i", $value); + $value = substr($value, 4); + $result = ""; + foreach (1 .. $count) + { + $length = unpack("i", $value); + $result = $result . unpack("x4 A$length", $value) . 
""; + $value = substr($value, 4 + $length); + } + chop $result; + $rec{"ANCHORS"} = $result; + } + elsif ($what == 13) + { + # EMAIL + $length = unpack("i", $value); + $rec{"EMAIL"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 14) + { + # NOTIFICATION + $length = unpack("i", $value); + $rec{"NOTIFICATION"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 15) + { + # SUBJECT + $length = unpack("i", $value); + $rec{"SUBJECT"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 16) + { + # STRING (ignore, but unpack) + $length = unpack("i", $value); + $rec{"STRING"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 17) + { + # METADSC + $length = unpack("i", $value); + $rec{"METADSC"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 18) + { + # BACKLINKS + $rec{"BACKLINKS"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 19) + { + # SIGNATURE + $rec{"SIG"} = unpack("i", $value); + $value = substr($value, 4); + } + } + return %rec; +} diff --git a/debian/htdig/htdig-3.2.0b6/contrib/ewswrap/README b/debian/htdig/htdig-3.2.0b6/contrib/ewswrap/README new file mode 100644 index 00000000..0889e245 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/ewswrap/README @@ -0,0 +1,3 @@ +ewswrap.cgi = Excite for Web Servers (EWS) to htsearch wrapper +htwrap.cgi = htsearch wrapper to do some basic + sanity checking on the query diff --git a/debian/htdig/htdig-3.2.0b6/contrib/ewswrap/ewswrap.cgi b/debian/htdig/htdig-3.2.0b6/contrib/ewswrap/ewswrap.cgi new file mode 100755 index 00000000..f3f9419e --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/ewswrap/ewswrap.cgi @@ -0,0 +1,118 @@ +#!/usr/bin/perl -w + +# ewswrap.cgi +# +# by John Grohol ([email protected]) +# Freeware +# v1.00 - 5 Oct 1998 +# +# Simple wrapper script for 
htsearch to parse old +# Excite for Web Servers (EWS) forms as-is +# This only makes sense if your want to upgrade +# your search engine but can't upgrade every form +# which points to it (e.g., external sites are +# pointing to your EWS CGI. +# +# As an added bonus, given the differences from how +# EWS handles queries to how htsearch handles them, +# it does some basic sanity checking on the query +# and tries to re-form it into a valid htsearch query. +# +# This script must be called using the POST method! +# +#_______________________________________________________ +# Set some defaults here +# These can be overridden in the calling form + +$config = "htdig"; # htDig config file +$exclude = ""; # exclude this url +$restrict = ""; # restrict to this url +$format = "builtin-long"; # results format +$method = "and"; # default method +$dir = "/usr/httpd/cgi-bin"; # Set cgi-bin dir + +#_______________________________________________________ +# Rest of program + + $| = 1; + +# Get the form variables from POST form + + read(STDIN, $buffer, $ENV{'CONTENT_LENGTH'}); + @pairs = split(/&/, $buffer); + + foreach $pair (@pairs) { + ($name, $value) = split(/=/, $pair); + $value =~ tr/+/ /; + $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; + $value =~ s/<!--(.|\n)*-->//g; + $value =~ s/<([^>]|\n)*>//g; + $tags{$name} = $value; + } + +$squery = $tags{'search'}; # Set search query +$page = $tags{'page'}; +if (not($page)) { $page=1; } + + $squery =~ s/\+//g; + $squery =~ s/\-//g; + $squery =~ s/the//g; + $squery =~ s/not//g; + $squery =~ s/what//g; + +# If someone puts "and" or "or" in the query, +# then it should be a boolean query + + if (($squery =~ " and ") || ($squery =~ " or ")) { + $method = "boolean"; } + +# Count the number of words in the query + + @words = split(/ /,$squery); + foreach $word (@words) { $xwd++; } + +# If there are quotes in the query, we have to +# turn them into parantheses and make it boolean + +if (($squery =~ "\"")) { + $oo = 
(index($squery,"\""))+1; + $od = (index($squery,"\"",$oo))-1; + $op = $od - $oo +1; + $yty = substr($squery,$oo,$op); + @wrds = split(/ /,$yty); + foreach $wrd (@wrds) { $xww++; } + + if ($xww eq 2) { # Right now, can only handle 2-word phrases + $oi = (index($yty," ")); + if ($oi > -1) { + $ytt = substr($yty,0,$oi); + $john = $od - $oi +1; + $yte = substr($yty,$oi+1,$john); + $james = substr($squery,$od+2); + $james =~ s/ and//g; + $james =~ s/ / and /g; + $squery = "($ytt and $yte) $james"; # We turn it into a + $method = "boolean"; # boolean query + } + +# More than 2 words in quotes (phrase), just +# turn it into one big string of words and set method to "and" + + } else { + $squery =~ s/\"//g; + $squery =~ s/ and//g; + $method = "and"; + $yty = ""; + } +} + +# Set the environmental variables + +$ENV{'REQUEST_METHOD'} = 'GET'; +$ENV{'QUERY_STRING'} = "config=$config&restrict=$restrict&exclude=$exclude&words=$squery&method=$method&format=$format&page=$page"; + +# Run htsearch + +system("$dir/htsearch"); + +1; diff --git a/debian/htdig/htdig-3.2.0b6/contrib/ewswrap/htwrap.cgi b/debian/htdig/htdig-3.2.0b6/contrib/ewswrap/htwrap.cgi new file mode 100755 index 00000000..1e7ea66b --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/ewswrap/htwrap.cgi @@ -0,0 +1,125 @@ +#!/usr/bin/perl -w + +# htwrap.cgi +# +# by John Grohol ([email protected]) +# Freeware +# v1.00 - 5 Oct 1998 +# +# Simple wrapper script for htsearch to +# do some basic sanity checking on the query +# and tries to re-form it into a valid htsearch query. +# +# This script must be called using the GET method! 
+# +#_______________________________________________________ +# Set some defaults here +# These can be overridden in the calling form + +$config = "htdig"; # htDig config file +$exclude = ""; # exclude this url +$restrict = ""; # restrict to this url +$format = "builtin-long"; # results format +$method = "and"; # default method +$dir = "/usr/httpd/cgi-bin"; # Set cgi-bin dir + +#_______________________________________________________ +# Rest of program + + $| = 1; + +# Get the form variables + +&ParseTags($ENV{'PATH_INFO'}); +&ParseTags($ENV{'QUERY_STRING'}); + +$squery = $tags{'words'}; +$restrict = $tags{'restrict'}; +$method = $tags{'method'}; +$format = $tags{'format'}; +$page = $tags{'page'}; + +if (not($page)) { $page=1; } + + $squery =~ s/\+//g; + $squery =~ s/\-//g; + $squery =~ s/the//g; + $squery =~ s/not//g; + $squery =~ s/what//g; + +# If someone puts "and" or "or" in the query, +# then it should be a boolean query + + if (($squery =~ " and ") || ($squery =~ " or ")) { + $method = "boolean"; } + +# How many words are there in the query? 
+ @words = split(/ /,$squery); + foreach $word (@words) { $xwd++; } + +# If there are quotes in the query, we have to +# turn them into parantheses and make it boolean + +if (($squery =~ "\"")) { + $oo = (index($squery,"\""))+1; + $od = (index($squery,"\"",$oo))-1; + $op = $od - $oo +1; + $yty = substr($squery,$oo,$op); + @wrds = split(/ /,$yty); + foreach $wrd (@wrds) { $xww++; } + + + if ($xww eq 2) { # Right now, can only handle 2-word phrases + $oi = (index($yty," ")); + if ($oi > -1) { + $ytt = substr($yty,0,$oi); + $john = $od - $oi +1; + $yte = substr($yty,$oi+1,$john); + $james = substr($squery,$od+2); + $james =~ s/ and//g; + $james =~ s/ / and /g; + $squery = "($ytt and $yte) $james"; # We turn it into a + $method = "boolean"; # boolean query + } + +# More than 2 words in quotes (phrase), just +# turn it into one big string of words and set method to "and" + + } else { + $squery =~ s/\"//g; # Dump quotes + $squery =~ s/ and//g; # Dump and's + $squery =~ s/ or//g; # Dump or's + $method = "and"; + $yty = ""; + } +} + +# Set the environmental variables + +$ENV{'REQUEST_METHOD'} = 'GET'; +$ENV{'QUERY_STRING'} = "config=$config&restrict=$restrict&exclude=$exclude&words=$squery&method=$method&format=$format&page=$page" +; + +# Run htsearch + +system("$dir/htsearch"); + +exit; + +sub ParseTags { + local($_) = @_; + local(@terms, $tag, $val); + s|^/||; + @terms = split('&'); + foreach $term (@terms) { + ($tag,$val) = split('=',$term,2); + $val =~ tr/+/ /; + $val =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; + $val =~ s/<!--(.|\n)*-->//g; + $val =~ s/<([^>]|\n)*>//g; + # may override previous value + $tags{$tag} = $val; + } +} + +1; diff --git a/debian/htdig/htdig-3.2.0b6/contrib/examples/badwords b/debian/htdig/htdig-3.2.0b6/contrib/examples/badwords new file mode 100644 index 00000000..9912e646 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/examples/badwords @@ -0,0 +1,349 @@ +a +above +about +according +across +actually +adj +after +afterwards 
+again +against +all +almost +alone +along +already +also +although +always +among +amongst +an +and +another +any +anyhow +anyone +anything +anywhere +are +aren +arent +around +as +at +be +became +because +become +becomes +becoming +been +before +beforehand +begin +beginning +behind +being +below +beside +besides +between +beyond +billion +both +but +by +can +cant +cannot +caption +co +could +couldnt +did +didnt +do +does +doesnt +dont +down +during +each +eg +eight +eighty +either +else +elsewhere +end +ending +enough +etc +even +ever +every +everyone +everything +everywhere +except +few +fifty +first +five +for +former +formerly +forty +found +four +from +further +had +has +hasnt +have +havent +he +hence +her +here +hereafter +hereby +herein +heres +hereupon +hers +herself +hes +him +himself +his +how +however +hundred +ie +if +in +inc +indeed +instead +into +is +isnt +it +its +itself +last +later +latter +latterly +least +less +let +like +likely +ltd +made +make +makes +many +may +maybe +me +meantime +meanwhile +might +million +miss +more +moreover +most +mostly +mr +mrs +much +must +my +myself +namely +neither +never +nevertheless +next +nine +ninety +no +nobody +none +nonetheless +noone +nor +not +nothing +now +nowhere +of +off +often +on +once +one +only +onto +or +others +otherwise +our +ours +ourselves +out +over +overall +own +page +per +perhaps +rather +re +recent +recently +same +seem +seemed +seeming +seems +seven +seventy +several +she +shes +should +shouldnt +since +six +sixty +so +some +somehow +someone +something +sometime +sometimes +somewhere +still +stop +such +taking +ten +than +that +the +their +them +themselves +then +thence +there +thereafter +thereby +therefore +therein +thereupon +these +they +thirty +this +those +though +thousand +three +through +throughout +thru +thus +tips +to +together +too +toward +towards +trillion +twenty +two +under +unless +unlike +unlikely +until +up +update +updated +updates +upon +us +used +using +ve +very +via 
+want +wanted +wants +was +wasnt +way +ways +we +wed +well +were +werent +what +whats +whatever +when +whence +whenever +where +whereafter +whereas +whereby +wherein +whereupon +wherever +wheres +whether +which +while +whither +who +whoever +whole +whom +whomever +whose +why +will +with +within +without +wont +work +worked +works +working +would +wouldnt +yes +yet +you +youd +youll +your +youre +yours +yourself +yourselves +youve diff --git a/debian/htdig/htdig-3.2.0b6/contrib/examples/rundig.sh b/debian/htdig/htdig-3.2.0b6/contrib/examples/rundig.sh new file mode 100644 index 00000000..7a78955d --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/examples/rundig.sh @@ -0,0 +1,96 @@ +#! /bin/sh + +# rundig.sh +# a script to drive ht://Dig updates +# Copyright (c) 1998 Colin Viebrock <[email protected]> +# Copyright (c) 1998-1999 Geoff Hutchison <[email protected]> +# Updated for ht://Dig 3.2.0b3 Feb 2001, Copyright (c) 2001 Geoff Hutchison +# Distributed under the GNU GPL version 2 or later + +if [ "$1" = "-v" ]; then + verbose="-v" +fi + +# This is the directory where htdig lives +BASEDIR=/export/htdig + +# This is the db dir +DBDIR=$BASEDIR/db/ + +# This is the name of a temporary report file +REPORT=/tmp/htdig.report + +# This is who gets the report +REPORT_DEST="[email protected]" +export REPORT_DEST + +# This is the subject line of the report +SUBJECT="cron: htdig report for domain" + +# This is the name of the conf file to use +CONF=htdig.conf + +# This is the directory htdig will use for temporary sort files +TMPDIR=$DBDIR +export TMPDIR + +# This is the PATH used by this script. Change it if you have problems +# with not finding wc or grep. 
+PATH=/usr/local/bin:/usr/bin:/bin + +##### Dig phase +STARTTIME=`date` +echo Start time: $STARTTIME +echo rundig: Start time: $STARTTIME > $REPORT +$BASEDIR/bin/htdig $verbose -s -a -c $BASEDIR/conf/$CONF >> $REPORT +TIME=`date` +echo Done Digging: $TIME +echo rundig: Done Digging: $TIME >> $REPORT + +##### Purge Phase +# (clean out broken links, etc.) +$BASEDIR/bin/htpurge $verbose -a -c $BASEDIR/conf/$CONF >> $REPORT +TIME=`date` +echo Done Purging: $TIME +echo rundig: Done Purging: $TIME >> $REPORT + +##### Cleanup Phase +# To enable htnotify or the soundex search, uncomment the following lines +# $BASEDIR/bin/htnotify $verbose >>$REPORT +# $BASEDIR/bin/htfuzzy $verbose soundex +# To get additional statistics, uncomment the following line +# $BASEDIR/bin/htstat $verbose >>$REPORT + +# Move 'em into place. Since these are only used by htdig for update digs +# and we always use -a, we just leave them as .work +# mv $DBDIR/db.docs.index.work $DBDIR/db.docs.index +# (this is just a mapping from a URL to a DocID) +# We need the .work for next time as an update dig, plus the copy for searching +cp $DBDIR/db.docdb.work $DBDIR/db.docdb +cp $DBDIR/db.excerpts.work $DBDIR/db.excerpts +cp $DBDIR/db.words.db.work $DBDIR/db.words.db +test -f $DBDIR/db.words.db.work_weakcmpr && + cp $DBDIR/db.words.db.work_weakcmpr $DBDIR/db.words.db_weakcmpr + +END=`date` +echo End time: $END +echo rundig: End time: $END >> $REPORT +echo + +# Grab the important statistics from the report file +# All lines begin with htdig: or htmerge: +fgrep "htdig:" $REPORT +echo +fgrep "htmerge:" $REPORT +echo +fgrep "rundig:" $REPORT +echo + +WC=`wc -l $REPORT` +echo Total lines in $REPORT: $WC + +# Send out the report ... +mail -s "$SUBJECT - $STARTTIME" $REPORT_DEST < $REPORT + +# ... 
and clean up +rm $REPORT diff --git a/debian/htdig/htdig-3.2.0b6/contrib/examples/updatedig b/debian/htdig/htdig-3.2.0b6/contrib/examples/updatedig new file mode 100755 index 00000000..1bcc3e08 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/examples/updatedig @@ -0,0 +1,53 @@ +#! /bin/sh + +# +# updatedig +# +# This is a script to update the search database for ht://Dig. +# Copyright (c) 1998 David Robley [email protected] +# +if [ "$1" = "-v" ]; then + verbose=-v +fi + +# -a: run using alternate work files so search can still be done during index run +# -t: create an ASCII version of document database in doc_list as specified +# in the config file +# -s: print stats after completion +/web/webdocs/htdig/bin/htdig -a -t $verbose -s +/web/webdocs/htdig/bin/htmerge -a $verbose -s +/web/webdocs/htdig/bin/htnotify $verbose + +# Because the -a switch creates alternate work files, but doesn't seem to move +# them into the correct place, we will do it here. +mv /web/webdocs/htdig/db/db.docdb /web/webdocs/htdig/db/db.docdb.old +mv /web/webdocs/htdig/db/db.docdb.work /web/webdocs/htdig/db/db.docdb + +mv /web/webdocs/htdig/db/db.docs.index /web/webdocs/htdig/db/db.docs.index.old +mv /web/webdocs/htdig/db/db.docs.index.work /web/webdocs/htdig/db/db.docs.index + +mv /web/webdocs/htdig/db/db.wordlist /web/webdocs/htdig/db/db.wordlist.old +mv /web/webdocs/htdig/db/db.wordlist.work /web/webdocs/htdig/db/db.wordlist + +mv /web/webdocs/htdig/db/db.words.gdbm /web/webdocs/htdig/db/db.words.gdbm.old +mv /web/webdocs/htdig/db/db.words.gdbm.work /web/webdocs/htdig/db/db.words.gdbm + +# +# Only create the endings database if it doesn't already exist. +# This database is static, so even if pages change, this database will not +# need to be rebuilt. +# +if [ ! 
-f /web/webdocs/htdig/common/word2root.gdbm ] +then + /web/webdocs/htdig/bin/htfuzzy $verbose endings +fi + +# This next needs to be run if synonyms are added/modified/removed +# Guess the best way would be to delete synonyms.gdbm before +# running this script?? + +if [ ! -f /web/webdocs/htdig/common/synonyms.gdbm ] +then + /web/webdocs/htdig/bin/htfuzzy $verbose synonyms +fi +# end updatedig diff --git a/debian/htdig/htdig-3.2.0b6/contrib/handler.pl b/debian/htdig/htdig-3.2.0b6/contrib/handler.pl new file mode 100755 index 00000000..53ec7f34 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/handler.pl @@ -0,0 +1,45 @@ +#!/usr/bin/perl +# +# handler.pl +# Sample ExternalTransport handler for HTTP and HTTPS using curl +# for the ht://Dig package 3.2.x and higher +# by Geoffrey Hutchison <[email protected]> +# Copyright (c) 1999 under the terms of the GNU Public License vesion 2 (GPL) +# +# handler.pl protocol url config_file +# +# Really a simplistic example--this should probably use Perl's LWP for HTTP/HTTPS/FTP +# Right now it uses the program 'curl' to do HTTP or HTTPS transactions. 
+# + +my $curl_path="/usr/local/bin/curl"; +my $protocol=$ARGV[0]; +my $url=$ARGV[1]; +my $config_file=$ARGV[2]; + +open (DOC, "$curl_path -i $url |") || die "s:\t404\nr:\tCan't open curl!\n"; +while ( my $line = <DOC> ) { + if ( $line =~ /^HTTP.?\/\d.\d\s(\d\d\d)\s(.*)/io ) { + print "s:\t$1\n"; + print "r:\t$2\n"; + } elsif ( $line =~ /^last-modified: (.*)$/io ) { + print "m:\t$1\n"; + } elsif ( $line =~ /^content-type: (.*)$/io ) { + print "t:\t$1\n"; + } elsif ( $line =~ /^content-length: (.*)$/io ) { + print "l:\t$1\n"; + } elsif ( $line =~ /^location: (.*)$/io ) { + print "u:\t$1\n"; + } + + last if ( $line =~ /^\s*$/ ) +} + +local($/) = undef; +my $text = <DOC>; +close(DOC); + +print "\n$text"; + + + diff --git a/debian/htdig/htdig-3.2.0b6/contrib/htdig-3.2.0.spec b/debian/htdig/htdig-3.2.0b6/contrib/htdig-3.2.0.spec new file mode 100644 index 00000000..1631164f --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/htdig-3.2.0.spec @@ -0,0 +1,184 @@ +# Last definitions below override, so change the order to redefine. You can't +# comment them out because %defines are parsed inside comments. +# For Red Hat [456].x... +%define contentdir /home/httpd +%define commondir /var/lib/htdig/common +%define databasedir /var/lib/htdig/db +%define searchdir %{contentdir}/html +%define configdir /etc/htdig +%define bindir /usr/sbin +%define mandir /usr/man +%define docdir /usr/doc +# For Red Hat [789].x, FCx... 
+%define contentdir /var/www +%define commondir %{_prefix}/share/htdig +%define databasedir /var/lib/htdig +%define searchdir %{contentdir}/html/htdig +%define configdir %{_sysconfdir}/htdig +%define bindir %{_bindir} +%define mandir %{_mandir} +%define docdir %{_docdir} +Summary: A web indexing and searching system for a small domain or intranet +Name: htdig +Version: 3.2.0b6 +Release: 8 +Copyright: GPL +Group: Networking/Utilities +BuildRoot: /var/tmp/htdig-root +Source0: http://www.htdig.org/files/htdig-%{PACKAGE_VERSION}.tar.gz +URL: http://www.htdig.org/ +Packager: Gilles Detillieux <[email protected]> + +%description +The ht://Dig system is a complete world wide web indexing and searching +system for a small domain or intranet. This system is not meant to replace +the need for powerful internet-wide search systems like Lycos, Infoseek, +Webcrawler and AltaVista. Instead it is meant to cover the search needs for +a single company, campus, or even a particular sub section of a web site. + +As opposed to some WAIS-based or web-server based search engines, ht://Dig +can span several web servers at a site. The type of these different web +servers doesn't matter as long as they understand the HTTP 1.0 protocol. 
+%prep +%setup -q -n htdig-%{PACKAGE_VERSION} +#%patch0 -p0 -b .noparse + +%build +CFLAGS="$RPM_OPT_FLAGS" ./configure --prefix=/usr --mandir=%{mandir} \ + --bindir=%{bindir} --libexec=/usr/lib --libdir=/usr/lib \ + --with-image-dir=%{contentdir}/html/htdig \ + --with-cgi-bin-dir=%{contentdir}/cgi-bin \ + --with-search-dir=%{searchdir} \ + --with-config-dir=%{configdir} \ + --with-common-dir=%{commondir} \ + --with-database-dir=%{databasedir} +#rm -f htlib/langinfo.h # conflicts with libc5 headers +#echo '#include "/usr/include/langinfo.h"' > htlib/langinfo.h # to keep htlib/Makefile happy +make + +%install + +rm -rf $RPM_BUILD_ROOT + +make DESTDIR=$RPM_BUILD_ROOT install-strip +mkdir -p $RPM_BUILD_ROOT/etc/cron.daily +ln -s ../..%{bindir}/rundig $RPM_BUILD_ROOT/etc/cron.daily/htdig-dbgen +ln -s ../../../..%{docdir}/htdig-%{PACKAGE_VERSION} \ + $RPM_BUILD_ROOT%{contentdir}/html/htdig/htdoc + +%clean +rm -rf $RPM_BUILD_ROOT + +%post +# Only run this if installing for the first time +if [ "$1" = 1 ]; then + SERVERNAME="`grep '^ServerName' /etc/httpd/conf/httpd.conf | awk 'NR == 1 {print $2}'`" + [ -z "$SERVERNAME" ] && SERVERNAME="`hostname -f`" + [ -z "$SERVERNAME" ] && SERVERNAME="localhost" + TMPFILE=$(mktemp /tmp/ht.XXXXXX) || exit 1 + sed 's/^start_url:.*/#&\ +# (See end of file for this parameter.)/' %{configdir}/htdig.conf > $TMPFILE + cat $TMPFILE > %{configdir}/htdig.conf + rm $TMPFILE + cat >> %{configdir}/htdig.conf <<! + +# Automatically set up by htdig RPM, from your current Apache httpd.conf... +# Verify and configure these, and set maintainer above, before running +# %{bindir}/rundig. +# See %{docdir}/htdig*/attrs.html for descriptions of attributes. + +# The URL(s) where htdig will start. See also limit_urls_to above. +start_url: http://$SERVERNAME/ + +# These attributes allow indexing server via local filesystem rather than HTTP. +local_urls: http://$SERVERNAME/=%{contentdir}/html/ +local_user_urls: http://$SERVERNAME/=/home/,/public_html/ +! 
+ +fi + +%files +%defattr(-,root,root) +%config %{configdir}/htdig.conf +%config %{configdir}/mime.types +%config %{configdir}/HtFileType-magic.mime +%config %{configdir}/cookies.txt +%config %{bindir}/rundig +%config %{searchdir}/search.html +%config %{commondir}/[a-rt-z]*.html +%config %{commondir}/s[a-df-z]*.html +%config %{commondir}/english* +%config %{commondir}/synonyms +%config %{commondir}/bad_words +%config(missingok) /etc/cron.daily/htdig-dbgen +%{bindir}/[Hh]t* +/usr/lib/* +/usr/include/* +%dir %{databasedir} +%{contentdir}/cgi-bin/htsearch +%{contentdir}/cgi-bin/qtest +%{contentdir}/html/htdig/*.gif +%{contentdir}/html/htdig/*.png +%{contentdir}/html/htdig/htdoc +%{mandir}/man* + +%doc README htdoc/* + +%changelog +* Thu Jun 10 2004 Gilles Detillieux <[email protected]> + - built with 3.2.0b6, adding man pages & include files + - updated pathnames for current systems (/usr/share/htdig for common dir) + - used variable for configdir, mandir & docdir + - used mktemp to create safe temp file in post script + +* Wed Jul 4 2001 Gilles Detillieux <[email protected]> + - used variables for many pathnames, to allow easy switchover to 7.x + (using Powertools-like pathnames for Red Hat 7) + +* Thu Jun 7 2001 Gilles Detillieux <[email protected]> + - updated to 3.2.0b4 + +* Fri Dec 1 2000 Gilles Detillieux <[email protected]> + - updated to 3.2.0b3 + +* Mon Feb 21 2000 Gilles Detillieux <[email protected]> + - fixed post script to add more descriptive entries in htdig.conf + - made cron script a config file + - updated to 3.2.0b2 + +* Thu Feb 3 2000 Gilles Detillieux <[email protected]> + - added mime.types as a config file + +* Mon Jan 17 2000 Gilles Detillieux <[email protected]> + - updated to 3.2.0b1 + +* Fri Aug 13 1999 Gilles Detillieux <[email protected]> + - changed configure & install options and got rid of conf.patch file + to work with latest 3.2 code + +* Mon Jun 7 1999 Gilles Detillieux <[email protected]> + - fixed post script to use only first 
ServerName directive in httpd.conf + +* Tue Mar 23 1999 Gilles Detillieux <[email protected]> + - updated to 3.2.0dev, for testing + +* Thu Feb 4 1999 Gilles Detillieux <[email protected]> + - put web stuff back in /home/httpd/html & /home/httpd/cgi-bin, so it can + go over a standard Apache installation on Red Hat + - cleaned up install to make use of new features + +* Thu Feb 4 1999 Ric Klaren <[email protected]> + - changed buildroot stuff + - minor spec file fixes + - install web stuff in /home/httpd/htdig + - made rundig config file + +* Tue Sep 22 1998 Gilles Detillieux <[email protected]> + - Added local_urls stuff to generated htdig.conf file + +* Fri Sep 18 1998 Gilles Detillieux <[email protected]> + - Built the rpm from latest htdig source (3.1.0b1), using earlier + versions of rpms by Mihai Ibanescu <[email protected]> and Elliot Lee + <[email protected]> as a model, incorporating ideas from both. I've + made the install locations as FSSTND compliant as I can think of. + diff --git a/debian/htdig/htdig-3.2.0b6/contrib/htparsedoc/README b/debian/htdig/htdig-3.2.0b6/contrib/htparsedoc/README new file mode 100644 index 00000000..4ec0f6ab --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/htparsedoc/README @@ -0,0 +1,38 @@ + +> Subject: htdig: HTDIG: Searching Word files +> To: [email protected] +> From: Richard Jones <[email protected]> +> Date: Tue, 15 Jul 1997 12:44:03 +0100 +> +> I'm currently trying to hack together a script to search +> Word files. I have a little program called `catdoc' (attached) +> which takes Word files and turns them into passable text files. +> What I did was write a shell script around this called +> `htparsedoc' (also attached) and add it as an external +> parser: +> +> --- /usr/local/lib/htdig/conf/htdig.conf --- +> +> # External parser for Word documents. 
+> external_parsers: "applications/msword" +> "/usr/local/lib/htdig/bin/htparsedoc" +> +> This script produces output like this: +> +> t Word document http://annexia.imcl.com/test/comm.doc +> w INmEDIA 1 - +> w Investment 2 - +> w Ltd 3 - +> w Applications 4 - +> w Subproject 5 - +> w Terms 6 - +> w of 7 - +> [...] +> w Needed 994 - +> w Tbd 995 - +> w Resources 996 - +> w Needed 997 - +> w Tbd 998 - +> w i 1000 - +> + diff --git a/debian/htdig/htdig-3.2.0b6/contrib/htparsedoc/catdoc.c b/debian/htdig/htdig-3.2.0b6/contrib/htparsedoc/catdoc.c new file mode 100644 index 00000000..93bf02f8 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/htparsedoc/catdoc.c @@ -0,0 +1,197 @@ + +From [email protected] Fri Jul 3 09:52:34 1998 +Date: Fri, 3 Jul 1998 17:20:50 +0200 (MET DST) +From: Valerio Di Giampietro <[email protected]> +To: [email protected] +Subject: htdig: Searching Word files +/* catdoc.c version 0.3 */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#define TEXT_WIDTH 72 +/* #define LATIN1 */ +/* enable this define, if you don't want cyrillic code page translations */ + +unsigned char specs[]={7, /* tab columns separator - handled specially*/ + '\n',/* hook to handle end of line in tables */ + 0x1E,/* unbreakable defis */ + 0x1F,/* soft hyphen */ + 0x85,/* dots */ + 0x91,/* opening single quote */ + 0x92,/* closing single quote */ + 0x93,/* opening double quote */ + 0x94,/* closing double quote */ + 0x96,/* em-dash (or em-space)*/ + 0x97,/* en-dash */ + 0x99,/* Trade Mark sign */ + 0xA0,/* unbreakable space */ + 0xA9,/* Copyright sign */ + 0xAE,/* Reserved sign */ + 0xAB,/* opening << quote*/ + 0xBB,/* closing >> quote*/ + /* The rest is translated into itself unless TeX mode is selected */ + '%','$','_','{','}','\\', + }; + +char *ascii_specs[]={"\t","\n","-","","...","`","'","``","''","-","-","tm", + " ","(c)","(R)","\"","\"","%","$","_","{","}","\\"}; +char *TeX_specs[]={"\t&","\\\\\n","-","\\-","\\dots{}","`","'","``","''","---","--", 
+"${}^{\\scriptscriptstyle\\mathrm{TM}}$",/* this is my idea about tm sign*/ +"~", +"{\\copyright}", +"(R)",/* to be replaced with correct command */ +"<",">","\\%","\\$","$\\{$","$\\}$","$\\backslash$",}; +#ifndef LATIN1 +#ifdef unix +unsigned char table[256]={ +/* Windows cyrillic code page to KOI-8 */ +0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0D,0x0C,0x0D,0x0E,0x0F, +0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x2D,0x20, +0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F, +0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F, +0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F, +0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F, +0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F, +0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F, +0x80,0x81,0x82,0xAA,0x8F,0x90,0xA9,0x93,0x84,0x92,0x91,0x94,0x83,0x95,0x99,0x8B, +0x98,0x60,0x27,0x22,0x22,0x9A,0x2D,0x2D,0x9E,0xA6,0x87,0xB0,0x8D,0x97,0x86,0xA2, +0x20,0xA7,0xA5,0x88,0xA4,0x8E,0x96,0x85,0xB3,0xA1,0x9F,0x22,0xAB,0xAC,0xAD,0xAE, +0xAF,0xB2,0xB1,'i',0xB5,0xB6,0xB7,0xB8,0xA3,0xB9,0xBA,0x22,0xBC,0xBD,0xBE,0x9B, +0xE1,0xE2,0xF7,0xE7,0xE4,0xE5,0xF6,0xFA,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,0xF0, +0xF2,0xF3,0xF4,0xF5,0xE6,0xE8,0xE3,0xFE,0xFB,0xFD,0xFF,0xF9,0xF8,0xFC,0xE0,0xF1, +0xC1,0xC2,0xD7,0xC7,0xC4,0xC5,0xD6,0xDA,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,0xD0, +0xD2,0xD3,0xD4,0xD5,0xC6,0xC8,0xC3,0xDE,0xDB,0xDD,0xDF,0xD9,0xD8,0xDC,0xC0,0xD1}; +#else +unsigned char table[256]={ +0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0D,0x0c,0x0d,0x0e,0x0f, +0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x2D,0x20, +0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f, +0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f, 
+0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f, +0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f, +0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f, +0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f, +0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f, +0x90,0x60,0x27,0x22,0x22,0x95,0x2D,0x2D,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f, +0x20,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0x22,0xac,0xad,0xae,0xaf, +0xb0,0xb1,0xb2,0xb3,'i',0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0x22,0xbc,0xbd,0xbe,0xbf, +0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f, +0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f, +0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf, +0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef}; +#endif +#define recode_char(x) table[x] +#else +#define recode_char(x) x +#endif +char *map_char(char **map,int c) + +{unsigned char *ptr; + static char buffer[2]="a"; + if ((ptr=strchr(specs,c))) + return map[ptr-specs]; + else + { buffer[0]=recode_char(c); return buffer; } +} +void format(char *buf,char **map) +{ unsigned char outstring[128]=""; + unsigned char *sp=buf,*dp;int table=0; + while (*sp) + { if (*sp==7&&table) + { printf("%s%s",outstring,map_char(map,'\n')); + outstring[0]=0; + table=0;sp++; + } + else + { if (strlen(strcat(outstring,map_char(map,*sp)))>TEXT_WIDTH) + { dp=strrchr(outstring,' '); + if (dp) + { *(dp++)=0; + printf("%s\n",outstring); + strcpy(outstring,dp); + } + else + { int i; + for(i=0;i<72;i++) putc(outstring[i],stdout); + putc('\n',stdout); + strcpy(outstring,outstring+72); + } + } + table=*(sp++)==7; + } + } +if (outstring[0]==0) putc('\n',stdout); + else printf("%s\n\n",outstring); + +} +void help(void) +{ printf("catdoc - exctract text from MS-Word files and 
catenate it to stdout\n" + "Copyright (c) by Victor B. Wagner, 1996\n" + "Usage catdoc [-ast] files ...\n" + "\t-a - converts non-standard printable chars into readable form (default)\n" + "\t-t - converts them into TeX control sequences\n" + "\t-s - exits with code 1 if MSWordDoc signature not found before\n" + "\t\tfirst printable paragraph\n\n" + "All options affects only files, specified AFTER them\n"); + exit(2); +} + +char buf[8192]; +void do_file(FILE *f,char **map,int search_sign) +{ int ok=!search_sign; + int bufptr,c; + while(!feof(f)) + {bufptr=-1; + do { + c=getc(f); + /* Special printable symbols 7- table separator \r - paragraph end + 0x1E - short defis */ + if ((c<=255&&c>=32)||c==7||c=='\t'||c=='\r'||c==0x1E) + buf[++bufptr]=c; + else + if (c==0x0b) buf[++bufptr]='\r'; + else + { if (!c) {buf[++bufptr]=0; + if(!strcmp(buf,"MSWordDoc")) + { ok=1; } + } + if (c!=2) bufptr=-1;/* \002 is Word's footnote mark */ + } + } while (c!='\r'&&c!=EOF); + if (bufptr>0&&buf[bufptr]=='\r') + { if (!ok) exit( 1); + buf[bufptr]=0; format(buf,map); + } + } +} + +int main(int argc,char **argv) +{ int search_sign =0; /* Must program exit with exit code 1 if MSWordDoc + signature is not found? 
*/ + char **sequences=ascii_specs;/* pointer to array of character sequences + to represent special characters of Word */ + int i=1,stdin_processed=0; + if (argc<2) help(); + for(;i<argc;i++) + { if (!strcmp(argv[i],"-s")) search_sign=1; + else + if (!strcmp(argv[i],"-t")) sequences=TeX_specs; + else + if (!strcmp(argv[i],"-a")) sequences=ascii_specs; + else + if (!strcmp(argv[i],"-")) + if (!stdin_processed) {do_file(stdin,sequences,search_sign); + stdin_processed=1;} + else { fprintf(stderr,"Cannot process standard input twice a row\n"); + exit (2);} + else + if (argv[i][0]=='-') {fprintf(stderr,"Invalid option %s\n",argv[i]); + help();} + else + { FILE *f=fopen(argv[i],"r"); + if(!f) {fprintf(stderr,"Cannot open file %s\n",argv[i]);exit(2);} + do_file(f,sequences,search_sign); + } + } + return 0; +} diff --git a/debian/htdig/htdig-3.2.0b6/contrib/htparsedoc/htparsedoc b/debian/htdig/htdig-3.2.0b6/contrib/htparsedoc/htparsedoc new file mode 100755 index 00000000..9d47e85d --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/htparsedoc/htparsedoc @@ -0,0 +1,72 @@ +#!/bin/sh - + +#-- +# External parser for HTDIG that parses Word files so they can +# be indexed. +#-- +# Written by Richard W.M. Jones <[email protected]>. Distributed freely +# under the terms of the GNU General Public License (GPL). +# Modified by Andrew M. Bishop <[email protected]> +#-- + +#---------------------------------------------------------------------- +# Configurable stuff here: + +# The program that converts Word files into text. I use ``catdoc'' +# by Victor Wagner <[email protected]>. You may wish to just use +# ``strings''. +CATDOC=/usr/local/bin/catdoc +#CATDOC=strings + +# End of configurable stuff. 
+#---------------------------------------------------------------------- + +# Arguments are: +# $1 = input file +# $2 = content type (ignored) +# $3 = base URL +# $4 = HTDIG config file (ignored) +# HTDIG expects us to print out: +# w WORD LOCATION HEADING Word at location 0-1000 under heading +# u URL DESCRIPTION URL with description +# t TITLE Title of document +# h HEAD Heading +# a ANCHOR Anchor (ie. like <a name="">) +# i IMAGE_URL Image pointer + +#---------------------------------------------------------------------- + +# Format input to word per line. + +wordPerLine () { + tr '[ \010]' '\012' | awk 'NF==1 {print;}' +} + +# Change non-alphabetical/numeric characters in space. + +removeNonAlNum () { + tr -c '[a-zA-Z0-9\015]' ' ' +} + +#---------------------------------------------------------------------- + +# Parse input file to linear list of words. +$CATDOC $1 | removeNonAlNum | wordPerLine > /tmp/htparsedoc.$$ + +# Compute length of list. +filelen=`wc -l < /tmp/htparsedoc.$$` + +# We can't find the title from the document, so make one up. +echo "t Binary Document $3" + +# We can't make an excerpt so we make one up. +echo "h No excerpt available" + +# Pass words to htdig. +if [ $filelen -gt 0 ]; then + awk "{printf (\"w\t%s\t%d\t-\t\n\", \$1, 1000*NR/$filelen);}" \ + < /tmp/htparsedoc.$$ +fi + +# Remove temporary file. +rm /tmp/htparsedoc.$$ diff --git a/debian/htdig/htdig-3.2.0b6/contrib/multidig/Makefile b/debian/htdig/htdig-3.2.0b6/contrib/multidig/Makefile new file mode 100644 index 00000000..c2dc4857 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/multidig/Makefile @@ -0,0 +1,58 @@ +# +# Makefile for the multidig system +# +# Copyright (c) 1998-2000 The ht://Dig Group +# Distributed under the terms of the GNU General Public License (GPL) +# version 2 or later. 
+# for the ht://Dig search system http://www.htdig.org/ +# and the multidig script system http://www.htdig.org/contrib/scripts/ +# + + +# +# You probably want to change some or all of these. +# BASH = location of bash or other Bourne-like shell with 'source' builtin +# BASEDIR = directory of ht://Dig installation +# These should probably be OK. +# BINDIR = directory of ht://Dig binaries. Also destination for these scripts. +# CONFIG_DIR = directory of ht://Dig config files. +# DB_BASE = base directory for ht://Dig / multidig databases +BASH= /bin/bash +BASEDIR= /opt/htdig +BINDIR= $(BASEDIR)/bin +CONFIG_DIR= $(BASEDIR)/conf +DB_BASE= $(BASEDIR)/db + + +# +# You shouldn't need to change any of this... +# +SCRIPTS= add-collect add-urls multidig \ + new-collect new-db gen-collect +CONF= db.conf multidig.conf + +all: + +clean: + rm -f *~ + +install: + @echo "Installing scripts..." + @for i in $(SCRIPTS); do \ + sed -e s%@BASH@%$(BASH)% \ + -e s%@CONFIG_DIR@%$(CONFIG_DIR)% $$i >$(BINDIR)/$$i; \ + chmod a+x $(BINDIR)/$$i; \ + echo $(BINDIR)/$$i; \ + done && test -z "$$fail" + @echo + @echo "Installing config files..." + @echo + @for i in $(CONF); do \ + sed -e s%@BASH@%$(BASH)% -e s%@BASEDIR@%$(BASEDIR)% \ + -e s%@BINDIR@%$(BINDIR)% -e s%@CONFIG_DIR@%$(CONFIG_DIR)% \ + -e s%@DB_BASE@%$(DB_BASE)% $$i >$(CONFIG_DIR)/$$i; \ + echo $(CONFIG_DIR)/$$i; \ + done && test -z "$$fail" + @echo + @echo "Done with installation." + @echo diff --git a/debian/htdig/htdig-3.2.0b6/contrib/multidig/README b/debian/htdig/htdig-3.2.0b6/contrib/multidig/README new file mode 100644 index 00000000..f394e5e5 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/multidig/README @@ -0,0 +1,133 @@ +README for multidig 1.1 + by Geoff Hutchison <[email protected]> + + Copyright (c) 1998-1999 The ht://Dig Group <http://www.htdig.org/> + Distributed under the terms of the GNU General Public License (GPL) + version 2 or later. 
+--------------------------------
+
+This document is part of the "multidig script system" a system of
+shell scripts and some modified conf files that makes dealing with
+multiple databases easier for ht://Dig. It assumes that you know what
+ht://Dig is. If you don't know, see the website at
+<http://www.htdig.org/>
+
+This README is a bit rough around the edges. I don't know what people
+really want or need to know about the scripts. I expect a lot of
+questions. Hey, maybe I'm wrong. I'm always open to suggestions,
+criticisms, corrections, etc. E-mail me at <[email protected]>
+
+--------------------------------
+
+INTRODUCTION:
+
+* Why write multidig?
+
+ There are many reasons I started the multidig system. The biggest
+were the complaints that ht://Dig didn't have much of an
+administration interface. If you're looking for one, multidig isn't
+it. Yet. The next biggest is that people wanted me to make dealing
+with multiple databases easier. If you're looking for this, you're in
+the right place.
+
+* Why should I bother with multidig?
+
+ If you already have a multiple-database setup and it's working
+smoothly, you probably don't want to bother. It was written the way
+*I* would organize a multiple-database setup. Not surprisingly, it
+might be more pain to convert to multidig than it's worth.
+ If you're planning a multiple-database setup or you have one and
+it's not working well, this will help. It hides most of the pain and
+suffering behind some shell scripts and generally automates life. :-)
+
+--------------------------------
+
+SETTING UP:
+
+* How do I install it?
+
+ It's pretty easy to install. It requires bash, or at least a
+Bourne-shell that supports the "source" builtin. Obviously, it also
+requires ht://Dig. :-)
+ Change any paths in the Makefile. Do a "make install" to install the
+scripts in the right place and the config files in the right
+place. The Makefile edits the scripts for you so the paths are consistent.
+
+* Now that it's in, how does it work?
+
+ The multidig script will replace the rundig script that comes with
+ht://Dig. Use it through a cron job or some other means of automating
+updates. It will run through all the db that multidig knows about, run
+htdig, htmerge, move the databases around, etc. As written it tries to
+index with the least disk space in the least time. Thus it keeps only
+the minimum files and does "update" digs.
+ After indexing all the db, it merges all the collections, trying to
+do the same thing, fastest speed, smallest disk and RAM
+requirements. It spits out a short status to STDOUT and a more
+complete report to the file referenced with the $REPORT option in
+multidig.conf. Adding a "-v" to the command-line makes everything more
+verbose.
+
+* Can I convert my previous multiple-db setup?
+
+ Yes. I'm assuming you have a config file for each database you've
+set up. In that case, put the databases into a directory with the same
+name as the .conf file and tack the name onto the db.list file in your
+config directory. This is multidig's list of all databases, so adding
+a line here will ensure it's indexed using multidig.
+
+* How do I add new URLs to databases or add new databases?
+
+ 1) New URLs: Run 'add-urls <db>' and either paste in URLs or
+ redirect a file or program.
+ 2) New DB: Run 'new-db <db>' to set up everything for that database.
+
+--------------------------------
+
+COLLECTIONS:
+
+* What's a collection?
+
+ Version 3.1.0 of ht://Dig added support for merging multiple
+databases together. Technically, you merge one database into
+another. Multidig makes this a bit easier. You set up a "collection"
+of other databases and the multidig script will merge them all
+together.
+
+* Fantastic! How do I define a collection?
+
+./new-collect <name>
+./add-collect <name>
+<insert dbs here>
+
+ The add-collect script will go through the list of dbs and make sure
+the multidig system actually knows about them.
If not, it complains. + +* Can I just generate the collections from my databases? + + Yup, run gen-collect. This is what the main multidig script runs. + +-------------------------------- + +DIRECTORY LAYOUT: + +Here are the locations of files used by multidig: + + $BASEDIR/bin + add-collect script for adding db to a collection + add-urls script for adding URLs to a db + gen-collect script for generating all collections + from their db (called by multidig) + multidig script for generating all db and collections + new-collect script for making a new collection + new-db script for making a new db + $BASEDIR/conf + db.conf template database config + used by new-collect and new-db + foo.conf database config for db foo + multidig.conf config for multidig paths and options + db.list list of all db, one per line + collect.list list of all collections, one per line + $BASEDIR/db + foo/foo.urls URLs used by foo db + foo/db.* actual foo databases diff --git a/debian/htdig/htdig-3.2.0b6/contrib/multidig/add-collect b/debian/htdig/htdig-3.2.0b6/contrib/multidig/add-collect new file mode 100644 index 00000000..d169ed84 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/multidig/add-collect @@ -0,0 +1,49 @@ +#!@BASH@ + +# +# add-collect 1.1 +# +# Copyright (c) 1998-1999 The ht://Dig Group +# Distributed under the terms of the GNU General Public License (GPL) +# version 2 or later. +# for the ht://Dig search system http://www.htdig.org/ +# and the multidig script system http://www.htdig.org/contrib/scripts/ +# +# syntax: +# add-collect <collection> +# +# Reads new DB in from the standard input (either redirect or paste) +# Ensures the DB actually exist before adding them to the collection +# + +# You may need to set the following: +MULTIDIG_CONF=@CONFIG_DIR@/multidig.conf +source $MULTIDIG_CONF + +# Catch people who don't supply an argument +if [ "$1" = "" ]; then + echo Syntax: add-colect \<collection\> + exit +fi + +# Do we actually have a collection named as specified? 
+TEST=`grep $1 $COLLECT_LIST` +if [ "$TEST" = "" ]; then + # This may become annoying. If so, comment it out! + echo The collection $1 does not exist. Sorry. + echo The existing collections are: + cat $COLLECT_LIST +else + # OK, now we have to make sure these are legal db + for db in `cat /dev/stdin`; do + DBTEST=`grep $db $DB_LIST` + if [ "$DBTEST" != "" ]; then + echo $db >>$DB_BASE/$1/$1.collect + else + # This may become annoying. If so, comment it out! + echo The database $db does not exist. Sorry. + echo The existing databases are: + cat $DB_LIST + fi + done +fi diff --git a/debian/htdig/htdig-3.2.0b6/contrib/multidig/add-urls b/debian/htdig/htdig-3.2.0b6/contrib/multidig/add-urls new file mode 100644 index 00000000..15866e23 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/multidig/add-urls @@ -0,0 +1,37 @@ +#!@BASH@ + +# +# add-urls 1.1 +# +# Copyright (c) 1998-1999 The ht://Dig Group +# Distributed under the terms of the GNU General Public License (GPL) +# version 2 or later. +# for the ht://Dig search system http://www.htdig.org/ +# and the multidig script system http://www.htdig.org/contrib/scripts/ +# +# syntax: +# add-urls <db> +# +# Reads new URLs in from the standard input (either redirect or paste) +# + +# You may need to set the following: +MULTIDIG_CONF=@CONFIG_DIR@/multidig.conf +source $MULTIDIG_CONF + +# Catch people who don't supply an argument +if [ "$1" = "" ]; then + echo Syntax: add-urls \<db\> + exit +fi + +# Do we actually have a database named as specified? +TEST=`grep $1 $DB_LIST` +if [ "$TEST" = "" ]; then + # This may become annoying. If so, comment it out! + echo The database $1 does not exist. Sorry. 
+ echo The existing databases are: + cat $DB_LIST +else + cat /dev/stdin >>$DB_BASE/$1/$1.urls +fi diff --git a/debian/htdig/htdig-3.2.0b6/contrib/multidig/db.conf b/debian/htdig/htdig-3.2.0b6/contrib/multidig/db.conf new file mode 100644 index 00000000..edacd723 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/multidig/db.conf @@ -0,0 +1,26 @@ +# +# db.conf file for the multidig system +# (copied for each database used) +# +# Copyright (c) 1998-1999 The ht://Dig Group +# Distributed under the terms of the GNU General Public License (GPL) +# version 2 or later. +# for the ht://Dig search system http://www.htdig.org/ +# and the multidig script system http://www.htdig.org/contrib/scripts/ +# + +# Change this if you use a different global config file +# Put most of your configuration options in this file +# the db.conf files only define the URL list used and the directory for +# storing the databases +include: ${config_dir}/htdig.conf + +# Changed for each database. Places the databases in separate directories +# for convenience and organization +database_dir: @DB_BASE@/@DATABASE@ + +# Each database has a separate list of starting URLs +# This makes it easier to index a variety of categories +start_url: `${database_dir}/@[email protected]` + +# Any database-specific config options should go here... diff --git a/debian/htdig/htdig-3.2.0b6/contrib/multidig/gen-collect b/debian/htdig/htdig-3.2.0b6/contrib/multidig/gen-collect new file mode 100644 index 00000000..f75e08ad --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/multidig/gen-collect @@ -0,0 +1,99 @@ +#!@BASH@ + +# +# gen-collect 1.1 +# +# Copyright (c) 1998-1999 The ht://Dig Group +# Distributed under the terms of the GNU General Public License (GPL) +# version 2 or later. 
+# for the ht://Dig search system http://www.htdig.org/ +# and the multidig script system http://www.htdig.org/contrib/scripts/ +# +# Part of the "multidig script system" +# a system of shell scripts and some modified conf files +# that makes dealing with multiple databases easier for ht://Dig +# +# Syntax: +# gen-collect [-v] +# +# Merges multiple databases into ``collected'' db +# (This is done by multidig too, but this script lets you *just* +# generate the collections.) +# + +# This is useful for debugging info +if [ "$1" = "-v" ]; then + verbose=-v +fi + +# You may need to set the following: +MULTIDIG_CONF=@CONFIG_DIR@/multidig.conf +source $MULTIDIG_CONF + +# We may be called inside multidig, so we don't want to mess with the report. +for collect in `cat $COLLECT_LIST`; do + # What's the conf file for this database? + CONF=$CONFIG_DIR/$collect.conf + echo Generating $collect at: `date` + + # We want to replace the old .work files with the first database + # This ensures that we *only* get documents from the merged db + # and not old ones left around in our previous collected db + firstdb=`head -n 1 $DB_BASE/$collect/$collect.collect` + cp $DB_BASE/$firstdb/db.docdb $DB_BASE/$collect/db.docdb.work + cp $DB_BASE/$firstdb/db.docs.index $DB_BASE/$collect/db.docs.index.work + cp $DB_BASE/$firstdb/db.wordlist.work $DB_BASE/$collect/db.wordlist.work + cp $DB_BASE/$firstdb/db.words.db $DB_BASE/$collect/db.words.db.work + # Now we need to work out the number of remaining db in the collection + LENGTH=`wc -l $DB_BASE/$collect/$collect.collect | awk '{print $1;}'` + let NUM=LENGTH-1 + + for db in `tail -n $NUM $DB_BASE/$collect/$collect.collect`; do + if [ "$1" = "-v" ]; then + echo Merging db $db of collect $collect + fi + MERGE_CONF=$CONFIG_DIR/$db.conf + # There's a slight bug in the merge function. + # It's looking for db.wordlist, not .work. 
So lets copy it temporarily + cp $DB_BASE/$db/db.wordlist.work $DB_BASE/$db/db.wordlist + # Do the merging, using -d and -w to prevent normal merging + # (it would be a waste of time, we'd repeat it multiple times) + $BINDIR/htmerge $verbose -s -d -w -m $MERGE_CONF -a -c $CONF >>$REPORT + # And now remove the copy + rm $DB_BASE/$db/db.wordlist + done + + # Now after merging in all of those databases + # we need to do the usual htmerge run + $BINDIR/htmerge -a $verbose -s -c $CONF >>$REPORT + + if [ "$1" = "-v" ]; then + echo Moving files $collect at: `date` + fi + # If you don't have the space for backups, this step can be omitted + if [ $BACKUPS = "true" ]; then + cp $DB_BASE/$collect/db.docdb $DB_BASE/$collect/db.docdb.bak + cp $DB_BASE/$collect/db.docs.index $DB_BASE/$collect/db.docs.index.bak + # cp $DB_BASE/$collect/db.wordlist $DB_BASE/$collect/db.wordlist.bak + cp $DB_BASE/$collect/db.words.db $DB_BASE/$collect/db.words.db.bak + fi + + # Move them because we don't want .work files around + # (Remember, we're generating using merging, + # so we want to make sure we don't have old stuff to gum up the works... + mv $DB_BASE/$collect/db.docdb.work $DB_BASE/$collect/db.docdb + mv $DB_BASE/$collect/db.docs.index.work $DB_BASE/$collect/db.docs.index + # mv $DB_BASE/$collect/db.wordlist.work $DB_BASE/$collect/db.wordlist + mv $DB_BASE/$collect/db.words.db.work $DB_BASE/$collect/db.words.db + + # Make them world readable! + chmod 644 $DB_BASE/$collect/db.docdb + chmod 644 $DB_BASE/$collect/db.docs.index + # chmod 644 $DB_BASE/$collect/db.wordlist + chmod 644 $DB_BASE/$collect/db.words.db + if [ "$1" = "-v" ]; then + echo Done with $collect at: `date` + fi +done + +# That's it! 
diff --git a/debian/htdig/htdig-3.2.0b6/contrib/multidig/multidig b/debian/htdig/htdig-3.2.0b6/contrib/multidig/multidig new file mode 100644 index 00000000..0b59136a --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/multidig/multidig @@ -0,0 +1,93 @@ +#!@BASH@ + +# +# multidig 1.1 +# +# Copyright (c) 1998-1999 The ht://Dig Group +# Distributed under the terms of the GNU General Public License (GPL) +# version 2 or later. +# for the ht://Dig search system http://www.htdig.org/ +# and the multidig script system http://www.htdig.org/contrib/scripts/ +# +# Part of the "multidig script system" +# a system of shell scripts and some modified conf files +# that makes dealing with multiple databases easier for ht://Dig +# +# Syntax: +# multidig [-v] +# +# Performs all the digging, merging and so on needed +# for indexing and updating multiple db +# Merges multiple databases into ``collected'' db +# + +# This is useful for debugging info +if [ "$1" = "-v" ]; then + verbose=-v +fi + +# You may need to set the following: +MULTIDIG_CONF=@CONFIG_DIR@/multidig.conf +source $MULTIDIG_CONF + +# Start indexing. +rm $REPORT +for db in `cat $DB_LIST`; do + echo Digging $db at: `date` + # What's the conf file for this database? 
+ CONF=$CONFIG_DIR/$db.conf + if [ "$1" = "-v" ]; then + echo " Indexing $db at: `date`" + fi + $BINDIR/htdig -a $verbose -s -c $CONF >>$REPORT + if [ "$1" = "-v" ]; then + echo " Merging $db at: `date`" + fi + $BINDIR/htmerge -a $verbose -s -c $CONF >>$REPORT + + if [ "$1" = "-v" ]; then + echo " Moving files $db at: `date`" + fi + # If you don't have the space for backups, this step can be omitted + if [ $BACKUPS = "true" ]; then + cp $DB_BASE/$db/db.docdb $DB_BASE/$db/db.docdb.bak + cp $DB_BASE/$db/db.docs.index $DB_BASE/$db/db.docs.index.bak + # cp $DB_BASE/$db/db.wordlist $DB_BASE/$db/db.wordlist.bak + cp $DB_BASE/$db/db.words.db $DB_BASE/$db/db.words.db.bak + fi + + # Copy the db.docdb file, the .work file is needed for update digs + cp $DB_BASE/$db/db.docdb.work $DB_BASE/$db/db.docdb + # We don't do anything with the db.wordlist file because the + # .work file is needed for update digs and the non-work file isn't needed + # cp $DB_BASE/$db/db.wordlist.work $DB_BASE/$db/db.wordlist + # These .work files are never used, so let's just keep the active copy + mv $DB_BASE/$db/db.docs.index.work $DB_BASE/$db/db.docs.index + mv $DB_BASE/$db/db.words.db.work $DB_BASE/$db/db.words.db + + # Make them world readable! + chmod 644 $DB_BASE/$db/db.docdb + chmod 644 $DB_BASE/$db/db.docdb.work + chmod 644 $DB_BASE/$db/db.docs.index + # chmod 644 $DB_BASE/$db/db.wordlist + chmod 644 $DB_BASE/$db/db.words.db + if [ "$1" = "-v" ]; then + echo " Done with $db at: `date`" + fi +done +# Now generate the collections by merging their component databases +# We do this in our gen-collect script, so we won't do that here. +$BINDIR/gen-collect $1 + +if [ "$1" = "-v" ]; then + echo + fgrep "htdig:" $REPORT + echo + fgrep "htmerge:" $REPORT + echo + echo Total lines in $REPORT: `wc -l $REPORT` +fi + +# You probably don't need to do this since the script will remove it next +# time it's run. 
But you can do it anyway +# rm $REPORT diff --git a/debian/htdig/htdig-3.2.0b6/contrib/multidig/multidig.conf b/debian/htdig/htdig-3.2.0b6/contrib/multidig/multidig.conf new file mode 100644 index 00000000..32164977 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/multidig/multidig.conf @@ -0,0 +1,32 @@ +#!@BASH@ +# +# multidig config 1.1 +# +# Copyright (c) 1998-1999 The ht://Dig Group +# Distributed under the terms of the GNU General Public License (GPL) +# version 2 or later. +# for the ht://Dig search system http://www.htdig.org/ +# and the multidig script system http://www.htdig.org/contrib/scripts/ +# + +# You may wish to set some of these: +# BASEDIR = base directory for ht://Dig installation +# BINDIR = directory with ht://Dig binaries (i.e. htdig, htmerge) +# DB_BASE = base directory for ht://Dig DB +# (i.e. each DB gets its own directory off of this) +# CONFIG_DIR = directory with ht://Dig config files +# DB_LIST = file with list of databases +# COLLECT_LIST = file with list of "collections" databases merged from others +# DB_CONF = file copied by new-db and new-collect for .conf files +# REPORT = temporary file used to generate a report for the dig +# TMPDIR = a directory with lots of temporary space for the merging +export BASEDIR=@BASEDIR@ +export BINDIR=@BINDIR@ +export DB_BASE=@DB_BASE@ +export CONFIG_DIR=@CONFIG_DIR@ +export DB_LIST=$CONFIG_DIR/db.list +export COLLECT_LIST=$CONFIG_DIR/collect.list +export DB_CONF=$CONFIG_DIR/db.conf +export REPORT=$BASEDIR/multidig.report +export TMPDIR=$DB_BASE +export BACKUPS=true diff --git a/debian/htdig/htdig-3.2.0b6/contrib/multidig/new-collect b/debian/htdig/htdig-3.2.0b6/contrib/multidig/new-collect new file mode 100644 index 00000000..6647d447 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/multidig/new-collect @@ -0,0 +1,39 @@ +#!@BASH@ + +# +# new-collect 1.1 +# +# Copyright (c) 1998-2000 The ht://Dig Group +# Distributed under the terms of the GNU General Public License (GPL) +# version 2 or 
later. +# for the ht://Dig search system http://www.htdig.org/ +# and the multidig script system http://www.htdig.org/contrib/scripts/ +# +# syntax: +# new-collect <collection> +# +# Creates a new database directory and conf file with given name +# Updates the global collect.list file +# + +# You may need to set the following: +MULTIDIG_CONF=@CONFIG_DIR@/multidig.conf +source $MULTIDIG_CONF + +# Catch people who don't supply an argument +if [ "$1" = "" ]; then + echo Syntax: new-collect \<collection\> + exit +fi + +# Add the new collection to the collect.list file +echo ${1:?You need to specify a collection} >>$COLLECT_LIST + +# Now make the appropriate database directory +mkdir $DB_BASE/$1 + +# And make a copy of the default (db.conf) conf file for the DB +# Use sed to replace @DATABASE@ with the name of the database +sed -e s%@DATABASE@%$1% $DB_CONF >$CONFIG_DIR/$1.conf +# And make a blank file for the ${start_urls} directive +touch $DB_BASE/$1/$1.collect diff --git a/debian/htdig/htdig-3.2.0b6/contrib/multidig/new-db b/debian/htdig/htdig-3.2.0b6/contrib/multidig/new-db new file mode 100644 index 00000000..1c4948f7 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/multidig/new-db @@ -0,0 +1,39 @@ +#!@BASH@ + +# +# new-db 1.1 +# +# Copyright (c) 1998-1999 The ht://Dig Group +# Distributed under the terms of the GNU General Public License (GPL) +# version 2 or later. 
+# for the ht://Dig search system http://www.htdig.org/ +# and the multidig script system http://www.htdig.org/contrib/scripts/ +# +# syntax: +# new-db <db> +# +# Creates a new database directory and conf file with given name +# Updates the global db.list file +# + +# You may need to set the following: +MULTIDIG_CONF=@CONFIG_DIR@/multidig.conf +source $MULTIDIG_CONF + +# Catch people who don't supply an argument +if [ "$1" = "" ]; then + echo Syntax: new-db \<db\> + exit +fi + +# Add the new database to the db.list file +echo ${1:?You need to specify a database} >>$DB_LIST + +# Now make the appropriate database directory +mkdir $DB_BASE/$1 + +# And make a copy of the default (db.conf) conf file for the DB +# Use sed to replace @DATABASE@ with the name of the database +sed -e s%@DATABASE@%$1% $DB_CONF >$CONFIG_DIR/$1.conf +# And make a blank file for the ${start_urls} directive +touch $DB_BASE/$1/$1.urls diff --git a/debian/htdig/htdig-3.2.0b6/contrib/parse_doc.pl b/debian/htdig/htdig-3.2.0b6/contrib/parse_doc.pl new file mode 100755 index 00000000..63b775db --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/parse_doc.pl @@ -0,0 +1,238 @@ +#!/usr/local/bin/perl + +# 1998/12/10 +# Added: push @allwords, $fields[$x]; <[email protected]> +# Replaced: matching patterns. they match words starting or ending with ()[]'`;:?.,! now, not when in between! +# Gone: the variable $line is gone (using $_ now) +# +# 1998/12/11 +# Added: catdoc test (is catdoc runnable?) <[email protected]> +# Changed: push line semi-colomn wrong. 
<[email protected]> +# Changed: matching works for end of lines now <[email protected]> +# Added: option to rigorously delete all punctuation <[email protected]> +# +# 1999/02/09 +# Added: option to delete all hyphens <[email protected]> +# Added: uses ps2ascii to handle PS files <[email protected]> +# 1999/02/15 +# Added: check for some file formats <[email protected]> +# 1999/02/25 +# Added: uses pdftotext to handle PDF files <[email protected]> +# Changed: generates a head record with punct. <[email protected]> +# 1999/03/01 +# Added: extra checks for file "wrappers" <[email protected]> +# & check for MS Word signature (no longer defaults to catdoc) +# 1999/03/05 +# Changed: rejoin hyphenated words across lines <[email protected]> +# (in PDFs) & remove multiple punct. chars. between words (all) +# 1999/03/10 +# Changed: fix handling of minimum word length <[email protected]> +# 1999/08/12 +# Changed: adapted for xpdf 0.90 release <[email protected]> +# Added: uses pdfinfo to handle PDF titles <[email protected]> +# Changed: keep hyphens by default, as htdig <[email protected]> +# does, but change dashes to hyphens +# 1999/09/09 +# Changed: fix to handle empty PDF title right <[email protected]> +# 2000/01/12 +# Changed: "break" to "last" (no break in Perl) <[email protected]> +# Changed: code for parsing a line into a list of +# words, to use "split", other streamlining. 
+# 2001/07/12 +# Changed: fix "last" handling in dehyphenation <[email protected]> +# Added: handle %xx codes in title from URL <[email protected]> +# 2003/06/07 +# Changed: allow file names with spaces <[email protected]> +######################################### +# +# set this to your MS Word to text converter +# get it from: http://www.fe.msk.ru/~vitus/catdoc/ +# +$CATDOC = "/usr/local/bin/catdoc"; +# +# set this to your WordPerfect to text converter, or /bin/true if none available +# this nabs WP documents with .doc suffix, so catdoc doesn't see them +# +$CATWP = "/bin/true"; +# +# set this to your RTF to text converter, or /bin/true if none available +# this nabs RTF documents with .doc suffix, so catdoc doesn't see them +# +$CATRTF = "/bin/true"; +# +# set this to your PostScript to text converter +# get it from the ghostscript 3.33 (or later) package +# +$CATPS = "/usr/bin/ps2ascii"; +# +# set this to your PDF to text converter, and pdfinfo tool +# get it from the xpdf 0.90 package at http://www.foolabs.com/xpdf/ +# +$CATPDF = "/usr/bin/pdftotext"; +$PDFINFO = "/usr/bin/pdfinfo"; +#$CATPDF = "/usr/local/bin/pdftotext"; +#$PDFINFO = "/usr/local/bin/pdfinfo"; + +# need some var's +$minimum_word_length = 3; +$head = ""; +@allwords = (); +@temp = (); +$x = 0; +#@fields = (); +$calc = 0; +$dehyphenate = 0; +$title = ""; +# +# okay. my programming style isn't that nice, but it works... + +#for ($x=0; $x<@ARGV; $x++) { # print out the args +# print STDERR "$ARGV[$x]\n"; +#} + +# Read first bytes of file to check for file type (like file(1) does) +open(FILE, "< $ARGV[0]") || die "Oops. Can't open file $ARGV[0]: $!\n"; +read FILE,$magic,8; +close FILE; + +if ($magic =~ /^\0\n/) { # possible MacBinary header + open(FILE, "< $ARGV[0]") || die "Oops. Can't open file $ARGV[0]: $!\n"; + read FILE,$magic,136; # let's hope parsers can handle them! 
+ close FILE; +} + +if ($magic =~ /%!|^\033%-12345/) { # it's PostScript (or HP print job) + $parser = $CATPS; # gs 3.33 leaves _temp_.??? files in . + $parsecmd = "(cd /tmp; $parser; rm -f _temp_.???) < \"$ARGV[0]\" |"; +# keep quiet even if PS gives errors... +# $parsecmd = "(cd /tmp; $parser; rm -f _temp_.???) < \"$ARGV[0]\" 2>/dev/null |"; + $type = "PostScript"; + $dehyphenate = 0; # ps2ascii already does this + if ($magic =~ /^\033%-12345/) { # HP print job + open(FILE, "< $ARGV[0]") || die "Oops. Can't open file $ARGV[0]: $!\n"; + read FILE,$magic,256; + close FILE; + exit unless $magic =~ /^\033%-12345X\@PJL.*\n*.*\n*.*ENTER\s*LANGUAGE\s*=\s*POSTSCRIPT.*\n*.*\n*.*\n%!/ + } +} elsif ($magic =~ /%PDF-/) { # it's PDF (Acrobat) + $parser = $CATPDF; + $parsecmd = "$parser -raw \"$ARGV[0]\" - |"; +# to handle single-column, strangely laid out PDFs, use coalescing feature... +# $parsecmd = "$parser \"$ARGV[0]\" - |"; + $type = "PDF"; + $dehyphenate = 1; # PDFs often have hyphenated lines + if (open(INFO, "$PDFINFO \"$ARGV[0]\" 2>/dev/null |")) { + while (<INFO>) { + if (/^Title:/) { + $title = $_; + $title =~ s/^Title:\s+//; + $title =~ s/\s+$//; + $title =~ s/\s+/ /g; + $title =~ s/&/\&\;/g; + $title =~ s/</\<\;/g; + $title =~ s/>/\>\;/g; + last; + } + } + close INFO; + } +} elsif ($magic =~ /WPC/) { # it's WordPerfect + $parser = $CATWP; + $parsecmd = "$parser \"$ARGV[0]\" |"; + $type = "WordPerfect"; + $dehyphenate = 0; # WP documents not likely hyphenated +} elsif ($magic =~ /^{\\rtf/) { # it's Richtext + $parser = $CATRTF; + $parsecmd = "$parser \"$ARGV[0]\" |"; + $type = "RTF"; + $dehyphenate = 0; # RTF documents not likely hyphenated +} elsif ($magic =~ /\320\317\021\340/) { # it's MS Word + $parser = $CATDOC; + $parsecmd = "$parser -a -w \"$ARGV[0]\" |"; + $type = "Word"; + $dehyphenate = 0; # Word documents not likely hyphenated +} else { + die "Can't determine type of file $ARGV[0]; content-type: $ARGV[1]; URL: $ARGV[2]\n"; +} +# print STDERR "$ARGV[0]: 
$type $parsecmd\n"; +die "Hmm. $parser is absent or unwilling to execute.\n" unless -x $parser; + + +# open it +open(CAT, "$parsecmd") || die "Hmmm. $parser doesn't want to be opened using pipe.\n"; +while (<CAT>) { + while (/[A-Za-z\300-\377]-\s*$/ && $dehyphenate) { + $_ .= <CAT>; + last if eof; + s/([A-Za-z\300-\377])-\s*\n\s*([A-Za-z\300-\377])/$1$2/s + } + $head .= " " . $_; +# s/\s+[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]+|[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]+\s+|^[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]+|[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]+$/ /g; # replace reading-chars with space (only at end or begin of word, but allow multiple characters) +## s/\s[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]|[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]\s|^[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]|[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]$/ /g; # replace reading-chars with space (only at end or begin of word) +## s/[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]/ /g; # rigorously replace all by <[email protected]> +## s/[\-\255]/ /g; # replace hyphens with space +# s/[\255]/-/g; # replace dashes with hyphens +# @fields = split; # split up line +# next if (@fields == 0); # skip if no fields (does it speed up?) +# for ($x=0; $x<@fields; $x++) { # check each field if string length >= 3 +# if (length($fields[$x]) >= $minimum_word_length) { +# push @allwords, $fields[$x]; # add to list +# } +# } + + # Delete valid punctuation. These are the default values + # for valid_punctuation, and should be changed other values + # are specified in the config file. 
+ tr{-\255._/!#$%^&'}{}d; + push @allwords, grep { length >= $minimum_word_length } split /\W+/; +} + +close CAT; + +exit unless @allwords > 0; # nothing to output + +############################################# +# print out the title, if it's set, and not just a file name +if ($title !~ /^$/ && $title !~ /^[A-G]:[^\s]+\.[Pp][Dd][Ff]$/) { + print "t\t$title\n"; +} else { # otherwise generate a title + @temp = split(/\//, $ARGV[2]); # get the filename, get rid of basename + $temp[-1] =~ s/%([A-F0-9][A-F0-9])/pack("C", hex($1))/gie; + print "t\t$type Document $temp[-1]\n"; # print it +} + + +############################################# +# print out the head +$head =~ s/^\s+//; # remove leading and trailing space +$head =~ s/\s+$//; +$head =~ s/\s+/ /g; +$head =~ s/&/\&\;/g; +$head =~ s/</\<\;/g; +$head =~ s/>/\>\;/g; +print "h\t$head\n"; +#$calc = @allwords; +#print "h\t"; +##if ($calc >100) { # but not more than 100 words +## $calc = 100; +##} +#for ($x=0; $x<$calc; $x++) { # print out the words for the exerpt +# print "$allwords[$x] "; +#} +#print "\n"; + + +############################################# +# now the words +#for ($x=0; $x<@allwords; $x++) { +# $calc=int(1000*$x/@allwords); # calculate rel. position (0-1000) +# print "w\t$allwords[$x]\t$calc\t0\n"; # print out word, rel. pos. and text type (0) +#} +$x = 0; +for ( @allwords ) { + # print out word, rel. pos. and text type (0) + printf "w\t%s\t%d\t0\n", $_, 1000*$x++/@allwords; +} + +$calc=@allwords; +# print STDERR "# of words indexed: $calc\n"; diff --git a/debian/htdig/htdig-3.2.0b6/contrib/rtf2html/COPYING b/debian/htdig/htdig-3.2.0b6/contrib/rtf2html/COPYING new file mode 100644 index 00000000..d60c31a9 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/rtf2html/COPYING @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. 
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. 
+ + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. 
Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/debian/htdig/htdig-3.2.0b6/contrib/rtf2html/Makefile b/debian/htdig/htdig-3.2.0b6/contrib/rtf2html/Makefile new file mode 100644 index 00000000..5409f487 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/rtf2html/Makefile @@ -0,0 +1,11 @@ +CC= gcc +CFLAGS= -O2 -Wall + +rtf2html: rtf2html.c + $(CC) $(CFLAGS) -o rtf2html rtf2html.c + +install: rtf2html + cp rtf2html /usr/local/bin + +clean: + rm -f rtf2html diff --git a/debian/htdig/htdig-3.2.0b6/contrib/rtf2html/README b/debian/htdig/htdig-3.2.0b6/contrib/rtf2html/README new file mode 100644 index 00000000..9f3084d4 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/rtf2html/README @@ -0,0 +1,16 @@ +rtf2html - a RTF to HTML conversion program + +This version of rtf2html has been developed by +David Lippi <[email protected]> and Gabriele Bartolini +<[email protected]>, based on an earlier work +by Chuck Shotton <[email protected]> +(see http://www.w3.org/Tools/HTMLGeneration/rtf2html.html) +and Dmitry Potapov <[email protected]>. 
+ +This version can handle character set recognition at run-time: +currently, the ANSI Windows 1252 code and the Macintosh's are +supported. + +For copyright details, see the file COPYING in your distribution +or the GNU General Public License (GPL) version 2 or later +<http://www.gnu.org/copyleft/gpl.html> diff --git a/debian/htdig/htdig-3.2.0b6/contrib/rtf2html/charset1252.h b/debian/htdig/htdig-3.2.0b6/contrib/rtf2html/charset1252.h new file mode 100644 index 00000000..d2b40ba0 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/rtf2html/charset1252.h @@ -0,0 +1,257 @@ +unsigned char* charset1252[256] = { + "", /* 1 - 1 */ + "", /* 2 - 2 */ + "", /* 3 - 3 */ + "", /* 4 - 4 */ + "", /* 5 - 5 */ + "", /* 6 - 6 */ + "", /* 7 - 7 */ + "", /* 8 - 8 */ + "\t", /* 9 - 9 */ + "\n", /* 10 - a */ + "", /* 11 - b */ + "", /* 12 - c */ + "\r", /* 13 - d */ + "", /* 14 - e */ + "", /* 15 - f */ + "", /* 16 - 10 */ + "", /* 17 - 11 */ + "", /* 18 - 12 */ + "", /* 19 - 13 */ + "", /* 20 - 14 */ + "", /* 21 - 15 */ + "", /* 22 - 16 */ + "", /* 23 - 17 */ + "", /* 24 - 18 */ + "", /* 25 - 19 */ + "", /* 26 - 1a */ + "", /* 27 - 1b */ + "", /* 28 - 1c */ + "", /* 29 - 1d */ + "", /* 30 - 1e */ + "", /* 31 - 1f */ + " ", /* 32 - 20 */ + "!", /* 33 - 21 */ + "\"", /* 34 - 22 */ + "#", /* 35 - 23 */ + "$", /* 36 - 24 */ + "%", /* 37 - 25 */ + "&", /* 38 - 26 */ + "'", /* 39 - 27 */ + "(", /* 40 - 28 */ + ")", /* 41 - 29 */ + "*", /* 42 - 2a */ + "+", /* 43 - 2b */ + ",", /* 44 - 2c */ + "-", /* 45 - 2d */ + ".", /* 46 - 2e */ + "/", /* 47 - 2f */ + "0", /* 48 - 30 */ + "1", /* 49 - 31 */ + "2", /* 50 - 32 */ + "3", /* 51 - 33 */ + "4", /* 52 - 34 */ + "5", /* 53 - 35 */ + "6", /* 54 - 36 */ + "7", /* 55 - 37 */ + "8", /* 56 - 38 */ + "9", /* 57 - 39 */ + ":", /* 58 - 3a */ + ";", /* 59 - 3b */ + "<", /* 60 - 3c */ + "=", /* 61 - 3d */ + ">", /* 62 - 3e */ + "?", /* 63 - 3f */ + "@", /* 64 - 40 */ + "A", /* 65 - 41 */ + "B", /* 66 - 42 */ + "C", /* 67 - 43 */ + "D", /* 68 - 44 */ 
+ "E", /* 69 - 45 */ + "F", /* 70 - 46 */ + "G", /* 71 - 47 */ + "H", /* 72 - 48 */ + "I", /* 73 - 49 */ + "J", /* 74 - 4a */ + "K", /* 75 - 4b */ + "L", /* 76 - 4c */ + "M", /* 77 - 4d */ + "N", /* 78 - 4e */ + "O", /* 79 - 4f */ + "P", /* 80 - 50 */ + "Q", /* 81 - 51 */ + "R", /* 82 - 52 */ + "S", /* 83 - 53 */ + "T", /* 84 - 54 */ + "U", /* 85 - 55 */ + "V", /* 86 - 56 */ + "W", /* 87 - 57 */ + "X", /* 88 - 58 */ + "Y", /* 89 - 59 */ + "Z", /* 90 - 5a */ + "[", /* 91 - 5b */ + "\\", /* 92 - 5c */ + "]", /* 93 - 5d */ + "^", /* 94 - 5e */ + "_", /* 95 - 5f */ + "`", /* 96 - 60 */ + "a", /* 97 - 61 */ + "b", /* 98 - 62 */ + "c", /* 99 - 63 */ + "d", /* 100 - 64 */ + "e", /* 101 - 65 */ + "f", /* 102 - 66 */ + "g", /* 103 - 67 */ + "h", /* 104 - 68 */ + "i", /* 105 - 69 */ + "j", /* 106 - 6a */ + "k", /* 107 - 6b */ + "l", /* 108 - 6c */ + "m", /* 109 - 6d */ + "n", /* 110 - 6e */ + "o", /* 111 - 6f */ + "p", /* 112 - 70 */ + "q", /* 113 - 71 */ + "r", /* 114 - 72 */ + "s", /* 115 - 73 */ + "t", /* 116 - 74 */ + "u", /* 117 - 75 */ + "v", /* 118 - 76 */ + "w", /* 119 - 77 */ + "x", /* 120 - 78 */ + "y", /* 121 - 79 */ + "z", /* 122 - 7a */ + "{", /* 123 - 7b */ + "¦", /* 124 - 7c */ + "}", /* 125 - 7d */ + "~", /* 126 - 7e */ + " ", /* 127 - 7f */ + "€", /* 128 - 80 */ + " ", /* 129 - 81 */ + "‚", /* 130 - 82 */ + "ƒ", /* 131 - 83 */ + "„", /* 132 - 84 */ + "…", /* 133 - 85 */ + "†", /* 134 - 86 */ + "‡", /* 135 - 87 */ + "ˆ", /* 136 - 88 */ + "‰", /* 137 - 89 */ + "Š", /* 138 - 8a */ + "‹", /* 139 - 8b */ + "Œ", /* 140 - 8c */ + " ", /* 141 - 8d */ + "Ž", /* 142 - 8e */ + " ", /* 143 - 8f */ + " ", /* 144 - 90 */ + "‘", /* 145 - 91 */ + "’", /* 146 - 92 */ + "“", /* 147 - 93 */ + "”", /* 148 - 94 */ + "•", /* 149 - 95 */ + " ", /* 150 - 96 */ + " ", /* 151 - 97 */ + "˜", /* 152 - 98 */ + "™", /* 153 - 99 */ + "š", /* 154 - 9a */ + "›", /* 155 - 9b */ + "œ", /* 156 - 9c */ + " ", /* 157 - 9d */ + "ž", /* 158 - 9e */ + "Ÿ", /* 159 - 9f */ + " ", /* 160 - a0 */ + 
"¡", /* 161 - a1 */ + "¢", /* 162 - a2 */ + "£", /* 163 - a3 */ + "¤", /* 164 - a4 */ + "¥", /* 165 - a5 */ + "¦", /* 166 - a6 */ + "§", /* 167 - a7 */ + "¨", /* 168 - a8 */ + "©", /* 169 - a9 */ + "ª", /* 170 - aa */ + "«", /* 171 - ab */ + "¬", /* 172 - ac */ + "­", /* 173 - ad */ + "®", /* 174 - ae */ + "¯", /* 175 - af */ + "°", /* 176 - b0 */ + "±", /* 177 - b1 */ + "²", /* 178 - b2 */ + "³", /* 179 - b3 */ + "´", /* 180 - b4 */ + "µ", /* 181 - b5 */ + "¶", /* 182 - b6 */ + "·", /* 183 - b7 */ + "ç", /* 184 - b8 */ + "¹", /* 185 - b9 */ + "º", /* 186 - ba */ + "»", /* 187 - bb */ + "¼", /* 188 - bc */ + "½", /* 189 - bd */ + "¾", /* 190 - be */ + "¿", /* 191 - bf */ + "À", /* 192 - c0 */ + "Á", /* 193 - c1 */ + "Â", /* 194 - c2 */ + "Ã", /* 195 - c3 */ + "Ä", /* 196 - c4 */ + "Å", /* 197 - c5 */ + "Æ", /* 198 - c6 */ + "Ç", /* 199 - c7 */ + "È", /* 200 - c8 */ + "É", /* 201 - c9 */ + "Ê", /* 202 - ca */ + "Ë", /* 203 - cb */ + "Ì", /* 204 - cc */ + "Í", /* 205 - cd */ + "Î", /* 206 - ce */ + "Ï", /* 207 - cf */ + "Ð", /* 208 - d0 */ + "Ñ", /* 209 - d1 */ + "Ò", /* 210 - d2 */ + "Ó", /* 211 - d3 */ + "&Oring;", /* 212 - d4 */ + "Õ", /* 213 - d5 */ + "Ö", /* 214 - d6 */ + "×", /* 215 - d7 */ + "Ø", /* 216 - d8 */ + "Ù", /* 217 - d9 */ + "Ú", /* 218 - da */ + "Û", /* 219 - db */ + "Ü", /* 220 - dc */ + "Ý", /* 221 - dd */ + "Þ", /* 222 - de */ + "ß", /* 223 - df */ + "à", /* 224 - e0 */ + "á", /* 225 - e1 */ + "â", /* 226 - e2 */ + "ã", /* 227 - e3 */ + "ä", /* 228 - e4 */ + "å", /* 229 - e5 */ + "æ", /* 230 - e6 */ + "ç", /* 231 - e7 */ + "è", /* 232 - e8 */ + "é", /* 233 - e9 */ + "ê", /* 234 - ea */ + "ë", /* 235 - eb */ + "ì", /* 236 - ec */ + "í", /* 237 - ed */ + "î", /* 238 - ee */ + "ï", /* 239 - ef */ + "ð", /* 240 - f0 */ + "ñ", /* 241 - f1 */ + "ò", /* 242 - f2 */ + "ó", /* 243 - f3 */ + "ô", /* 244 - f4 */ + "õ", /* 245 - f5 */ + "ö", /* 246 - f6 */ + "÷", /* 247 - f7 */ + "ø", /* 248 - f8 */ + "ù", /* 249 - f9 */ + "ú", /* 250 - fa */ + "û", /* 251 - 
fb */ + "ü", /* 252 - fc */ + "ý", /* 253 - fd */ + "þ", /* 254 - fe */ + "ÿ" /* 255 - ff */ +}; diff --git a/debian/htdig/htdig-3.2.0b6/contrib/rtf2html/charsetmac.h b/debian/htdig/htdig-3.2.0b6/contrib/rtf2html/charsetmac.h new file mode 100644 index 00000000..8c4aeca0 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/rtf2html/charsetmac.h @@ -0,0 +1,257 @@ +unsigned char* mac[256] = { + "", /* 1 - 1 */ + "", /* 2 - 2 */ + "", /* 3 - 3 */ + "", /* 4 - 4 */ + "", /* 5 - 5 */ + "", /* 6 - 6 */ + "", /* 7 - 7 */ + "", /* 8 - 8 */ + "\t", /* 9 - 9 */ + "\n", /* 10 - a */ + "", /* 11 - b */ + "", /* 12 - c */ + "\r", /* 13 - d */ + "", /* 14 - e */ + "", /* 15 - f */ + "", /* 16 - 10 */ + "", /* 17 - 11 */ + "", /* 18 - 12 */ + "", /* 19 - 13 */ + "", /* 20 - 14 */ + "", /* 21 - 15 */ + "", /* 22 - 16 */ + "", /* 23 - 17 */ + "", /* 24 - 18 */ + "", /* 25 - 19 */ + "", /* 26 - 1a */ + "", /* 27 - 1b */ + "", /* 28 - 1c */ + "", /* 29 - 1d */ + "", /* 30 - 1e */ + "", /* 31 - 1f */ + " ", /* 32 - 20 */ + "!", /* 33 - 21 */ + "\"", /* 34 - 22 */ + "#", /* 35 - 23 */ + "$", /* 36 - 24 */ + "%", /* 37 - 25 */ + "&", /* 38 - 26 */ + "'", /* 39 - 27 */ + "(", /* 40 - 28 */ + ")", /* 41 - 29 */ + "*", /* 42 - 2a */ + "+", /* 43 - 2b */ + ",", /* 44 - 2c */ + "-", /* 45 - 2d */ + ".", /* 46 - 2e */ + "/", /* 47 - 2f */ + "0", /* 48 - 30 */ + "1", /* 49 - 31 */ + "2", /* 50 - 32 */ + "3", /* 51 - 33 */ + "4", /* 52 - 34 */ + "5", /* 53 - 35 */ + "6", /* 54 - 36 */ + "7", /* 55 - 37 */ + "8", /* 56 - 38 */ + "9", /* 57 - 39 */ + ":", /* 58 - 3a */ + ";", /* 59 - 3b */ + "<", /* 60 - 3c */ + "=", /* 61 - 3d */ + ">", /* 62 - 3e */ + "?", /* 63 - 3f */ + "@", /* 64 - 40 */ + "A", /* 65 - 41 */ + "B", /* 66 - 42 */ + "C", /* 67 - 43 */ + "D", /* 68 - 44 */ + "E", /* 69 - 45 */ + "F", /* 70 - 46 */ + "G", /* 71 - 47 */ + "H", /* 72 - 48 */ + "I", /* 73 - 49 */ + "J", /* 74 - 4a */ + "K", /* 75 - 4b */ + "L", /* 76 - 4c */ + "M", /* 77 - 4d */ + "N", /* 78 - 4e */ + "O", /* 79 
- 4f */ + "P", /* 80 - 50 */ + "Q", /* 81 - 51 */ + "R", /* 82 - 52 */ + "S", /* 83 - 53 */ + "T", /* 84 - 54 */ + "U", /* 85 - 55 */ + "V", /* 86 - 56 */ + "W", /* 87 - 57 */ + "X", /* 88 - 58 */ + "Y", /* 89 - 59 */ + "Z", /* 90 - 5a */ + "[", /* 91 - 5b */ + "\\", /* 92 - 5c */ + "]", /* 93 - 5d */ + "^", /* 94 - 5e */ + "_", /* 95 - 5f */ + "`", /* 96 - 60 */ + "a", /* 97 - 61 */ + "b", /* 98 - 62 */ + "c", /* 99 - 63 */ + "d", /* 100 - 64 */ + "e", /* 101 - 65 */ + "f", /* 102 - 66 */ + "g", /* 103 - 67 */ + "h", /* 104 - 68 */ + "i", /* 105 - 69 */ + "j", /* 106 - 6a */ + "k", /* 107 - 6b */ + "l", /* 108 - 6c */ + "m", /* 109 - 6d */ + "n", /* 110 - 6e */ + "o", /* 111 - 6f */ + "p", /* 112 - 70 */ + "q", /* 113 - 71 */ + "r", /* 114 - 72 */ + "s", /* 115 - 73 */ + "t", /* 116 - 74 */ + "u", /* 117 - 75 */ + "v", /* 118 - 76 */ + "w", /* 119 - 77 */ + "x", /* 120 - 78 */ + "y", /* 121 - 79 */ + "z", /* 122 - 7a */ + "{", /* 123 - 7b */ + "¦", /* 124 - 7c */ + "}", /* 125 - 7d */ + "~", /* 126 - 7e */ + " ", /* 127 - 7f */ + "€", /* 128 - 80 */ + "Å", /* 129 - 81 */ + "‚", /* 130 - 82 */ + "ƒ", /* 131 - 83 */ + "„", /* 132 - 84 */ + "…", /* 133 - 85 */ + "†", /* 134 - 86 */ + "‡", /* 135 - 87 */ + "á", /* 136 - 88 */ + "‰", /* 137 - 89 */ + "Š", /* 138 - 8a */ + "‹", /* 139 - 8b */ + "Œ", /* 140 - 8c */ + "ç", /* 141 - 8d */ + "é", /* 142 - 8e */ + "è ", /* 143 - 8f */ + "ê", /* 144 - 90 */ + "‘", /* 145 - 91 */ + "’", /* 146 - 92 */ + "ì", /* 147 - 93 */ + "\"", /* 148 - 94 */ + "•", /* 149 - 95 */ + " ", /* 150 - 96 */ + " ", /* 151 - 97 */ + "˜", /* 152 - 98 */ + "™", /* 153 - 99 */ + "š", /* 154 - 9a */ + "›", /* 155 - 9b */ + "œ", /* 156 - 9c */ + "ù", /* 157 - 9d */ + "ž", /* 158 - 9e */ + "Ÿ", /* 159 - 9f */ + " ", /* 160 - a0 */ + "º", /* 161 - a1 */ + "¢", /* 162 - a2 */ + "£", /* 163 - a3 */ + "§", /* 164 - a4 */ + "¥", /* 165 - a5 */ + "¦", /* 166 - a6 */ + "§", /* 167 - a7 */ + "¨", /* 168 - a8 */ + "©", /* 169 - a9 */ + "ª", /* 170 - aa */ + "«", 
/* 171 - ab */ + "¬", /* 172 - ac */ + "­", /* 173 - ad */ + "®", /* 174 - ae */ + "¯", /* 175 - af */ + "°", /* 176 - b0 */ + "±", /* 177 - b1 */ + "²", /* 178 - b2 */ + "³", /* 179 - b3 */ + "´", /* 180 - b4 */ + "µ", /* 181 - b5 */ + "¶", /* 182 - b6 */ + "·", /* 183 - b7 */ + "ç", /* 184 - b8 */ + "¹", /* 185 - b9 */ + "º", /* 186 - ba */ + "»", /* 187 - bb */ + "¼", /* 188 - bc */ + "½", /* 189 - bd */ + "¾", /* 190 - be */ + "¿", /* 191 - bf */ + "À", /* 192 - c0 */ + "Á", /* 193 - c1 */ + "Â", /* 194 - c2 */ + "Ã", /* 195 - c3 */ + "Ä", /* 196 - c4 */ + "Å", /* 197 - c5 */ + "Æ", /* 198 - c6 */ + "Ç", /* 199 - c7 */ + "È", /* 200 - c8 */ + "É", /* 201 - c9 */ + "Ê", /* 202 - ca */ + "À", /* 203 - cb */ + "Ì", /* 204 - cc */ + "Í", /* 205 - cd */ + "Î", /* 206 - ce */ + "Ï", /* 207 - cf */ + "Ð", /* 208 - d0 */ + "Ñ", /* 209 - d1 */ + "\"", /* 210 - d2 */ + "\"", /* 211 - d3 */ + "&Oring;", /* 212 - d4 */ + "Õ", /* 213 - d5 */ + "Ö", /* 214 - d6 */ + "×", /* 215 - d7 */ + "Ø", /* 216 - d8 */ + "Ù", /* 217 - d9 */ + "Ú", /* 218 - da */ + "Û", /* 219 - db */ + "Ü", /* 220 - dc */ + "Ý", /* 221 - dd */ + "Þ", /* 222 - de */ + "ß", /* 223 - df */ + "à", /* 224 - e0 */ + "á", /* 225 - e1 */ + "â", /* 226 - e2 */ + "ã", /* 227 - e3 */ + "ä", /* 228 - e4 */ + "å", /* 229 - e5 */ + "æ", /* 230 - e6 */ + "ç", /* 231 - e7 */ + "è", /* 232 - e8 */ + "é", /* 233 - e9 */ + "ê", /* 234 - ea */ + "ë", /* 235 - eb */ + "ì", /* 236 - ec */ + "í", /* 237 - ed */ + "î", /* 238 - ee */ + "ï", /* 239 - ef */ + "ð", /* 240 - f0 */ + "ñ", /* 241 - f1 */ + "ò", /* 242 - f2 */ + "ó", /* 243 - f3 */ + "ô", /* 244 - f4 */ + "õ", /* 245 - f5 */ + "ö", /* 246 - f6 */ + "÷", /* 247 - f7 */ + "ø", /* 248 - f8 */ + "ù", /* 249 - f9 */ + "ú", /* 250 - fa */ + "û", /* 251 - fb */ + "ü", /* 252 - fc */ + "ý", /* 253 - fd */ + "þ", /* 254 - fe */ + "ÿ" /* 255 - ff */ +}; diff --git a/debian/htdig/htdig-3.2.0b6/contrib/rtf2html/rtf2html.c b/debian/htdig/htdig-3.2.0b6/contrib/rtf2html/rtf2html.c 
new file mode 100644 index 00000000..d49140d4 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/rtf2html/rtf2html.c @@ -0,0 +1,910 @@ +/* RTF2HTML.c, Chuck Shotton - 6/21/93 */ +/************************************************************************ + * This program takes a stab at converting RTF (Rich Text Format) files + * into HTML. There are some limitations that keep RTF from being able to + * easily represent things like in-line images and anchors as styles. In + * particular, RTF styles apply to entire "paragraphs", so anchors or + * images in the middle of a text stream can't easily be represented by + * styles. The intent is to ultimately use something like embedded text + * color changes to represent these constructs. + * + * In the meantime, you can take existing Word documents, apply the + * correct style sheet, and convert them to HTML with this tool. + * + * AUTHOR: Chuck Shotton, UT-Houston Academic Computing, + * + * Dmitry Potapov, CapitalSoft + * + * David Lippi, Comune di Prato, Italy + * + * Gabriele Bartolini, Comune di Prato, Italy + * + * USAGE: rtf2html [rtf_filename] + * + * BEHAVIOR: + * rtf2html will open the specified RTF input file or read from + * standard input, writing converted HTML to standard output. + * + * NOTES: + * The RTF document must be formatted with a style sheet that has + * style numberings that conform to the style_mappings table + * defined in this source file. Characters are converted according + * to the ANSI Windows 1252 code or Macintosh. + * + * MODIFICATIONS: + * 6/21/93 : Chuck Shotton - created version 1.0. 
+ * 11/26/98 : Dmitry Potapov - version 1.1 beta + * 05/07/04 : David Lippi, Gabriele Bartolini - version 1.2 + * + * Copyright (C) 2004 Comune di Prato + * + * For copyright details, see the file COPYING in your distribution + * or the GNU General Public License (GPL) version 2 or later + * <http://www.gnu.org/copyleft/gpl.html> + * + ************************************************************************/ + +/* Note, the source is formatted with 4 character tabs */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <ctype.h> +#include "charset1252.h" +#include "charsetmac.h" + +#ifdef _MSC_VER +# define strcasecmp _stricmp +#endif + +#ifndef TRUE +#define TRUE -1 +#define FALSE 0 +#endif + +#define MAX_LEVELS 40 /*defines the # of nested in-line styles (pairs of {})*/ +#define MAX_RTF_TOKEN 40 + +#define MAX_INLINE_STYLES 5 /*defines # of in-line styles, bold, italic, etc.*/ + +typedef struct tag_StyleState +{ + unsigned char s: MAX_INLINE_STYLES; +} TStyleState; + +typedef enum { s_plain, s_bold, s_italic, s_underline, s_hidden, /*in-line styles*/ + s_para, s_br, /*pseudo style*/ + s_h0, s_h1, s_h2, s_h3, s_h4, s_h5, s_h6 /*heading styles*/ +} StyleState; + +char *styles[][2] = { /*HTML Start and end tags for styles*/ + {"", ""}, + {"<strong>", "</strong>"}, + {"<em>", "</em>"}, + {"", ""}, + {"<!-- ", " -->"}, + {"\n", "\n"}, /* {"\n<p>", "</p>\n"}, */ + {"<br />\n",""}, + {"", ""}, + {"<h1>", "</h1>"}, + {"<h2>", "</h2>"}, + {"<h3>", "</h3>"}, + {"<h4>", "</h4>"}, + {"<h5>", "</h5>"}, + {"<h6>", "</h6>"} +}; + +/* style_mappings maps the style numbers in a RTF style sheet into one of the*/ +/* (currently) six paragraph-oriented HTML styles (i.e. heading 1 through 6.)*/ +/* Additional styles for lists, etc. should be added here. 
Style info */ +/* ultimately should be read from some sort of config file into these tables.*/ + +#define MAX_NAME_LEN 40 +char style_name[MAX_NAME_LEN]; + +#define STYLE_NUMBER 7 +char *style_namings[STYLE_NUMBER] = { + "", "heading 1", "heading 2", "heading 3", "heading 4", "heading 5", + "heading 6" +}; +char style_mappings[STYLE_NUMBER][MAX_RTF_TOKEN]; +char style_number[MAX_RTF_TOKEN]; + +/* RTF tokens that mean something to the parser. All others are ignored. */ + +typedef enum { + t_start, + t_fonttbl, t_colortbl, t_stylesheet, t_info, t_s, t_b, t_ul, t_ulw, + t_uld, t_uldb, t_i, t_v, t_plain, t_par, t_pict, t_tab, t_bullet, + t_cell, t_row, t_line, t_endash, t_emdash, t_rquote, + t_end +} TokenIndex; + +char *tokens[] = { + "###", + "fonttbl", "colortbl", "stylesheet", "info", "s", "b", "ul", "ulw", + "uld", "uldb", "i", "v", "plain", "par", "pict", "tab", "bullet", + "cell", "row", "line", "endash", "emdash", "rquote", + "###" +}; + +TStyleState style_state[MAX_LEVELS], curr_style; +short curr_heading; + +void (*RTF_DoControl)(FILE*,char*,char*); +char isBody; +char* title; +//FILE* f; + +short level, /*current {} nesting level*/ + skip_to_level,/*{} level to which parsing should skip (used to skip */ + /* font tables, style sheets, color tables, etc.) 
*/ + gobble, /*Flag set to indicate all input should be discarded */ + ignore_styles;/*Set to ignore inline style expansions after style use*/ + +/* Charset */ +unsigned char** charset_table; + +#define CHARSET_DEFAULT 0 // Index of the default charset to use +#define CHARSET_NUMBER 2 // Number of charset used +#define CHARSET_MAX_LENGTH 20 // Max numbero of char in the charset +// metadata used in rtf standard for the charset definition +unsigned char *charset[CHARSET_NUMBER] = { + "ansi", + "mac" +}; +// variable with the charset definition +unsigned char **charset_variablename[CHARSET_NUMBER] = { + charset1252, + mac +}; + +/**************************************/ + +int openfile (char * filename, FILE ** f) +{ + int rv = 1; + + if (filename) + { + if (!(*f = fopen (filename, "r"))) + { + fprintf (stderr, "\nError: Input file %s not found.\n", filename); + rv = 0; + } + else + { + title = filename; + } + } + else + { + *f = stdin; + title="STDIN"; + } + return rv; +} + +/**************************************/ + +int closefile (FILE * f) +{ + return fclose (f); +} + +/**************************************/ + +char RTF_GetChar( FILE* f ) +{ + char ch; + do + { + ch = fgetc( f ); + } while ((ch=='\r')||(ch=='\n')); + return ch; +} + +/**************************************/ + +char RTF_UnGetChar(FILE* f, char ch) +{ + return ungetc(ch, f); +} + +/**************************************/ + +void RTF_PutStr(char* s) +{ + if (gobble) return; + fputs(s, stdout); +} + +/**************************************/ + +void RTF_PutHeader() +{ + RTF_PutStr("<head>\n<title>"); + RTF_PutStr(title); + RTF_PutStr("</title>\n"); + RTF_PutStr("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">\n"); + RTF_PutStr("</head>\n"); +} + +/**************************************/ + +void RTF_PutChar(char ch) +{ + if (gobble) return; + if (!isBody) + { + RTF_PutHeader(); + RTF_PutStr("<body>\n"); + isBody=TRUE; + } + switch (ch) { + case '<': + RTF_PutStr("<"); + 
break; + + case '>': + RTF_PutStr(">"); + break; + + case '&': + RTF_PutStr("&"); + break; + + default: + fputc(ch, stdout); + } +} + +/**************************************/ + +void RTF_PlainStyle (TStyleState* s) +{ + int i; + for(i=0;i<MAX_INLINE_STYLES;i++) + { + if(s->s & (1<<i)) + RTF_PutStr(styles[i][1]); + } + s->s=0; +} + +/**************************************/ + +void RTF_SetStyle(TStyleState* s, StyleState style) +{ + if( (!ignore_styles||(style==s_hidden)) && ((s->s&(1<<style))==0) ) + { + RTF_PutStr(styles[style][0]); + s->s|=(1<<style); + } +} + +/**************************************/ + +void RTF_PushState(short* level) +{ + if(*level>=MAX_LEVELS) + { + fprintf(stderr,"Exceed maximum level\n"); + exit(-1); + } + style_state[*level]=curr_style; + (*level)++; +} + +/**************************************/ + +void RTF_PopState(short* level) +{ + int j; + TStyleState new_style; + + if(*level<1) + { + fprintf(stderr,"RTF parse error: unexpected '}'\n"); + exit(-1); + } + new_style = style_state[*level-1]; + /*close off any in-line styles*/ + for (j=0;j<MAX_INLINE_STYLES;j++) + { + if ( ((curr_style.s & (1<<j))!=0) && ((new_style.s & (1<<j))==0) ) + { + curr_style.s &= ~(1<<j); + RTF_PutStr(styles[j][1]); + } + } + + for (j=0;j<MAX_INLINE_STYLES;j++) + { + if( ((curr_style.s & (1<<j))==0) && ((new_style.s & (1<<j))!=0) ) + RTF_PutStr(styles[j][0]); + } + (*level)--; + curr_style = new_style; + + if (*level == skip_to_level) { + skip_to_level = -1; + gobble = FALSE; + } +} + +/**************************************/ +/* Map a style number into a HTML heading */ + +short RTF_MapStyle(char* s) +{ + int i; + for (i=0;i<7;i++) + if (!strcmp(style_mappings[i], s)) + return (i); + return (0); +} + +/**************************************/ + +void RTF_AddStyleMap(char* name, char* number) +{ + int i, len; + len=strlen(name); + if( name[len-1]==';') name[--len]=0; + for(i=0;i<STYLE_NUMBER;i++) + { + if(!strcasecmp(name,style_namings[i])) + { + 
strcpy(style_mappings[i],number); + return; + } + } +} + +/**************************************/ + +void RTF_BuildName(char* token, char* ch, unsigned is_string) +{ + int len; + char *p; + len = strlen(token); + if(len>=MAX_NAME_LEN-1) + return; + if (is_string) + { + for (p = ch; p && *p; ++p) + { + token[len]=*p; + ++len; + } + } + else + { + token[len] = *ch; + ++len; + } + token[len]='\0'; +} + + +/**************************************/ + +void RTF_ClearName(char* token) +{ + token[0]=0; +} + +/**************************************/ + +TokenIndex GetTokenIndex(char* control) +{ + TokenIndex i; + + for (i=t_start; i<t_end; i++) + { + if(control[0]==tokens[i][0]) /* Added for fast compare */ + { + if (!strcmp(control, tokens[i])) + { + break; + } + } + } + return i; +} + +/**************************************/ + +void RTF_DoStyleControl (FILE* f, char* control, char* arg) +{ + if(GetTokenIndex(control)==t_s) + { + strcpy(style_number,arg); + } +} + +/**************************************/ + +int chartoi(char ch) +{ + if((ch>='0')&&(ch<='9')) + return ch-'0'; + if((ch>='A')&&(ch<='Z')) + return ch-'A'+10; + if((ch>='a')&&(ch<='z')) + return ch-'a'+10; + return -1; +} + +/**************************************/ + +void RTF_BuildArg (FILE * f, char ch, char* arg) +{ + int i=0; + + if(feof(f)) + { + arg[0]=0; + return; + } + if(ch=='-') + { + arg[i++]='-'; + ch = RTF_GetChar( f ); + if(feof(f)) + { + arg[0]=0; + return; + } + } + for(;isdigit(ch);i++) + { + arg[i]=ch; + if(i>=MAX_RTF_TOKEN-1) + { + arg[MAX_RTF_TOKEN-1]=0; + while(isdigit(ch)) { + ch = RTF_GetChar( f ); + if(feof(f)) + return; + } + break; + } + ch = RTF_GetChar( f ); + if(feof(f)) + { + arg[i+1]=0; + return; + } + } + arg[i]=0; + if(!isspace(ch)) + { + RTF_UnGetChar(f, ch); + } +} + +/**************************************/ + +void RTF_BuildToken (FILE* f, char ch) +{ + int i; + + for(i=1;;i++) + { + char token[MAX_RTF_TOKEN], arg[MAX_RTF_TOKEN]; + token[i-1]=ch; + if(i>=MAX_RTF_TOKEN-1) + { + 
do { + ch = RTF_GetChar( f ); + if(feof(f)) + return; + } while (isalpha(ch)); + RTF_BuildArg(f, ch,arg); + return; + } + ch = RTF_GetChar( f ); + if(feof(f)) + { + token[i]=0; + RTF_DoControl(f,token,""); + return; + } + if( !isalpha(ch) ) + { + token[i]=0; + RTF_BuildArg(f, ch,arg); + RTF_DoControl(f,token,arg); + return; + } + } +} + +/**************************************/ + +void RTF_backslash(FILE* f, char** pch, char* pf) +{ + int ch; + *pf=FALSE; + ch = RTF_GetChar( f ); + if(feof(f)) + { + fprintf(stderr,"Unexpected end of file\n"); + return; + } + switch (ch) + { + case '\\': + *pch=charset_table[92]; *pf=TRUE; + break; + case '{': + *pch=charset_table[123]; *pf=TRUE; + break; + case '}': + *pch=charset_table[125]; *pf=TRUE; + break; + case '*': + gobble = TRUE; /*perform no output, ignore commands 'til level-1*/ + if(skip_to_level>level-1||skip_to_level==-1) + skip_to_level = level-1; + break; + case '\'': + { + char ch1, ch2; + ch1 = RTF_GetChar( f ); + ch2 = RTF_GetChar( f ); + if(!feof(f)) + { + if(isxdigit(ch1)&&isxdigit(ch2)) + { + ch = chartoi(ch1)*16+chartoi(ch2); + *pch = charset_table[ch-1]; *pf=TRUE; + } else { + fprintf(stderr,"RTF Error: unexpected '%c%c' after \\\'\n",ch1,ch2); + } + } + break; + } + default: + if (isalpha(ch)) + { + RTF_BuildToken(f, ch); + } else { + fprintf(stderr, "\nRTF Error: unexpected '%c' after \\.\n", ch); + } + break; + } +} + +/**************************************/ + +void RTF_ParseStyle(FILE * f) +{ + char ch, pf; + char *code; + int level0; + void (*PrevDoControl)(FILE*,char*,char*); + + level0=level; + PrevDoControl=RTF_DoControl; + RTF_DoControl=RTF_DoStyleControl; + + RTF_ClearName(style_name); + style_number[0]=0; + while (1) + { + ch = RTF_GetChar( f ); + if(feof(f)) + break; + switch (ch) + { + case '\\': + RTF_backslash(f, &code, &pf); + if(pf) + { + RTF_BuildName(style_name, code, 1); + } else { + RTF_ClearName(style_name); + } + break; + + case '{': + level++; + RTF_ClearName(style_name); + break; + 
+ case '}': + if(level0+1==level) + { + if(style_number[0]!=0) + { + RTF_AddStyleMap(style_name,style_number); + style_number[0]=0; + } + } else if(level0==level) { + RTF_DoControl=PrevDoControl; + RTF_UnGetChar(f, ch); + return; + } + level--; + RTF_ClearName(style_name); + break; + + default: + RTF_BuildName(style_name, &ch, 0); + break; + } + } /* while */ +} + +/**************************************/ +/* Perform actions for RTF control words */ + +void RTF_DoBodyControl (FILE * f, char* control,char* arg) +{ + short style; + + if (gobble) return; + + switch (GetTokenIndex(control)) + { + case t_stylesheet: + gobble = TRUE; /*perform no output, ignore commands 'til level-1*/ + skip_to_level = level-1; + RTF_ParseStyle( f ); + break; + case t_fonttbl: /*skip all of these and their contents!*/ + case t_colortbl: + case t_info: + gobble = TRUE; /*perform no output, ignore commands 'til level-1*/ + skip_to_level = level-1; + break; + case t_pict: + gobble = TRUE; /*perform no output, ignore commands 'til level-1*/ + if(skip_to_level>=level || skip_to_level==-1) + skip_to_level = level-1; + break; + + + case t_s: /*Style*/ + if (!curr_heading) + { + style = RTF_MapStyle (arg); + if(style) + { + curr_heading = s_h0 + style; + RTF_PutStr(styles[curr_heading][0]); + ignore_styles = TRUE; + } + } + break; + + case t_b: /*Bold*/ + RTF_SetStyle(&curr_style,s_bold); + break; + + case t_ulw: + case t_uld: + case t_uldb: + case t_ul: /*Underline, maps to "emphasis" HTML style*/ + RTF_SetStyle(&curr_style,s_underline); + break; + + case t_i: /*Italic*/ + RTF_SetStyle(&curr_style,s_italic); + break; + + case t_v: /* Hidden*/ + RTF_SetStyle(&curr_style,s_hidden); + break; + + case t_par: /*Paragraph*/ + if (curr_heading!=s_plain) { + RTF_PutStr(styles[curr_heading][1]); + curr_heading = s_plain; + } else { + RTF_PutStr(styles[s_para][0]); + } + ignore_styles = FALSE; + break; + + case t_plain: /*reset inline styles*/ + RTF_PlainStyle(&curr_style); + break; + case t_cell: + case 
t_tab: + RTF_PutChar(' '); + break; + case t_endash: + case t_emdash: + RTF_PutChar('-'); + break; + case t_line: + case t_row: + RTF_PutStr(styles[s_br][0]); + break; + case t_bullet: + RTF_PutChar('\xb7'); + break; + case t_start: + case t_end: + break; + case t_rquote: + //RTF_PutStr("’"); + RTF_PutStr("'"); + break; + } + +} + +/**************************************/ +/* RTF_Parse is a crude, ugly state machine that understands enough of */ +/* the RTF syntax to be dangerous. */ + +void RTF_ParseBody( FILE* f ) +{ + char ch, pf; + char* code; + + RTF_DoControl=RTF_DoBodyControl; + level = 0; + skip_to_level = -1; + gobble = FALSE; + ignore_styles = FALSE; + + while (1) + { + ch = RTF_GetChar( f ); + if (feof(f)) + { + break; + } + switch (ch) + { + case '\\': + RTF_backslash(f, &code,&pf); + if(pf && code) + RTF_PutStr(code); + break; + + case '{': + RTF_PushState(&level); + break; + + case '}': + RTF_PopState(&level); + break; + + default: + RTF_PutChar(ch); + break; + } + }/*while*/ +} + +/**************************************/ + +int RTF_Parse (FILE* f) +{ + RTF_PutStr("<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML//EN\">\n<html>\n"); + + isBody=FALSE; + + RTF_ParseBody(f); + + if (isBody) RTF_PutStr("</body>\n"); + + RTF_PutStr("</html>\n"); + + return 0; +} + +/**************************************/ + +void Initialize() +{ + int i; + + for (i=0;i<MAX_LEVELS;i++) + style_state[i].s=s_plain; + + curr_style.s=s_plain; + curr_heading = s_plain; + + // Set default styles maping + style_mappings[0][0]=0; + for(i=1;i<=6;i++) + sprintf(style_mappings[i],"%d",256-i); +} + +/**************************************/ + +int RTF_FindCharset(FILE * f) +{ + char ch; + char code[CHARSET_MAX_LENGTH]; + int metadata = 0; + int i = 0; + + while ( !feof(f) ) + { + ch = RTF_GetChar( f ); + if ( ch == '\\' ) + { + metadata++; + } + if ( metadata == 2 ) // the second metadata is the charset used + { + if ( ch != '\\' ) + { + code[i] = ch; + i++; + } + } + if ( metadata > 2 ) + { + 
code[i] = '\0'; + break; + } + } + + + for ( i = 0; i < CHARSET_NUMBER ; i++) + { + if ( strcmp( (const char *)charset[i], (const char *) code ) == 0 ) + { + charset_table = charset_variablename[i]; + break; + }; + } + if ( i == CHARSET_NUMBER ) + { + charset_table = charset_variablename[CHARSET_DEFAULT]; + } + + return 1; // always true! +} + +/**************************************/ + +int main(int argc,char** argv) +{ + int rv = 0; + FILE *f = NULL; + + Initialize(); + + if ( argc > 1) + { + if( strcmp(argv[1],"--help")==0 || strcmp(argv[1],"-H")==0 ) + { + printf("Use: %s [rtf_filename]\n",argv[0]); + rv = 0; + } else if ( strcmp(argv[1],"--version")==0 || strcmp(argv[1],"-V")==0 ) { + printf("rtf2html version 1.2\n"); + rv = 0; + } + else + { + rv = openfile(argv[1], &f); + if ( rv ) rv = RTF_FindCharset(f); + if ( rv ) + { + rewind(f); + rv = RTF_Parse(f); + } + if ( rv ) rv = closefile(f); + } + } + else + { + printf("Use: %s [rtf_filename]\n",argv[0]); + } + return rv; +} diff --git a/debian/htdig/htdig-3.2.0b6/contrib/run-robot.sh b/debian/htdig/htdig-3.2.0b6/contrib/run-robot.sh new file mode 100644 index 00000000..a5884f2c --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/run-robot.sh @@ -0,0 +1,23 @@ +#!/bin/sh + +CfgFile=/www/search.sbs.de/test/conf/htfig.conf +BinDir=/www/search.sbs.de/test/bin +CgiBinDir=/www/search.sbs.de/test/cgi-bin +DataDir=/www/search.sbs.de/data/robot +Date=`date +%y%m%d` + +date > $DataDir/$Date-runtime +$BinDir/htdig -v -t -s -c $CfgFile >> $DataDir/$Date-robot +$BinDir/htmerge -v -c $CfgFile >> $DataDir/$Date-robot +date >> $DataDir/$Date-runtime + +$BinDir/whatsnew.pl -v > $DataDir/$Date-whatsnew +sort $BinDir/urls | uniq > $DataDir/$Date-urls + +rm -f $DataDir/current-* +ln -s $DataDir/$Date-runtime $DataDir/current-runtime +ln -s $DataDir/$Date-robot $DataDir/current-robot +ln -s $DataDir/$Date-urls $DataDir/current-urls + +$BinDir/status.pl -v > $DataDir/$Date-status + diff --git 
a/debian/htdig/htdig-3.2.0b6/contrib/scriptname/README b/debian/htdig/htdig-3.2.0b6/contrib/scriptname/README new file mode 100644 index 00000000..9b94ec5d --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/scriptname/README @@ -0,0 +1,16 @@ +This is a small example to demonstrate the script_name attribute. + + +Assuming that these files are located within your server's "search" +directory, just add the following line to your htdig configuration +file: + +script_name: /search/results.shtml + +You may also have to override the standard template files, using the +search_results_header, search_results_footer, syntax_error_file and +nothing_found_file attributes. + + +(c) 1999, Hanno Mueller, http://www.hanno.de + diff --git a/debian/htdig/htdig-3.2.0b6/contrib/scriptname/results.shtml b/debian/htdig/htdig-3.2.0b6/contrib/scriptname/results.shtml new file mode 100644 index 00000000..86e09563 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/scriptname/results.shtml @@ -0,0 +1,17 @@ +<!-- + -- script_name example using SSI + -- This is the results page. + -- Note the server side include directive calling /cgi-bin/htsearch. + -- The page's parameters will be handed over to htsearch. 
+ --> + +<html><head><title>Search results (SHTML)</title></head> +<body bgcolor="#eef7ff"> +<h2><img src="/htdig/htdig.gif"> + +<!--#exec cgi="/cgi-bin/htsearch" --> + +<hr noshade size=4> +<a href="http://www.htdig.org"> +<img src="/htdig/htdig.gif" border=0>ht://Dig</a> +</body></html> diff --git a/debian/htdig/htdig-3.2.0b6/contrib/scriptname/search.html b/debian/htdig/htdig-3.2.0b6/contrib/scriptname/search.html new file mode 100644 index 00000000..b6f80e97 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/scriptname/search.html @@ -0,0 +1,53 @@ +<!-- + -- script_name example using SSI + -- This is the standard search page (no dynamic stuff), + -- with two minor differences: The form uses the "get" + -- method and the "action" sends the form input to the + -- the dynamic results page. + --> + +<html> +<head> +<title>ht://Dig WWW Search</title> +</head> +<body bgcolor="#eef7ff"> +<h1> +<a href="http://www.htdig.org"><IMG SRC="/htdig/htdig.gif" align=bottom alt="ht://Dig" border=0></a> +WWW Site Search</H1> +<hr noshade size=4> +This search will allow you to search the contents of +all the publicly available WWW documents at this site. 
+<br> +<p> +<form method="get" action="/search/results.shtml"> +<font size=-1> +Match: <select name=method> +<option value=and>All +<option value=or>Any +<option value=boolean>Boolean +</select> +Format: <select name=format> +<option value=builtin-long>Long +<option value=builtin-short>Short +</select> +Sort by: <select name=sort> +<option value=score>Score +<option value=time>Time +<option value=title>Title +<option value=revscore>Reverse Score +<option value=revtime>Reverse Time +<option value=revtitle>Reverse Title +</select> +</font> +<input type=hidden name=config value="htdig-ssi"> +<input type=hidden name=restrict value=""> +<input type=hidden name=exclude value=""> +<br> +Search: +<input type="text" size="30" name="words" value=""> +<input type="submit" value="Search"> +</form> +<hr noshade size=4> +</body> +</html> + diff --git a/debian/htdig/htdig-3.2.0b6/contrib/scriptname/templates/NOTE b/debian/htdig/htdig-3.2.0b6/contrib/scriptname/templates/NOTE new file mode 100644 index 00000000..b6a82833 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/scriptname/templates/NOTE @@ -0,0 +1,2 @@ +These are the standard template files, minus the standard start and +ending of the HTML that is already in the dynamic results page. 
diff --git a/debian/htdig/htdig-3.2.0b6/contrib/scriptname/templates/footer.html b/debian/htdig/htdig-3.2.0b6/contrib/scriptname/templates/footer.html new file mode 100644 index 00000000..67938f89 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/scriptname/templates/footer.html @@ -0,0 +1,2 @@ +$(PAGEHEADER) +$(PREVPAGE) $(PAGELIST) $(NEXTPAGE) diff --git a/debian/htdig/htdig-3.2.0b6/contrib/scriptname/templates/header.html b/debian/htdig/htdig-3.2.0b6/contrib/scriptname/templates/header.html new file mode 100644 index 00000000..41503364 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/scriptname/templates/header.html @@ -0,0 +1,22 @@ +Search results for '$(LOGICAL_WORDS)'</h2> +<hr noshade size=4> +<form method="get" action="$(CGI)"> +<font size=-1> +<input type=hidden name=config value=$(CONFIG)> +<input type=hidden name=restrict value="$(RESTRICT)"> +<input type=hidden name=exclude value="$(EXCLUDE)"> +Match: $(METHOD) +Format: $(FORMAT) +Sort by: $(SORT) +<br> +Refine search: +<input type="text" size="30" name="words" value="$(WORDS)"> +<input type="submit" value="Search"> +</select> +</font> +</form> +<hr noshade size=1> +<b>Documents $(FIRSTDISPLAYED) - $(LASTDISPLAYED) of $(MATCHES) matches. +More <img src="/htdig/star.gif" alt="*">'s indicate a better match. 
+</b> +<hr noshade size=1> diff --git a/debian/htdig/htdig-3.2.0b6/contrib/scriptname/templates/long.html b/debian/htdig/htdig-3.2.0b6/contrib/scriptname/templates/long.html new file mode 100644 index 00000000..57ea8dcc --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/scriptname/templates/long.html @@ -0,0 +1,6 @@ +<dl><dt><strong><a href="$(URL)">$(TITLE)</a></strong>$(STARSLEFT) +</dt><dd>$(EXCERPT)<br> +<i><a href="$(URL)">$(URL)</a></i> +<font size=-1>$(MODIFIED), $(SIZE) bytes</font> +</dd></dl> + diff --git a/debian/htdig/htdig-3.2.0b6/contrib/scriptname/templates/nomatch.html b/debian/htdig/htdig-3.2.0b6/contrib/scriptname/templates/nomatch.html new file mode 100644 index 00000000..840e4098 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/scriptname/templates/nomatch.html @@ -0,0 +1,30 @@ +Search results</h1> +<hr noshade size=4> +<h2>No matches were found for '$(LOGICAL_WORDS)'</h2> +<p> +Check the spelling of the search word(s) you used. +If the spelling is correct and you only used one word, +try using one or more similar search words with "<b>Any</b>." 
+</p><p> +If the spelling is correct and you used more than one +word with "<b>Any</b>," try using one or more similar search +words with "<b>Any</b>."</p><p> +If the spelling is correct and you used more than one +word with "<b>All</b>," try using one or more of the same words +with "<b>Any</b>."</p> +<hr noshade size=4> +<form method="get" action="$(CGI)"> +<font size=-1> +<input type=hidden name=config value=$(CONFIG)> +<input type=hidden name=restrict value="$(RESTRICT)"> +<input type=hidden name=exclude value="$(EXCLUDE)"> +Match: $(METHOD) +Format: $(FORMAT) +Sort by: $(SORT) +<br> +Refine search: +<input type="text" size="30" name="words" value="$(WORDS)"> +<input type="submit" value="Search"> +</select> +</font> +</form> diff --git a/debian/htdig/htdig-3.2.0b6/contrib/scriptname/templates/short.html b/debian/htdig/htdig-3.2.0b6/contrib/scriptname/templates/short.html new file mode 100644 index 00000000..b5044b31 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/scriptname/templates/short.html @@ -0,0 +1 @@ +$(STARSRIGHT) <strong><a href="$(URL)">$(TITLE)</a></strong><br> diff --git a/debian/htdig/htdig-3.2.0b6/contrib/scriptname/templates/syntax.html b/debian/htdig/htdig-3.2.0b6/contrib/scriptname/templates/syntax.html new file mode 100644 index 00000000..feddac71 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/scriptname/templates/syntax.html @@ -0,0 +1,27 @@ +Error in Boolean search for '$(LOGICAL_WORDS)'</h1> +<hr noshade size=4> +Boolean expressions need to be 'correct' in order for the search +system to use them. +The expression you entered has errors in it.<p> +Examples of correct expressions are: <b>cat and dog</b>, <b>cat +not dog</b>, <b>cat or (dog not nose)</b>.<br>Note that +the operator <b>not</b> has the meaning of 'without'. 
+<blockquote><b> +$(SYNTAXERROR) +</b></blockquote> +<hr noshade size=4> +<form method="get" action="$(CGI)"> +<font size=-1> +<input type=hidden name=config value=$(CONFIG)> +<input type=hidden name=restrict value="$(RESTRICT)"> +<input type=hidden name=exclude value="$(EXCLUDE)"> +Match: $(METHOD) +Format: $(FORMAT) +Sort: $(SORT) +<br> +Refine search: +<input type="text" size="30" name="words" value="$(WORDS)"> +<input type="submit" value="Search"> +</select> +</font> +</form> diff --git a/debian/htdig/htdig-3.2.0b6/contrib/status.pl b/debian/htdig/htdig-3.2.0b6/contrib/status.pl new file mode 100755 index 00000000..25ddeda8 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/status.pl @@ -0,0 +1,258 @@ +#!/usr/local/bin/perl +# +# status.pl v1.0 960413 Iain Lea ([email protected]) +# +# ChangeLog +# 960413 IL +# +# Produces a HTML 'Search Engine Status' page with last 5 runs +# and 'Top 10' servers by #URLS indexed. +# +# Usage: status.pl [options] +# -h help +# -F file HTML footer +# -H file HTML header +# -o file HTML generated file +# -v verbose +# +# TODO + +require 'timelocal.pl'; +require 'getopts.pl'; +require '/www/search.sbs.de/bin/sbs.pl'; + +$DataDir = '/www/search.sbs.de/data/robot'; +$RunTimeFile = "$DataDir/current-runtime"; +$RobotFile = "$DataDir/current-robot"; +$IndexFile = '/www/search.sbs.de/test/db/db.wordlist'; + +$DefOutputFile = '/www/search.sbs.de/test/pub/status.html'; +$TmpFile = "/tmp/status.$$"; +$DefFooter = ''; +$DefHeader = ''; +$Verbose = 0; +$Top10Servers = 10; + +&ParseCmdLine; + +print "Generating status.html...\n" if $Verbose; + +&ReadDataFiles ($RunTimeFile, $RobotFile, $IndexFile); +&WriteStatus ($DataDir, $DefOutputFile, $DefHeader, $DefFooter); + +exit 1; + +############################################################################# +# Subroutines +# + +sub ParseCmdLine +{ + &Getopts ('F:hH:o:v'); + + if ($opt_h ne "") { + print <<EndOfHelp +Produce an HTML 'Status' page of last 5 runs and Top 10 servers by #URLS. 
+ +Usage: $0 [options] + -h help + -F file HTML footer + -H file HTML header + -o file HTML generated file + -v verbose + +EndOfHelp +; + exit 0; + } + $DefFooter = $opt_F if ($opt_H ne ""); + $DefHeader = $opt_H if ($opt_H ne ""); + $DefOutputFile = $opt_o if ($opt_o ne ""); + $Verbose = 1 if ($opt_v ne ""); +} + +sub ReadDataFiles +{ + my ($RunTimeFile, $RobotFile, $IndexFile) = @_; + my ($IndexSize, $NumWords, $NumURLS, $NumServers); + my ($BegTime, $EndTime, $RunDate, $RunTime, $Key); + my (%Months) = ( + 'Jan', '0', 'Feb', '1', 'Mar', '2', 'Apr', '3', 'May', '4', 'Jun', '5', + 'Jul', '6', 'Aug', '7', 'Sep', '8', 'Oct', '9', 'Nov', '10', 'Dec', '11' ); + + # RunDate : RunTime + + open (TIME, "$RunTimeFile") || die "Error: $RunTimeFile - $!\n"; + while (<TIME>) { + chop; + if (! $EndTime && $BegTime) { + # Sat Apr 13 12:57:52 MET DST 1996 + /^...\ (...)\ ([0-9][0-9])\ (..):(..):(..)\ ... ... ([0-9]{4}$)/; + $EndTime = timelocal ($5, $4, $3, $2, $Months{$1}, $6 - 1900); + $RunTime = $EndTime - $BegTime; + $RunTime = sprintf ("%02d%02d", $RunTime/3600, ($RunTime%3600)/60); + print "END=[$_] [$EndTime] [$RunTime]\n" if $Verbose; + } + if (! $BegTime) { + # Sat Apr 13 12:57:52 MET DST 1996 + /^...\ (...)\ ([0-9][0-9])\ (..):(..):(..)\ ... ... 
([0-9]{4}$)/; + $Mon = $Months{$1}; + $Year = $6 - 1900; + $BegTime = timelocal ($5, $4, $3, $2, $Mon, $Year); + $RunDate = sprintf ("%02d%02d%02d", $Year, $Mon+1, $2); + print "BEG=[$_] [$BegTime] [$RunDate]\n" if $Verbose; + } + } + close (TIME); + + # IndexSize : NumWords : NumURLS : NumServers + + @StatData = stat ($IndexFile); + $IndexSize = $StatData[7]; + print "SIZE=[$IndexSize]\n" if $Verbose; + + # NumWords : NumURLS : NumServers + + $NumWords = $NumURLS = $NumServers = 0; + + open (ROBOT, "$RobotFile") || die "Error: $RobotFile - $!\n"; + while (<ROBOT>) { + if (/^htdig:\s+(.*)\s+([0-9]*)\s+documents$/) { + $NumURLS += $2; + $NumServers++; + if ($2 > 0) { + $Key = sprintf ("%07d|%s", $2, $1); + $Top10ByName{$Key} = $2; + } + print "SERVER=[$1] DOCS=[$2]\n" if $Verbose; + } elsif (/^Read\s+([0-9]*)\s+words$/) { + $NumWords = $1; + print "WORDS=[$NumWords]\n" if $Verbose; + } + } + close (ROBOT); + + # Write data to YYMMDD-info file + + $InfoFile = "$DataDir/$RunDate-info"; + $CurrFile = "$DataDir/current-info"; + + open (INFO, ">$InfoFile") || die "Error: $InfoFile - $!\n"; + print "$RunDate:$RunTime:$IndexSize:$NumWords:$NumURLS:$NumServers\n" if $Verbose; + print INFO "$RunDate:$RunTime:$IndexSize:$NumWords:$NumURLS:$NumServers\n"; + close (INFO); + unlink ($CurrFile); + symlink ($InfoFile, $CurrFile); +} + +sub WriteStatus +{ + my ($DataDir, $OutFile, $Header, $Footer) = @_; + + $RobotInfo = &ReadRobotInfo ("$DataDir/current-info"); + + open (HTML, ">$OutFile") || die "Error: $OutFile - $!\n"; + + &PrintBoilerPlate ($Header, 1); + + print HTML <<EOT +<p> +<strong>$RobotInfo</strong> +<p> +<table border=2 width=400> +<caption>Table of last 5 robot runs.</caption> +<th>Run Date<th>Run Time<th># Servers<th># URL's<th># Words<th>Index (MB) +<tr> +EOT +; + # read YYMMDD-info files + opendir (DIR, $DataDir) || die "Error: $DataDir - $!\n"; + @InfoFiles = grep (/^[0-9]{6}-info$/, readdir (DIR)); + closedir (DIR); + @InfoFiles = reverse (sort (@InfoFiles)); + 
+ @InfoFiles = @InfoFiles[0,1,2,3,4]; + foreach $File (@InfoFiles) { + $File = "$DataDir/$File"; + open (INFO, "$File") || die "Error: $File - $!\n"; + chop (($_ = <INFO>)); + ($RunDate, $RunTime, $IndexSize, $NumWords, $NumURLS, $NumServers) = split (':'); + $IndexSize = sprintf ("%.1f", $IndexSize / (1024*1024)); + $RunTime =~ /(..)(..)/; + $RunTime = "$1:$2"; + print HTML <<EOT +<td align="center">$RunDate</td> +<td align="center">$RunTime</td> +<td align="right">$NumServers</td> +<td align="right">$NumURLS</td> +<td align="right">$NumWords</td> +<td align="right">$IndexSize</td> +<tr> +EOT +; + close (INFO); + } + + print HTML <<EOT +</table> +<p> +<p> +<table border=2 width=400> +<caption>Table of Top 10 servers listed by number of indexed documents.</caption> +<th>Top 10 Servers<th># URL's +<tr> +EOT +; + $NumServers = 0; + foreach $Key (reverse (sort (keys (%Top10ByName)))) { + if ($NumServers < $Top10Servers) { + $NumServers++; + $NumURLS = $Top10ByName{$Key}; + $Key =~ /^[0-9]*\|(.*)$/; + $Server = $1; + $Server =~ s/:80$//; + print HTML <<EOT +<td width="80%" align="left"><a href="http://$Server/">$Server</a></td> +<td width="20%" align="right">$NumURLS</td> +<tr> +EOT +; + } + } + + print HTML "</table>\n"; + + &PrintBoilerPlate ($Footer, 0); + + close (HTML); +} + +sub PrintBoilerPlate +{ + my ($File, $IsHeader) = @_; + + if ($File ne "" && -e $File) { + open (FILE, $File) || die "Error: $File - $!\n"; + while (<FILE>) { + print HTML; + } + close (FILE); + } else { + if ($IsHeader) { + print HTML <<EOT +<html> +<head> +<title>Search Engine Status</title> +</head> +<body> +<h2>Search Engine Status</h2> +<hr> +<p> +EOT +; + } else { + &PrintFooterHTML; + } + } +} + diff --git a/debian/htdig/htdig-3.2.0b6/contrib/urlindex/urlindex.pl b/debian/htdig/htdig-3.2.0b6/contrib/urlindex/urlindex.pl new file mode 100755 index 00000000..436c5eef --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/urlindex/urlindex.pl @@ -0,0 +1,285 @@ +#!/usr/local/bin/perl + +## 
+## urlindex.pl (C) 1995 Andrew Scherpbier +## +## This program will build an index of all the URLs in the +## htdig document database. +## + +use GDBM_File; +require('SDSU_www.pl'); + +$dbfile = "/gopher/www/htdig/sdsu3d.docdb"; +$dbfile = "/tmp/db.docdb"; +$exclude = "rohan.sdsu.edu\\/home\\/"; + +tie(%docdb, GDBM_File, $dbfile, GDBM_READER, 0) || die "Unable to open $dbfile: $!"; + +print "Reading...\n"; + +## +## Read in all the relevant data. +## +while (($key, $value) = each %docdb) +{ + next if $key =~ /^nextDocID/; + %record = parse_ref_record($value); + next if $record{"STATE"} eq 1; + next if $key =~ /$exclude/; + + $title = $record{"TITLE"}; + + ## + ## Get rid of starting and trailing whitespace junk + ## + $title =~ s/^[ \t\n\r]*//; + $title =~ s/[ \t\n\r]*$//; + + ## + ## If the title starts with 'the', it will be taken out and added + ## to the end of the title. This means that a title like "The + ## Homepage of X" will become "Homepage of X, The" + ## + if ($title =~ /^the /i) + { + $title = substr($title, 4) . ", " . substr($title, 0, 3); + } + if ($title =~ /^SDSU /) + { + $title = substr($title, 5) . ", " . substr($title, 0, 4); + } + if ($title =~ /^San Diego State University /i) + { + $title = substr($title, 27) . ", " . 
substr($title, 0, 26); + } + $value = $title; + $value =~ tr/A-Z/a-z/; + $titles{$value} = "$title\001$key"; + push(@unsorted, $value); +} + +$current = " "; +open(M, ">index.html"); +print M "<html><head><title>Index of all documents at SDSU</title></head>\n"; +print M "<body>\n"; +print M &www_logo_2("Index of all documents at SDSU"); +print M "<p>This is a list of WWW documents that were found while indexing all\n"; +print M "the publicly available WWW servers at San Diego State University.\n"; +print M "The documents are indexed by their titles.\n"; +print M "</p><h2>\n"; + +$previous = ""; + +print "Writing...\n"; + +foreach $value (sort @unsorted) +{ + next if $value eq $previous; + $previous = $value; + next if !($value =~ /^[a-zA-Z]/); + + ($title, $url) = split('\001', $titles{$value}, 2); + + $first = substr($title, 0, 1); + if ($current =~ /$first/i) + { + print F "<li><a href=\"$url\">$title</a></li>\n"; + } + else + { + ## + ## New letter. Open a new file for it + ## + $current = $first; + $current =~ tr/a-z/A-Z/; + print F "</li></body></html>\n"; + close(F); + open(F, ">index$current.html"); + print F "<html><head><title>Index for $current</title></head>\n"; + print F "<body>\n"; + print F &www_logo_2("Index for $current"); + print F "<ul>\n"; + print F "<li><a href=\"$url\">$title</a></li>\n"; + + ## + ## Add a reference to the main index for this letter + ## + print M " <a href=\"index$current.html\">$current</a>\n"; + + print "Index of $current\n"; + } +} + +close(F); + +print M "</h2></body></html>\n"; +close(M); + + +sub parse_ref_record +{ + local($value) = @_; + local(%rec, $length, $count, $result); + + while (length($value) > 0) + { + $what = unpack("C", $value); + $value = substr($value, 1); + if ($what == 0) + { + # ID + $rec{"ID"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 1) + { + # TIME + $rec{"TIME"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 2) + { + # ACCESSED + 
$rec{"ACCESSED"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 3) + { + # STATE + $rec{"STATE"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 4) + { + # SIZE + $rec{"SIZE"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 5) + { + # LINKS + $rec{"LINKS"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 6) + { + # IMAGESIZE + $rec{"IMAGESIZE"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 7) + { + # HOPCOUNT + $rec{"HOPCOUNT"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 8) + { + # URL + $length = unpack("i", $value); + $rec{"URL"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 9) + { + # HEAD + $length = unpack("i", $value); + $rec{"HEAD"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 10) + { + # TITLE + $length = unpack("i", $value); + $rec{"TITLE"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 11) + { + # DESCRIPTIONS + $count = unpack("i", $value); + $value = substr($value, 4); + $result = ""; + foreach (1 .. $count) + { + $length = unpack("i", $value); + $result = $result . unpack("x4 A$length", $value) . ""; + $value = substr($value, 4 + $length); + } + chop $result; + $rec{"DESCRIPTIONS"} = $result; + } + elsif ($what == 12) + { + # ANCHORS + $count = unpack("i", $value); + $value = substr($value, 4); + $result = ""; + foreach (1 .. $count) + { + $length = unpack("i", $value); + $result = $result . unpack("x4 A$length", $value) . 
""; + $value = substr($value, 4 + $length); + } + chop $result; + $rec{"ANCHORS"} = $result; + } + elsif ($what == 13) + { + # EMAIL + $length = unpack("i", $value); + $rec{"EMAIL"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 14) + { + # NOTIFICATION + $length = unpack("i", $value); + $rec{"NOTIFICATION"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 15) + { + # SUBJECT + $length = unpack("i", $value); + $rec{"SUBJECT"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 16) + { + # STRING (ignore, but unpack) + $length = unpack("i", $value); + $rec{"STRING"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 17) + { + # METADSC + $length = unpack("i", $value); + $rec{"METADSC"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 18) + { + # BACKLINKS + $rec{"BACKLINKS"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 19) + { + # SIGNATURE + $rec{"SIG"} = unpack("i", $value); + $value = substr($value, 4); + } + } + print "title = $rec{'TITLE'}\n"; + return %rec; +} + + + + + + + diff --git a/debian/htdig/htdig-3.2.0b6/contrib/whatsnew/whatsnew.pl b/debian/htdig/htdig-3.2.0b6/contrib/whatsnew/whatsnew.pl new file mode 100755 index 00000000..e27e744c --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/whatsnew/whatsnew.pl @@ -0,0 +1,365 @@ +#!/usr/local/bin/perl +# +# whatsnew.pl v1.1 (C) 1996 Iain Lea +# modified 26 Oct 1998 (c) 1998 Jacques Reynes +# +# ChangeLog +# 960321 IL Reversed sorting to show newest documents first +# 981026 JR Modified to work with Berkeley DB2. +# 980204 GRH Modified to work with changes in ht://Dig db format +# +# Produces a HTML 'Whats New' page with custom header and footer. 
+# +# Title +# Descriptions +# URL +# Last modification date (in ctime format) +# +# The date is specified as yyyymmdd +# +# Usage: whatsnew.pl [options] +# -h help +# -d date base date [default: $DefDate] +# -n days list documents newer than days old [default: $DefDays] +# -f file database index [default: $DefIndex] +# -F file HTML footer +# -H file HTML header +# -o file HTML generated file +# -v verbose + +use BerkeleyDB; +require 'timelocal.pl'; +require 'getopts.pl'; + +$DefIndex = ' your data base .docdb'; +$DefOutputFile = ' your result file URL created in your web server whatsnew.html'; +$TmpFile = "/tmp/whatsnew.$$"; +$DefFooter = ''; +$DefHeader = ''; +$Verbose = 0; +$NewNum = 0; +$DefDays = 3; +chop (($DefDate = '19'.`date +%y%m%d`)); + +&ParseCmdLine; + +$DefDate =~ /([0-9]{4})([0-9]{2})([0-9]{2})/; +$When = timelocal (0, 0, 0, $3, $2 - 1, $1 - 1900)- ($DefDays * 86400); +$NewDate = localtime ($When); +$dbfile = $DefIndex; + +print "Generating 'Whats New' for documents newer than '$NewDate'...\n" if $Verbose; + +&ReadDatabase ($DefIndex, $TmpFile); +&WriteWhatsNew ($TmpFile, $DefOutputFile, $DefHeader, $DefFooter); + +exit 1; + +############################################################################# +# Subroutines +# + +sub ParseCmdLine +{ + &Getopts ('d:f:F:hH:n:o:v'); + + if ($opt_h ne "") { + print <<EndOfHelp +Produce an HTML 'Whats New' page with custom header & footer for database. 
+ +Usage: $0 [options] + -h help + -d date base date [default: $DefDate] + -n days list documents newer than days old [default: $DefDays] + -f file database index [default: $DefIndex] + -F file HTML footer + -H file HTML header + -o file HTML generated file + -v verbose + +EndOfHelp +; + exit 0; + } + $DefDate = $opt_d if ($opt_d ne ""); + $DefDays = $opt_n if ($opt_n ne ""); + $DefIndex = $opt_f if ($opt_f ne ""); + $DefFooter = $opt_F if ($opt_H ne ""); + $DefHeader = $opt_H if ($opt_H ne ""); + $DefOutputFile = $opt_o if ($opt_o ne ""); + $Verbose = 1 if ($opt_v ne ""); +} + +sub ReadDatabase +{ + my ($Index, $TmpFile) = @_; + + tie %docdb, 'BerkeleyDB::Btree', -Filename => $Index, -Flags => DB_RDONLY || die "Error: $Index - $!"; + + open (TMP, ">$TmpFile") || die "Error: $TmpFile - $!\n"; + + while (($key, $value) = each %docdb) + { + next if $key =~ /^nextDocID/; + %rec = parse_ref_record ($value); + if ($rec{'TIME'} >= $When) + { + $Line = "$rec{'TIME'}|$rec{'URL'}|$rec{'TITLE'}|$rec{'DESCRIPTIONS'}\n"; + print $Line if $Verbose; + print TMP $Line; + $NewNum++; + } + } + + close (TMP); +} + +sub WriteWhatsNew +{ + my ($InFile, $OutFile, $Header, $Footer) = @_; + + open (URLS, "sort -r $InFile |") || die "Error: $InFile - $!\n"; + open (HTML, ">$OutFile") || die "Error: $OutFile - $!\n"; + + &PrintBoilerPlate ($Header, 1); + + while (<URLS>) { + chop; + ($Time, $URL, $Title, $Description) = split ('\|'); + $Ctime = localtime ($Time); + if ($Verbose) { + print <<EOT +Title: $Title +Description: $Description +URL: $URL +Modified: $Ctime + +EOT +; + } + print HTML <<EOT +<strong>Title:</strong> <a href="$URL">$Title</a> +<strong>Description:</strong> $Description +<strong>URL:</strong> $URL +<strong>Modified:</strong> $Ctime + +EOT +; + } + + &PrintBoilerPlate ($Footer, 0); + + close (HTML); + close (URLS); + + unlink ($InFile); +} + +sub PrintBoilerPlate +{ + my ($File, $IsHeader) = @_; + + if ($File ne "" && -e $File) { + open (FILE, $File) || die "Error: $File 
- $!\n"; + while (<FILE>) { + print HTML; + } + close (FILE); + } else { + if ($IsHeader) { + print HTML <<EOT +<html> +<head> +<title>Whats New!</title> +</head> +<body> +<h2>Whats New!</h2> +<center> +<a href="/whatsnew.html"><img src="/new.gif"></a> +<a href="/"><img src="/home.gif"></a> +<a href="/intranet.html"><img src="/search.gif"></a> +<a href="mailto:Iain.Lea\@sbs.de"><img src="/contact.gif"></a> +</center> +<hr> +<strong>Found $NewNum documents newer than '$NewDate'</strong> +<pre> +EOT +; + } else { + print HTML <<EOT +</pre> +<hr> +<center> +<a href="/whatsnew.html"><img src="/new.gif"></a> +<a href="/"><img src="/home.gif"></a> +<a href="/intranet.html"><img src="/search.gif"></a> +<a href="mailto:Iain.Lea\@sbs.de"><img src="/contact.gif"></a> +</center> +</body> +</html> +EOT +; + } + } +} + + +sub parse_ref_record +{ + local($value) = @_; + local(%rec, $length, $count, $result); + + while (length($value) > 0) + { + $what = unpack("C", $value); + $value = substr($value, 1); + if ($what == 0) + { + # ID + $rec{"ID"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 1) + { + # TIME + $rec{"TIME"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 2) + { + # ACCESSED + $rec{"ACCESSED"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 3) + { + # STATE + $rec{"STATE"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 4) + { + # SIZE + $rec{"SIZE"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 5) + { + # LINKS + $rec{"LINKS"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 6) + { + # IMAGESIZE + $rec{"IMAGESIZE"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 7) + { + # HOPCOUNT + $rec{"HOPCOUNT"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif ($what == 8) + { + # URL + $length = unpack("i", $value); + $rec{"URL"} = unpack("x4 A$length", $value); + $value = 
substr($value, 4 + $length); + } + elsif ($what == 9) + { + # HEAD + $length = unpack("i", $value); + $rec{"HEAD"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 10) + { + # TITLE + $length = unpack("i", $value); + $rec{"TITLE"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 11) + { + # DESCRIPTIONS + $count = unpack("i", $value); + $value = substr($value, 4); + $result = ""; + foreach (1 .. $count) + { + $length = unpack("i", $value); + $result = $result . unpack("x4 A$length", $value) . ""; + $value = substr($value, 4 + $length); + } + chop $result; + $rec{"DESCRIPTIONS"} = $result; + } + elsif ($what == 12) + { + # ANCHORS + $count = unpack("i", $value); + $value = substr($value, 4); + $result = ""; + foreach (1 .. $count) + { + $length = unpack("i", $value); + $result = $result . unpack("x4 A$length", $value) . ""; + $value = substr($value, 4 + $length); + } + chop $result; + $rec{"ANCHORS"} = $result; + } + elsif ($what == 13) + { + # EMAIL + $length = unpack("i", $value); + $rec{"EMAIL"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 14) + { + # NOTIFICATION + $length = unpack("i", $value); + $rec{"NOTIFICATION"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 15) + { + # SUBJECT + $length = unpack("i", $value); + $rec{"SUBJECT"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 16) + { + # STRING (ignore, but unpack) + $length = unpack("i", $value); + $rec{"STRING"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 17) + { + # METADSC + $length = unpack("i", $value); + $rec{"METADSC"} = unpack("x4 A$length", $value); + $value = substr($value, 4 + $length); + } + elsif ($what == 18) + { + # BACKLINKS + $rec{"BACKLINKS"} = unpack("i", $value); + $value = substr($value, 4); + } + elsif 
($what == 19) + { + # SIGNATURE + $rec{"SIG"} = unpack("i", $value); + $value = substr($value, 4); + } + } + return %rec; +} + diff --git a/debian/htdig/htdig-3.2.0b6/contrib/wordfreq/wordfreq.html b/debian/htdig/htdig-3.2.0b6/contrib/wordfreq/wordfreq.html new file mode 100644 index 00000000..164b8e5f --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/wordfreq/wordfreq.html @@ -0,0 +1,16 @@ +<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN"> +<html> <head> +<title></title> +</head> + +<body> +<h1></h1> + + + +<hr> +<address><a href="http://www.sdsu.edu/~turtle/">Andrew Scherpbier <[email protected]></a></address> +<!-- hhmts start --> +Last modified: Wed Jul 5 10:26:36 PDT 1995 +<!-- hhmts end --> +</body> </html> diff --git a/debian/htdig/htdig-3.2.0b6/contrib/wordfreq/wordfreq.pl b/debian/htdig/htdig-3.2.0b6/contrib/wordfreq/wordfreq.pl new file mode 100755 index 00000000..31402a23 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/wordfreq/wordfreq.pl @@ -0,0 +1,54 @@ +#!/usr/local/bin/perl + +use GDBM_File; +use BerkeleyDB; + +## +## wordfreq.pl +## (C) 1995 Andrew Scherpbier <[email protected]> +## +## Will generate a list of words and how frequently they are used +## +## updated to deal with Berkeley db files 1998 Iosif Fettich <[email protected]> +## + + +$filetype = 'DB'; + +if (not defined $ARGV[0] or defined ($ARGV[1]) and $ARGV[1] !~ /g/i) { + print "\n\nThis program is used in conjunction with ht://Dig \n"; + print "to determine the frequency of words in a database containing word references.\n\n"; + print "Usage: $0 filename (to use a Berkeley db2 wordlist)\n"; + print " $0 filename g[dbm] (to use a GDBM wordlist)\n\n\n"; + exit; +} + +$filename = $ARGV[0]; + +if ($filename =~ /gdbm$/i or $ARGV[1] =~ /g/i) { + $filetype = 'GDBM'; +} + +if ($filetype eq 'GDBM') { + tie %worddb, 'GDBM_File', $ARGV[0], GDBM_READER, 0 + or die "Unable to open $ARGV[0] $!"; +} else { + tie %worddb, 'BerkeleyDB::Btree', + -Filename => $filename, + -Flags => DB_RDONLY + 
or die "Cannot open file $filename: $! $BerkeleyDB::Error\n" ; +} + +while (($key, $value) = each %worddb) +{ + $length = length($value) / 20; + $total = 0; + foreach $i (0 .. $length - 1) + { + ($count, $id, $weight, $anchor, $location) = + unpack("i i i i i", substr($value, $i * 20, 20)); + $total += $count; + } + print "$total\t$key\n"; +} + |