1 files changed, 238 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/contrib/parse_doc.pl b/debian/htdig/htdig-3.2.0b6/contrib/parse_doc.pl
new file mode 100755
index 00000000..63b775db
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/contrib/parse_doc.pl
@@ -0,0 +1,238 @@
+#!/usr/local/bin/perl
+
+# 1998/12/10
+# Added:        push @allwords, $fields[$x];   <[email protected]>
+# Replaced:     matching patterns. they match words starting or ending with ()[]'`;:?.,! now, not when in between!
+# Gone:         the variable $line is gone (using $_ now)
+#
+# 1998/12/11
+# Added:        catdoc test (is catdoc runnable?)    <[email protected]>
+# Changed:      push line semi-colomn wrong.         <[email protected]>
+# Changed:      matching works for end of lines now  <[email protected]>
+# Added:        option to rigorously delete all punctuation <[email protected]>
+#
+# 1999/02/09
+# Added:        option to delete all hyphens         <[email protected]>
+# Added:        uses ps2ascii to handle PS files     <[email protected]>
+# 1999/02/15
+# Added:        check for some file formats          <[email protected]>
+# 1999/02/25
+# Added:        uses pdftotext to handle PDF files   <[email protected]>
+# Changed:      generates a head record with punct.  <[email protected]>
+# 1999/03/01
+# Added:        extra checks for file "wrappers"     <[email protected]>
+#               & check for MS Word signature (no longer defaults to catdoc)
+# 1999/03/05
+# Changed:      rejoin hyphenated words across lines <[email protected]>
+#               (in PDFs) & remove multiple punct. chars. between words (all)
+# 1999/03/10
+# Changed:      fix handling of minimum word length  <[email protected]>
+# 1999/08/12
+# Changed:      adapted for xpdf 0.90 release        <[email protected]>
+# Added:        uses pdfinfo to handle PDF titles    <[email protected]>
+# Changed:      keep hyphens by default, as htdig    <[email protected]>
+#               does, but change dashes to hyphens
+# 1999/09/09
+# Changed:      fix to handle empty PDF title right  <[email protected]>
+# 2000/01/12
+# Changed:      "break" to "last" (no break in Perl) <[email protected]>
+# Changed:      code for parsing a line into a list of
+#               words, to use "split", other streamlining.
+# 2001/07/12
+# Changed:      fix "last" handling in dehyphenation <[email protected]>
+# Added:        handle %xx codes in title from URL   <[email protected]>
+# 2003/06/07
+# Changed:	allow file names with spaces         <[email protected]>
+#########################################
+#
+# set this to your MS Word to text converter
+# get it from: http://www.fe.msk.ru/~vitus/catdoc/
+#
+$CATDOC = "/usr/local/bin/catdoc";
+#
+# set this to your WordPerfect to text converter, or /bin/true if none available
+# this nabs WP documents with .doc suffix, so catdoc doesn't see them
+#
+$CATWP = "/bin/true";
+#
+# set this to your RTF to text converter, or /bin/true if none available
+# this nabs RTF documents with .doc suffix, so catdoc doesn't see them
+#
+$CATRTF = "/bin/true";
+#
+# set this to your PostScript to text converter
+# get it from the ghostscript 3.33 (or later) package
+#
+$CATPS = "/usr/bin/ps2ascii";
+#
+# set this to your PDF to text converter, and pdfinfo tool
+# get it from the xpdf 0.90 package at http://www.foolabs.com/xpdf/
+#
+$CATPDF = "/usr/bin/pdftotext";
+$PDFINFO = "/usr/bin/pdfinfo";
+#$CATPDF = "/usr/local/bin/pdftotext";
+#$PDFINFO = "/usr/local/bin/pdfinfo";
+
+# need some var's
+$minimum_word_length = 3;
+$head = "";
+@allwords = ();
+@temp = ();
+$x = 0;
+#@fields = ();
+$calc = 0;
+$dehyphenate = 0;
+$title = "";
+#
+# okay. my programming style isn't that nice, but it works...
+
+#for ($x=0; $x<@ARGV; $x++) {           # print out the args
+#       print STDERR "$ARGV[$x]\n";
+#}
+
+# Read first bytes of file to check for file type (like file(1) does)
+open(FILE, "< $ARGV[0]") || die "Oops. Can't open file $ARGV[0]: $!\n";
+read FILE,$magic,8;
+close FILE;
+
+if ($magic =~ /^\0\n/) {                # possible MacBinary header
+        open(FILE, "< $ARGV[0]") || die "Oops. Can't open file $ARGV[0]: $!\n";
+        read FILE,$magic,136;           # let's hope parsers can handle them!
+        close FILE;
+}
+
+if ($magic =~ /%!|^\033%-12345/) {      # it's PostScript (or HP print job)
+        $parser = $CATPS;               # gs 3.33 leaves _temp_.??? files in .
+        $parsecmd = "(cd /tmp; $parser; rm -f _temp_.???) < \"$ARGV[0]\" |";
+# keep quiet even if PS gives errors...
+#       $parsecmd = "(cd /tmp; $parser; rm -f _temp_.???) < \"$ARGV[0]\" 2>/dev/null |";
+        $type = "PostScript";
+        $dehyphenate = 0;               # ps2ascii already does this
+        if ($magic =~ /^\033%-12345/) { # HP print job
+                open(FILE, "< $ARGV[0]") || die "Oops. Can't open file $ARGV[0]: $!\n";
+                read FILE,$magic,256;
+                close FILE;
+                exit unless $magic =~ /^\033%-12345X\@PJL.*\n*.*\n*.*ENTER\s*LANGUAGE\s*=\s*POSTSCRIPT.*\n*.*\n*.*\n%!/
+        }
+} elsif ($magic =~ /%PDF-/) {           # it's PDF (Acrobat)
+        $parser = $CATPDF;
+        $parsecmd = "$parser -raw \"$ARGV[0]\" - |";
+# to handle single-column, strangely laid out PDFs, use coalescing feature...
+#       $parsecmd = "$parser \"$ARGV[0]\" - |";
+        $type = "PDF";
+        $dehyphenate = 1;               # PDFs often have hyphenated lines
+        if (open(INFO, "$PDFINFO \"$ARGV[0]\" 2>/dev/null |")) {
+                while (<INFO>) {
+                        if (/^Title:/) {
+                                $title = $_;
+                                $title =~ s/^Title:\s+//;
+                                $title =~ s/\s+$//;
+                                $title =~ s/\s+/ /g;
+                                $title =~ s/&/\&amp\;/g;
+                                $title =~ s/</\&lt\;/g;
+                                $title =~ s/>/\&gt\;/g;
+                                last;
+                        }
+                }
+                close INFO;
+        }
+} elsif ($magic =~ /WPC/) {             # it's WordPerfect
+        $parser = $CATWP;
+        $parsecmd = "$parser \"$ARGV[0]\" |";
+        $type = "WordPerfect";
+        $dehyphenate = 0;               # WP documents not likely hyphenated
+} elsif ($magic =~ /^{\\rtf/) {         # it's Richtext
+        $parser = $CATRTF;
+        $parsecmd = "$parser \"$ARGV[0]\" |";
+        $type = "RTF";
+        $dehyphenate = 0;               # RTF documents not likely hyphenated
+} elsif ($magic =~ /\320\317\021\340/) {    # it's MS Word
+        $parser = $CATDOC;
+        $parsecmd = "$parser -a -w \"$ARGV[0]\" |";
+        $type = "Word";
+        $dehyphenate = 0;               # Word documents not likely hyphenated
+} else {
+        die "Can't determine type of file $ARGV[0]; content-type: $ARGV[1]; URL: $ARGV[2]\n";
+}
+# print STDERR "$ARGV[0]: $type $parsecmd\n";
+die "Hmm. $parser is absent or unwilling to execute.\n" unless -x $parser;
+
+
+# open it
+open(CAT, "$parsecmd") || die "Hmmm. $parser doesn't want to be opened using pipe.\n";
+while (<CAT>) {
+        while (/[A-Za-z\300-\377]-\s*$/ && $dehyphenate) {
+                $_ .= <CAT>;
+                last if eof;
+                s/([A-Za-z\300-\377])-\s*\n\s*([A-Za-z\300-\377])/$1$2/s
+        }
+        $head .= " " . $_;
+#       s/\s+[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]+|[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]+\s+|^[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]+|[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]+$/ /g;    # replace reading-chars with space (only at end or begin of word, but allow multiple characters)
+##       s/\s[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]|[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]\s|^[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]|[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]$/ /g;    # replace reading-chars with space (only at end or begin of word)
+##       s/[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]/ /g;      # rigorously replace all by <[email protected]>
+##       s/[\-\255]/ /g;                                 # replace hyphens with space
+#       s/[\255]/-/g;                                   # replace dashes with hyphens
+#       @fields = split;                                # split up line
+#       next if (@fields == 0);                         # skip if no fields (does it speed up?)
+#       for ($x=0; $x<@fields; $x++) {                  # check each field if string length >= 3
+#               if (length($fields[$x]) >= $minimum_word_length) {
+#                       push @allwords, $fields[$x];    # add to list
+#               }
+#       }
+
+	# Delete valid punctuation.  These are the default values
+	# for valid_punctuation, and should be changed other values
+	# are specified in the config file.
+	tr{-\255._/!#$%^&'}{}d;
+	push @allwords, grep { length >= $minimum_word_length } split /\W+/;
+}
+
+close CAT;
+
+exit unless @allwords > 0;              # nothing to output
+
+#############################################
+# print out the title, if it's set, and not just a file name
+if ($title !~ /^$/ && $title !~ /^[A-G]:[^\s]+\.[Pp][Dd][Ff]$/) {
+        print "t\t$title\n";
+} else {                                        # otherwise generate a title
+        @temp = split(/\//, $ARGV[2]);          # get the filename, get rid of basename
+        $temp[-1] =~ s/%([A-F0-9][A-F0-9])/pack("C", hex($1))/gie;
+        print "t\t$type Document $temp[-1]\n";  # print it
+}
+
+
+#############################################
+# print out the head
+$head =~ s/^\s+//;                      # remove leading and trailing space
+$head =~ s/\s+$//;
+$head =~ s/\s+/ /g;
+$head =~ s/&/\&amp\;/g;
+$head =~ s/</\&lt\;/g;
+$head =~ s/>/\&gt\;/g;
+print "h\t$head\n";
+#$calc = @allwords;
+#print "h\t";
+##if ($calc >100) {                      # but not more than 100 words
+##       $calc = 100;
+##}
+#for ($x=0; $x<$calc; $x++) {            # print out the words for the exerpt
+#        print "$allwords[$x] ";
+#}
+#print "\n";
+
+
+#############################################
+# now the words
+#for ($x=0; $x<@allwords; $x++) {
+#       $calc=int(1000*$x/@allwords);           # calculate rel. position (0-1000)
+#       print "w\t$allwords[$x]\t$calc\t0\n";   # print out word, rel. pos. and text type (0)
+#}
+$x = 0;
+for ( @allwords ) {
+    # print out word, rel. pos. and text type (0)
+    printf "w\t%s\t%d\t0\n", $_, 1000*$x++/@allwords;
+}
+
+$calc=@allwords;
+# print STDERR "# of words indexed: $calc\n";