diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/contrib/parse_doc.pl')
-rwxr-xr-x | debian/htdig/htdig-3.2.0b6/contrib/parse_doc.pl | 238 |
1 files changed, 238 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/contrib/parse_doc.pl b/debian/htdig/htdig-3.2.0b6/contrib/parse_doc.pl new file mode 100755 index 00000000..63b775db --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/parse_doc.pl @@ -0,0 +1,238 @@ +#!/usr/local/bin/perl + +# 1998/12/10 +# Added: push @allwords, $fields[$x]; <[email protected]> +# Replaced: matching patterns. they match words starting or ending with ()[]'`;:?.,! now, not when in between! +# Gone: the variable $line is gone (using $_ now) +# +# 1998/12/11 +# Added: catdoc test (is catdoc runnable?) <[email protected]> +# Changed: push line semi-colomn wrong. <[email protected]> +# Changed: matching works for end of lines now <[email protected]> +# Added: option to rigorously delete all punctuation <[email protected]> +# +# 1999/02/09 +# Added: option to delete all hyphens <[email protected]> +# Added: uses ps2ascii to handle PS files <[email protected]> +# 1999/02/15 +# Added: check for some file formats <[email protected]> +# 1999/02/25 +# Added: uses pdftotext to handle PDF files <[email protected]> +# Changed: generates a head record with punct. <[email protected]> +# 1999/03/01 +# Added: extra checks for file "wrappers" <[email protected]> +# & check for MS Word signature (no longer defaults to catdoc) +# 1999/03/05 +# Changed: rejoin hyphenated words across lines <[email protected]> +# (in PDFs) & remove multiple punct. chars. between words (all) +# 1999/03/10 +# Changed: fix handling of minimum word length <[email protected]> +# 1999/08/12 +# Changed: adapted for xpdf 0.90 release <[email protected]> +# Added: uses pdfinfo to handle PDF titles <[email protected]> +# Changed: keep hyphens by default, as htdig <[email protected]> +# does, but change dashes to hyphens +# 1999/09/09 +# Changed: fix to handle empty PDF title right <[email protected]> +# 2000/01/12 +# Changed: "break" to "last" (no break in Perl) <[email protected]> +# Changed: code for parsing a line into a list of +# words, to use "split", other streamlining. +# 2001/07/12 +# Changed: fix "last" handling in dehyphenation <[email protected]> +# Added: handle %xx codes in title from URL <[email protected]> +# 2003/06/07 +# Changed: allow file names with spaces <[email protected]> +######################################### +# +# set this to your MS Word to text converter +# get it from: http://www.fe.msk.ru/~vitus/catdoc/ +# +$CATDOC = "/usr/local/bin/catdoc"; +# +# set this to your WordPerfect to text converter, or /bin/true if none available +# this nabs WP documents with .doc suffix, so catdoc doesn't see them +# +$CATWP = "/bin/true"; +# +# set this to your RTF to text converter, or /bin/true if none available +# this nabs RTF documents with .doc suffix, so catdoc doesn't see them +# +$CATRTF = "/bin/true"; +# +# set this to your PostScript to text converter +# get it from the ghostscript 3.33 (or later) package +# +$CATPS = "/usr/bin/ps2ascii"; +# +# set this to your PDF to text converter, and pdfinfo tool +# get it from the xpdf 0.90 package at http://www.foolabs.com/xpdf/ +# +$CATPDF = "/usr/bin/pdftotext"; +$PDFINFO = "/usr/bin/pdfinfo"; +#$CATPDF = "/usr/local/bin/pdftotext"; +#$PDFINFO = "/usr/local/bin/pdfinfo"; + +# need some var's +$minimum_word_length = 3; +$head = ""; +@allwords = (); +@temp = (); +$x = 0; +#@fields = (); +$calc = 0; +$dehyphenate = 0; +$title = ""; +# +# okay. my programming style isn't that nice, but it works... + +#for ($x=0; $x<@ARGV; $x++) { # print out the args +# print STDERR "$ARGV[$x]\n"; +#} + +# Read first bytes of file to check for file type (like file(1) does) +open(FILE, "< $ARGV[0]") || die "Oops. Can't open file $ARGV[0]: $!\n"; +read FILE,$magic,8; +close FILE; + +if ($magic =~ /^\0\n/) { # possible MacBinary header + open(FILE, "< $ARGV[0]") || die "Oops. Can't open file $ARGV[0]: $!\n"; + read FILE,$magic,136; # let's hope parsers can handle them! + close FILE; +} + +if ($magic =~ /%!|^\033%-12345/) { # it's PostScript (or HP print job) + $parser = $CATPS; # gs 3.33 leaves _temp_.??? files in . + $parsecmd = "(cd /tmp; $parser; rm -f _temp_.???) < \"$ARGV[0]\" |"; +# keep quiet even if PS gives errors... +# $parsecmd = "(cd /tmp; $parser; rm -f _temp_.???) < \"$ARGV[0]\" 2>/dev/null |"; + $type = "PostScript"; + $dehyphenate = 0; # ps2ascii already does this + if ($magic =~ /^\033%-12345/) { # HP print job + open(FILE, "< $ARGV[0]") || die "Oops. Can't open file $ARGV[0]: $!\n"; + read FILE,$magic,256; + close FILE; + exit unless $magic =~ /^\033%-12345X\@PJL.*\n*.*\n*.*ENTER\s*LANGUAGE\s*=\s*POSTSCRIPT.*\n*.*\n*.*\n%!/ + } +} elsif ($magic =~ /%PDF-/) { # it's PDF (Acrobat) + $parser = $CATPDF; + $parsecmd = "$parser -raw \"$ARGV[0]\" - |"; +# to handle single-column, strangely laid out PDFs, use coalescing feature... +# $parsecmd = "$parser \"$ARGV[0]\" - |"; + $type = "PDF"; + $dehyphenate = 1; # PDFs often have hyphenated lines + if (open(INFO, "$PDFINFO \"$ARGV[0]\" 2>/dev/null |")) { + while (<INFO>) { + if (/^Title:/) { + $title = $_; + $title =~ s/^Title:\s+//; + $title =~ s/\s+$//; + $title =~ s/\s+/ /g; + $title =~ s/&/\&\;/g; + $title =~ s/</\<\;/g; + $title =~ s/>/\>\;/g; + last; + } + } + close INFO; + } +} elsif ($magic =~ /WPC/) { # it's WordPerfect + $parser = $CATWP; + $parsecmd = "$parser \"$ARGV[0]\" |"; + $type = "WordPerfect"; + $dehyphenate = 0; # WP documents not likely hyphenated +} elsif ($magic =~ /^{\\rtf/) { # it's Richtext + $parser = $CATRTF; + $parsecmd = "$parser \"$ARGV[0]\" |"; + $type = "RTF"; + $dehyphenate = 0; # RTF documents not likely hyphenated +} elsif ($magic =~ /\320\317\021\340/) { # it's MS Word + $parser = $CATDOC; + $parsecmd = "$parser -a -w \"$ARGV[0]\" |"; + $type = "Word"; + $dehyphenate = 0; # Word documents not likely hyphenated +} else { + die "Can't determine type of file $ARGV[0]; content-type: $ARGV[1]; URL: $ARGV[2]\n"; +} +# print STDERR "$ARGV[0]: $type $parsecmd\n"; +die "Hmm. $parser is absent or unwilling to execute.\n" unless -x $parser; + + +# open it +open(CAT, "$parsecmd") || die "Hmmm. $parser doesn't want to be opened using pipe.\n"; +while (<CAT>) { + while (/[A-Za-z\300-\377]-\s*$/ && $dehyphenate) { + $_ .= <CAT>; + last if eof; + s/([A-Za-z\300-\377])-\s*\n\s*([A-Za-z\300-\377])/$1$2/s + } + $head .= " " . $_; +# s/\s+[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]+|[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]+\s+|^[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]+|[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]+$/ /g; # replace reading-chars with space (only at end or begin of word, but allow multiple characters) +## s/\s[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]|[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]\s|^[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]|[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]$/ /g; # replace reading-chars with space (only at end or begin of word) +## s/[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]/ /g; # rigorously replace all by <[email protected]> +## s/[\-\255]/ /g; # replace hyphens with space +# s/[\255]/-/g; # replace dashes with hyphens +# @fields = split; # split up line +# next if (@fields == 0); # skip if no fields (does it speed up?) +# for ($x=0; $x<@fields; $x++) { # check each field if string length >= 3 +# if (length($fields[$x]) >= $minimum_word_length) { +# push @allwords, $fields[$x]; # add to list +# } +# } + + # Delete valid punctuation. These are the default values + # for valid_punctuation, and should be changed other values + # are specified in the config file. + tr{-\255._/!#$%^&'}{}d; + push @allwords, grep { length >= $minimum_word_length } split /\W+/; +} + +close CAT; + +exit unless @allwords > 0; # nothing to output + +############################################# +# print out the title, if it's set, and not just a file name +if ($title !~ /^$/ && $title !~ /^[A-G]:[^\s]+\.[Pp][Dd][Ff]$/) { + print "t\t$title\n"; +} else { # otherwise generate a title + @temp = split(/\//, $ARGV[2]); # get the filename, get rid of basename + $temp[-1] =~ s/%([A-F0-9][A-F0-9])/pack("C", hex($1))/gie; + print "t\t$type Document $temp[-1]\n"; # print it +} + + +############################################# +# print out the head +$head =~ s/^\s+//; # remove leading and trailing space +$head =~ s/\s+$//; +$head =~ s/\s+/ /g; +$head =~ s/&/\&\;/g; +$head =~ s/</\<\;/g; +$head =~ s/>/\>\;/g; +print "h\t$head\n"; +#$calc = @allwords; +#print "h\t"; +##if ($calc >100) { # but not more than 100 words +## $calc = 100; +##} +#for ($x=0; $x<$calc; $x++) { # print out the words for the exerpt +# print "$allwords[$x] "; +#} +#print "\n"; + + +############################################# +# now the words +#for ($x=0; $x<@allwords; $x++) { +# $calc=int(1000*$x/@allwords); # calculate rel. position (0-1000) +# print "w\t$allwords[$x]\t$calc\t0\n"; # print out word, rel. pos. and text type (0) +#} +$x = 0; +for ( @allwords ) { + # print out word, rel. pos. and text type (0) + printf "w\t%s\t%d\t0\n", $_, 1000*$x++/@allwords; +} + +$calc=@allwords; +# print STDERR "# of words indexed: $calc\n"; |