diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/contrib/conv_doc.pl')
-rwxr-xr-x | debian/htdig/htdig-3.2.0b6/contrib/conv_doc.pl | 214 |
1 files changed, 214 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/contrib/conv_doc.pl b/debian/htdig/htdig-3.2.0b6/contrib/conv_doc.pl new file mode 100755 index 00000000..78d8a985 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/conv_doc.pl @@ -0,0 +1,214 @@ +#!/usr/local/bin/perl + +# +# Sample external converter for htdig 3.1.4 or later. +# Usage: (in htdig.conf) +# +# external_parsers: application/msword->text/html /usr/local/bin/conv_doc.pl \ +# application/postscript->text/html /usr/local/bin/conv_doc.pl \ +# application/pdf->text/html /usr/local/bin/conv_doc.pl +# +# Written by Gilles Detillieux <[email protected]>. +# Based in part on the parse_word_doc.pl script, written by +# Jesse op den Brouw <[email protected]> but heavily revised. +# +# 1998/12/11 +# Added: catdoc test (is catdoc runnable?) <[email protected]> +# 1999/02/09 +# Added: uses ps2ascii to handle PS files <[email protected]> +# 1999/02/15 +# Added: check for some file formats <[email protected]> +# 1999/02/25 +# Added: uses pdftotext to handle PDF files <[email protected]> +# 1999/03/01 +# Added: extra checks for file "wrappers" <[email protected]> +# & check for MS Word signature (no longer defaults to catdoc) +# 1999/03/05 +# Changed: rejoin hyphenated words across lines <[email protected]> +# (in PDFs) +# 1999/08/12 +# Changed: adapted for xpdf 0.90 release <[email protected]> +# Added: uses pdfinfo to handle PDF titles <[email protected]> +# Changed: change dashes to hyphens <[email protected]> +# 1999/09/09 +# Changed: fix to handle empty PDF title right <[email protected]> +# 1999/12/01 +# Changed: rewritten as external converter <[email protected]> +# stripped out all parser-related code +# Added: test to silently ignore wrapped EPS files < " > +# Added: test for null device on Win32 env. <[email protected]> +# 2000/01/12 +# Changed: "break" to "last" (no break in Perl) <[email protected]> +# 2001/07/12 +# Changed: fix "last" handling in dehyphenation <[email protected]> +# Added: handle %xx codes in title from URL <[email protected]> +######################################### +# +# set this to your MS Word to text converter +# get it from: http://www.fe.msk.ru/~vitus/catdoc/ +# +$CATDOC = "/usr/local/bin/catdoc"; +# +# set this to your WordPerfect to text converter, or /bin/true if none available +# this nabs WP documents with .doc suffix, so catdoc doesn't see them +# +$CATWP = "/bin/true"; +# +# set this to your RTF to text converter, or /bin/true if none available +# this nabs RTF documents with .doc suffix, so catdoc doesn't see them +# +$CATRTF = "/bin/true"; +# +# set this to your PostScript to text converter +# get it from the ghostscript 3.33 (or later) package +# +$CATPS = "/usr/bin/ps2ascii"; +# +# set this to your PDF to text converter, and pdfinfo tool +# get it from the xpdf 0.90 package at http://www.foolabs.com/xpdf/ +# +$CATPDF = "/usr/bin/pdftotext"; +$PDFINFO = "/usr/bin/pdfinfo"; +#$CATPDF = "/usr/local/bin/pdftotext"; +#$PDFINFO = "/usr/local/bin/pdfinfo"; + +######################################### +# +# need some var's +$dehyphenate = 0; # set if we must dehyphenate text output +$ishtml = 0; # set if converter produces HTML +$null = ""; +$magic = ""; +$type = ""; +$cvtr = ""; +$cvtcmd = ""; +$title = ""; +@parts = (); + +# make portable to win32 platform or unix +$null = "/dev/null"; +if ($^O eq "MSWin32") {$null = "nul";} + + +######################################### +# +# Read first bytes of file to check for file type (like file(1) does) +open(FILE, "< $ARGV[0]") || die "Can't open file $ARGV[0]: $!\n"; +read FILE,$magic,8; +close FILE; + +if ($magic =~ /^\0\n/) { # possible MacBinary header + open(FILE, "< $ARGV[0]") || die "Can't open file $ARGV[0]: $!\n"; + read FILE,$magic,136; # let's hope converters can handle them! + close FILE; +} + +if ($magic =~ /%!|^\033%-12345/) { # it's PostScript (or HP print job) + $cvtr = $CATPS; # gs 3.33 leaves _temp_.??? files in . +# keep quiet even if PS gives errors... + $cvtcmd = "(cd /tmp; $cvtr; rm -f _temp_.???) < $ARGV[0] 2>$null"; +# allow PS interpreter to give error messages... +# $cvtcmd = "(cd /tmp; $cvtr; rm -f _temp_.???) < $ARGV[0]"; + $type = "PostScript"; + $dehyphenate = 0; # ps2ascii already does this + if ($magic =~ /^\033%-12345/) { # HP print job + open(FILE, "< $ARGV[0]") || die "Can't open file $ARGV[0]: $!\n"; + read FILE,$magic,256; + close FILE; + exit unless $magic =~ /^\033%-12345X\@PJL.*\n*.*\n*.*ENTER\s*LANGUAGE\s*=\s*POSTSCRIPT.*\n*.*\n*.*\n%!/ + } +} elsif ($magic =~ /\305\320\323\306\036/) { # it's a wrapped EPS - ignore + exit +} elsif ($magic =~ /%PDF-/) { # it's PDF (Acrobat) + $cvtr = $CATPDF; + $cvtcmd = "$cvtr -raw $ARGV[0] -"; +# to handle single-column, strangely laid out PDFs, use coalescing feature... +# $cvtcmd = "$cvtr $ARGV[0] -"; + $type = "PDF"; + $dehyphenate = 1; # PDFs often have hyphenated lines + if (open(INFO, "$PDFINFO $ARGV[0] 2>$null |")) { + while (<INFO>) { + if (/^Title:/) { + s/^Title:\s+//; + s/\s+$//; + s/\s+/ /g; + s/&/\&\;/g; + s/</\<\;/g; + s/>/\>\;/g; + $title = $_; + last; + } + } + close INFO; + } +# to use coalescing feature conditionally... +# if ($title =~ /...Title of Corel DRAW output.../) { +# $cvtcmd = "$cvtr $ARGV[0] -"; +# } +} elsif ($magic =~ /WPC/) { # it's WordPerfect + $cvtr = $CATWP; + $cvtcmd = "$cvtr $ARGV[0]"; + $type = "WordPerfect"; + $dehyphenate = 0; # WP documents not likely hyphenated +} elsif ($magic =~ /^{\\rtf/) { # it's Richtext + $cvtr = $CATRTF; + $cvtcmd = "$cvtr $ARGV[0]"; + $type = "RTF"; + $dehyphenate = 0; # RTF documents not likely hyphenated +} elsif ($magic =~ /\320\317\021\340/) { # it's MS Word + $cvtr = $CATDOC; + $cvtcmd = "$cvtr -a -w $ARGV[0]"; + $type = "Word"; + $dehyphenate = 0; # Word documents not likely hyphenated +} else { + die "Can't determine type of file $ARGV[0]; content-type: $ARGV[1]; URL: $ARGV[2]\n"; +} + +die "$cvtr is absent or unwilling to execute.\n" unless -x $cvtr; + +############################################# +# +# Start output. + +# if running as a converter for "user-defined" output type... +#print "Content-Type: text/html\n\n"; + +if ($ishtml) { + # converter will give its own HTML output + system("$cvtcmd") || die "$cvtr doesn't want to be run from shell.\n"; + exit; +} + +# Produce HTML output from converter's text output, so we can add title. +print "<HTML>\n<head>\n"; + +# print out the title, if it's set, and not just a file name, or make one up +if ($title eq "" || $title =~ /^[A-G]:[^\s]+\.[Pp][Dd][Ff]$/) { + @parts = split(/\//, $ARGV[2]); # get the file basename + $parts[-1] =~ s/%([A-F0-9][A-F0-9])/pack("C", hex($1))/gie; + $title = "$type Document $parts[-1]"; # use it in title +} +print "<title>$title</title>\n"; + +print "</head>\n<body>\n"; + +# Open file via selected converter, output its text. +open(CAT, "$cvtcmd |") || die "$cvtr doesn't want to be opened using pipe.\n"; +while (<CAT>) { + while (/[A-Za-z\300-\377]-\s*$/ && $dehyphenate) { + $_ .= <CAT>; + last if eof; + s/([A-Za-z\300-\377])-\s*\n\s*([A-Za-z\300-\377])/$1$2/s + } + s/[\255]/-/g; # replace dashes with hyphens + s/\f/\n/g; # replace form feed + s/&/\&\;/g; # HTMLify text + s/</\<\;/g; + s/>/\>\;/g; + print; +} + +print "</body>\n</HTML>\n"; + +close CAT; + |