summary refs log tree commit diff stats
path: root/debian/htdig/htdig-3.2.0b6/contrib/conv_doc.pl
diff options
context:
space:
mode:
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/contrib/conv_doc.pl')
-rwxr-xr-x debian/htdig/htdig-3.2.0b6/contrib/conv_doc.pl 214
1 files changed, 214 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/contrib/conv_doc.pl b/debian/htdig/htdig-3.2.0b6/contrib/conv_doc.pl
new file mode 100755
index 00000000..78d8a985
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/contrib/conv_doc.pl
@@ -0,0 +1,214 @@
+#!/usr/local/bin/perl
+
+#
+# Sample external converter for htdig 3.1.4 or later.
+# Usage: (in htdig.conf)
+#
+# external_parsers: application/msword->text/html /usr/local/bin/conv_doc.pl \
+# application/postscript->text/html /usr/local/bin/conv_doc.pl \
+# application/pdf->text/html /usr/local/bin/conv_doc.pl
+#
+# Written by Gilles Detillieux <[email protected]>.
+# Based in part on the parse_word_doc.pl script, written by
+# Jesse op den Brouw <[email protected]> but heavily revised.
+#
+# 1998/12/11
+# Added: catdoc test (is catdoc runnable?) <[email protected]>
+# 1999/02/09
+# Added: uses ps2ascii to handle PS files <[email protected]>
+# 1999/02/15
+# Added: check for some file formats <[email protected]>
+# 1999/02/25
+# Added: uses pdftotext to handle PDF files <[email protected]>
+# 1999/03/01
+# Added: extra checks for file "wrappers" <[email protected]>
+# & check for MS Word signature (no longer defaults to catdoc)
+# 1999/03/05
+# Changed: rejoin hyphenated words across lines <[email protected]>
+# (in PDFs)
+# 1999/08/12
+# Changed: adapted for xpdf 0.90 release <[email protected]>
+# Added: uses pdfinfo to handle PDF titles <[email protected]>
+# Changed: change dashes to hyphens <[email protected]>
+# 1999/09/09
+# Changed: fix to handle empty PDF title right <[email protected]>
+# 1999/12/01
+# Changed: rewritten as external converter <[email protected]>
+# stripped out all parser-related code
+# Added: test to silently ignore wrapped EPS files < " >
+# Added: test for null device on Win32 env. <[email protected]>
+# 2000/01/12
+# Changed: "break" to "last" (no break in Perl) <[email protected]>
+# 2001/07/12
+# Changed: fix "last" handling in dehyphenation <[email protected]>
+# Added: handle %xx codes in title from URL <[email protected]>
+#########################################
+#
+# set this to your MS Word to text converter
+# get it from: http://www.fe.msk.ru/~vitus/catdoc/
+#
+$CATDOC = "/usr/local/bin/catdoc";
+#
+# set this to your WordPerfect to text converter, or /bin/true if none available
+# this nabs WP documents with .doc suffix, so catdoc doesn't see them
+#
+$CATWP = "/bin/true";
+#
+# set this to your RTF to text converter, or /bin/true if none available
+# this nabs RTF documents with .doc suffix, so catdoc doesn't see them
+#
+$CATRTF = "/bin/true";
+#
+# set this to your PostScript to text converter
+# get it from the ghostscript 3.33 (or later) package
+#
+$CATPS = "/usr/bin/ps2ascii";
+#
+# set this to your PDF to text converter, and pdfinfo tool
+# get it from the xpdf 0.90 package at http://www.foolabs.com/xpdf/
+#
+$CATPDF = "/usr/bin/pdftotext";
+$PDFINFO = "/usr/bin/pdfinfo";
+#$CATPDF = "/usr/local/bin/pdftotext";
+#$PDFINFO = "/usr/local/bin/pdfinfo";
+
+#########################################
+#
+# Working variables shared by the rest of the script.
+$dehyphenate = 0; # set if we must dehyphenate text output
+$ishtml = 0; # set if converter produces HTML
+$null = ""; # null device path, used to discard converter stderr
+$magic = ""; # first bytes of the input file, used to identify its format
+$type = ""; # format name, used when making up a title
+$cvtr = ""; # path of the converter program chosen for this file
+$cvtcmd = ""; # complete shell command that runs the converter
+$title = ""; # document title for the generated HTML
+@parts = (); # URL split on "/", to get the file basename
+
+# make portable to win32 platform or unix
+$null = "/dev/null"; # Unix default
+if ($^O eq "MSWin32") {$null = "nul";} # Windows spells the null device "nul"
+
+
+#########################################
+#
+# Read first bytes of file to check for file type (like file(1) does)
+open(FILE, "< $ARGV[0]") || die "Can't open file $ARGV[0]: $!\n";
+binmode FILE; # compare raw bytes: no CRLF or ^Z translation on Win32
+read FILE,$magic,8;
+
+if ($magic =~ /^\0\n/) { # possible MacBinary header
+ seek(FILE, 0, 0); # reread from the start, taking a bigger sample
+ read FILE,$magic,136; # let's hope converters can handle them!
+}
+close FILE;
+
+if ($magic =~ /%!|^\033%-12345/) { # it's PostScript (or HP print job)
+ $cvtr = $CATPS; # gs 3.33 leaves _temp_.??? files in .
+# keep quiet even if PS gives errors...
+ $cvtcmd = "(cd /tmp; $cvtr; rm -f _temp_.???) < $ARGV[0] 2>$null";
+# allow PS interpreter to give error messages...
+# $cvtcmd = "(cd /tmp; $cvtr; rm -f _temp_.???) < $ARGV[0]";
+ $type = "PostScript";
+ $dehyphenate = 0; # ps2ascii already does this
+ if ($magic =~ /^\033%-12345/) { # HP print job
+ open(FILE, "< $ARGV[0]") || die "Can't open file $ARGV[0]: $!\n";
+ read FILE,$magic,256; # bigger sample, to look for the language switch
+ close FILE;
+ exit unless $magic =~ /^\033%-12345X\@PJL.*\n*.*\n*.*ENTER\s*LANGUAGE\s*=\s*POSTSCRIPT.*\n*.*\n*.*\n%!/
+ }
+} elsif ($magic =~ /\305\320\323\306\036/) { # it's a wrapped EPS - ignore
+ exit
+} elsif ($magic =~ /%PDF-/) { # it's PDF (Acrobat)
+ $cvtr = $CATPDF;
+ $cvtcmd = "$cvtr -raw $ARGV[0] -";
+# to handle single-column, strangely laid out PDFs, use coalescing feature...
+# $cvtcmd = "$cvtr $ARGV[0] -";
+ $type = "PDF";
+ $dehyphenate = 1; # PDFs often have hyphenated lines
+ if (open(INFO, "$PDFINFO $ARGV[0] 2>$null |")) {
+ while (<INFO>) {
+ if (/^Title:/) {
+ s/^Title:\s+//;
+ s/\s+$//;
+ s/\s+/ /g;
+ s/&/\&amp\;/g; # title goes into HTML, so escape markup characters
+ s/</\&lt\;/g;
+ s/>/\&gt\;/g;
+ $title = $_;
+ last;
+ }
+ }
+ close INFO;
+ }
+# to use coalescing feature conditionally...
+# if ($title =~ /...Title of Corel DRAW output.../) {
+# $cvtcmd = "$cvtr $ARGV[0] -";
+# }
+} elsif ($magic =~ /WPC/) { # it's WordPerfect
+ $cvtr = $CATWP;
+ $cvtcmd = "$cvtr $ARGV[0]";
+ $type = "WordPerfect";
+ $dehyphenate = 0; # WP documents not likely hyphenated
+} elsif ($magic =~ /^\{\\rtf/) { # it's Richtext; newer perls restrict a bare {
+ $cvtr = $CATRTF;
+ $cvtcmd = "$cvtr $ARGV[0]";
+ $type = "RTF";
+ $dehyphenate = 0; # RTF documents not likely hyphenated
+} elsif ($magic =~ /\320\317\021\340/) { # it's MS Word
+ $cvtr = $CATDOC;
+ $cvtcmd = "$cvtr -a -w $ARGV[0]";
+ $type = "Word";
+ $dehyphenate = 0; # Word documents not likely hyphenated
+} else {
+ die "Can't determine type of file $ARGV[0]; content-type: $ARGV[1]; URL: $ARGV[2]\n";
+}
+
+die "$cvtr is absent or unwilling to execute.\n" unless -x $cvtr;
+
+#############################################
+#
+# Start output.
+
+# if running as a converter for "user-defined" output type...
+#print "Content-Type: text/html\n\n";
+
+if ($ishtml) {
+ # converter will give its own HTML output; system() returns 0 on success
+ system("$cvtcmd") == 0 || die "$cvtr doesn't want to be run from shell.\n";
+ exit;
+}
+
+# Produce HTML output from converter's text output, so we can add title.
+print "<HTML>\n<head>\n";
+# print out the title, if it's set, and not just a file name, or make one up
+if ($title eq "" || $title =~ /^[A-G]:[^\s]+\.[Pp][Dd][Ff]$/) {
+ @parts = split(/\//, $ARGV[2]); # get the file basename
+ $parts[-1] =~ s/%([A-F0-9][A-F0-9])/pack("C", hex($1))/gie;
+ $title = "$type Document $parts[-1]"; # use it in title
+ # the decoded name may hold markup characters (e.g. from %3C): HTMLify it
+ $title =~ s/&/\&amp\;/g; $title =~ s/</\&lt\;/g; $title =~ s/>/\&gt\;/g;
+}
+print "<title>$title</title>\n";
+print "</head>\n<body>\n";
+
+# Open file via selected converter, output its text.
+open(CAT, "$cvtcmd |") || die "$cvtr doesn't want to be opened using pipe.\n";
+while (<CAT>) {
+ while (/[A-Za-z\300-\377]-\s*$/ && $dehyphenate) {
+ $_ .= <CAT>; # line ends in a hyphenated word: pull in the next line
+ s/([A-Za-z\300-\377])-\s*\n\s*([A-Za-z\300-\377])/$1$2/s; # rejoin the word
+ last if eof; # no more input: stop, but only after joining the final line
+ }
+ s/[\255]/-/g; # replace dashes with hyphens
+ s/\f/\n/g; # replace form feed
+ s/&/\&amp\;/g; # HTMLify text
+ s/</\&lt\;/g;
+ s/>/\&gt\;/g;
+ print;
+}
+
+print "</body>\n</HTML>\n";
+
+close CAT;
+