diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/contrib/wordfreq/wordfreq.pl')
-rwxr-xr-x | debian/htdig/htdig-3.2.0b6/contrib/wordfreq/wordfreq.pl | 54 |
1 files changed, 54 insertions, 0 deletions
#!/usr/local/bin/perl

use strict;
use warnings;

use GDBM_File;
use BerkeleyDB;

##
## wordfreq.pl
## (C) 1995 Andrew Scherpbier <[email protected]>
##
## Will generate a list of words and how frequently they are used
##
## updated to deal with Berkeley db files 1998 Iosif Fettich <[email protected]>
##
## Usage:
##   wordfreq.pl filename          read a Berkeley DB (Btree) word list
##   wordfreq.pl filename g[dbm]   read a GDBM word list
##
## A file name ending in "gdbm" also selects GDBM.  For every key in the
## database one line "<total>\t<key>" is written to STDOUT, where <total>
## is the sum of the count fields of all records stored under that key.
## Usage errors are reported on STDERR with exit status 1.
##

# Size in bytes of one record inside a database value.  Each record is read
# as five native ints (count, id, weight, anchor, location), which assumes a
# 4-byte native int on the platform that wrote the database.
my $RECORD_SIZE = 20;

my ($filename, $type_arg) = @ARGV;

# A file name is mandatory; the optional second argument must contain a "g".
if (not defined $filename or (defined $type_arg and $type_arg !~ /g/i)) {
    print STDERR "\n\nThis program is used in conjunction with ht://Dig \n";
    print STDERR "to determine the frequency of words in a database containing word references.\n\n";
    print STDERR "Usage: $0 filename (to use a Berkeley db2 wordlist)\n";
    print STDERR " $0 filename g[dbm] (to use a GDBM wordlist)\n\n\n";
    exit 1;
}

# Berkeley DB is the default; GDBM is chosen by file name suffix or by the
# second argument.  The defined() guard avoids matching against undef.
my $filetype = 'DB';
if ($filename =~ /gdbm$/i or (defined $type_arg and $type_arg =~ /g/i)) {
    $filetype = 'GDBM';
}

my %worddb;
if ($filetype eq 'GDBM') {
    tie(%worddb, 'GDBM_File', $filename, GDBM_READER, 0)
        or die "Unable to open $filename $!";
}
else {
    tie(%worddb, 'BerkeleyDB::Btree',
        -Filename => $filename,
        -Flags    => DB_RDONLY)
        or die "Cannot open file $filename: $! $BerkeleyDB::Error\n";
}

while (my ($key, $value) = each %worddb) {
    my $total = sum_counts($value);
    print "$total\t$key\n";
}

untie %worddb;

# Return the sum of the first int (the count field) of every complete
# $RECORD_SIZE-byte record packed into $value.  A trailing partial record,
# or a value shorter than one record, contributes nothing.
sub sum_counts {
    my ($value) = @_;

    # int() keeps the loop bound a whole number of records.
    my $records = int(length($value) / $RECORD_SIZE);

    my $total = 0;
    for my $i (0 .. $records - 1) {
        # Only the leading count field is needed; the rest is ignored.
        my ($count) = unpack 'i', substr($value, $i * $RECORD_SIZE, $RECORD_SIZE);
        $total += $count;
    }
    return $total;
}