#! /usr/bin/perl
###############################################################################
#
# Ferret user interface program
#
# Written by Brian White <bcwhite@verisim.com>
# Copyright (c) 1996 by Verisim, Inc.
#
###############################################################################



# Load the big guns...
use Ferret;



###############################################################################
#
# Program-wide variable declarations (and defaults)
#
$debug=0;
$search;
$action= '';
$imode = 1;
@parms = ();
%opts  = (
	addprefix	=> '',
	doctype		=> '',
	filter      => '',
	index		=> 'ferret.index',
	lines       => '',
	stripprefix	=> '',
	summary		=> '250',
);
%flags = (
	filter		=> 0,
	force		=> 0,
	help		=> 0,
	notitles	=> 0,
	summaries   => 0,
);

$opts{index} = $ENV{FERRET_INDEX} if $ENV{FERRET_INDEX};

if ($debug) {
	use FileHandle;
	autoflush STDOUT 1;
}



###############################################################################
#
# Usage message
#
my $usage = qq"
Use: $0 [--index=<index-file>] <action> [action parameters] [...]

Actions:
     addfile        adds named file(s) to index
     addstoppers    adds to list of stopper (non-indexed) words
     commonwords    reports words that exist in many documents
     query          searches for specified words (put in 'quotes')
     removefile     removes named file(s) from index
     removestoppers removes from list of stopper (non-indexed) words
     setoption      set specified options
     shrink         reduce index size after adds or removes
     unsetoption    unset specified options

Type \"$0 --help <action>\" for more information.

";


###############################################################################
#
# Work ("action") subroutines follow
#
###############################################################################



###############################################################################
#
# Utility Subroutines
#

sub BadFileTypes {
	my($fref) = @_;
	my $temp;

	$temp = pack("N",hex("13579ACE"));
	return "database file" if $$fref =~ m/^\Q$temp\E/;

	$temp = pack("V",hex("13579ACE"));
	return "database file" if $$fref =~ m/^\Q$temp\E/;

	return;
}



###############################################################################
#
# Action:  ADDFILE <files>
#

sub AddFile {
	if ($flags{help} || @parms < 1) {
		print STDERR "\nUse: $0 [...] addfile [--addprefix=<path>] [--stripprefix=<path>]\n";
		print STDERR "     [--notitles] [--summary=<size>] [--lines=<number>] [--doctype=<type>]\n";
		print STDERR "     [--filter='<program & args>'] <file-to-add> [...]\n";
		print STDERR "\nOptions:\n";
		print STDERR "     addprefix    add to each filename after loading but before indexing\n";
		print STDERR "     doctype      process as documents of this type:  HTML, Text, Code\n";
		print STDERR "     filter       run row data through this program (+args) before indexing\n";
		print STDERR "     force        add all files regardless of last modification date\n";
		print STDERR "     lines        limit summary to this number of lines in length\n";
		print STDERR "     notitles     supress storing titles for indexed documents\n";
		print STDERR "     stripprefix  strip from each filename after loading but before indexing\n";
		print STDERR "     summary      store maximum of this number of bytes as a document summary\n";
		die "\n";
	}

	my($filename,$ignored);

	foreach $filename (@parms) {
		my($filter,$fixed,$filtname,$data,$title,$summary,$dtype);
		$summary= "";

		$fixed = $filename;
		$fixed =~s!^$opts{stripprefix}!! if $opts{stripprefix};
		$fixed = $opts{addprefix} . $fixed;

		my($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
		   $atime,$mtime,$ctime,$blksize,$blocks) = stat($filename);
		if ($search->DocumentTimestamp($fixed) > $mtime) {
			if ($flags{force}) {
				print "File '$filename' is unchanged since last added -- forced\n";
			} else {
				print "File '$filename' is unchanged since last added -- ignored\n";
				$ignored = 1;
				next;
			}
		}

		if ($opts{filter}) {
			$filter = $opts{filter};
			$fixed  = $filename;
		} else {
			$filter = Ferret::StdExtFilters($filename,\$filtname);
		}
		eval { $data = Ferret::LoadFile($filename,$filter); };
		if ($@) {
			$@ =~ s/(;|\s)\s*stopped\s.*$//si;
			print $@," -- skipped\n";
			next;
		}
		$dtype = $opts{doctype};
		Ferret::FixCRLF(\$data);

		my $badfile = BadFileTypes(\$data);
		if ($badfile) {
			if ($flags{force}) {
				print "Warning: forced indexing of $badfile '$filename'\n";
			} else {
				print "Warning: not indexing $badfile '$filename'\n";
				next;
			}
		}

		unless ($dtype) {
			$dtype = "HTML"	if ($filtname =~ m/\.html?$/i || $data =~ m/^\s*<html>/si);
			$dtype = "Code" if ($filtname =~ m/\.(h|hh|hpp|h\+\+|c|cc|cpp|c\+\+)$/i || $data =~ m/^\#\!/);
			$dtype = "MIF"	if ($filtname =~ m/\.mif$/i || $data =~ m/^<MIFFile[\s\d\.]*>/);
			$dtype = "Mail"	if (!$dtype && $data =~ m/^from /i);
			$dtype = "Text" unless $dtype;
		}

		print "Adding '$filename' ";
		print "as '$fixed' " if $filename ne $fixed;
		print "($dtype) ...\n";
		eval 'Ferret::Strip' . $dtype . '(\$data,\$title,\$summary,$opts{summary})';
		die "$0: Unknown document type '$dtype'\n" if $@;
		Ferret::LimitLineCount(\$summary,$opts{lines}) if $opts{lines} && $dtype ne "HTML";
		Ferret::MakeHTMLSummary(\$summary, $dtype);
		$search->AddDocument($fixed,$data);
		$search->DBPutTitle("$fixed",$title)		if $title   && !$flags{notitle};
		$search->DBPutSummary("$fixed",$summary)	if $summary &&  $opts{summary};
	}

	if ($ignored) {
		print '(use the "--force" option to force adding of unmodified files)',"\n";
	}
}



###############################################################################
#
# Action:  ADDSTOPPERS <words>
#

sub AddStoppers {
	if ($flags{help} || @parms < 1) {
		print STDERR "\nUse: $0 [...] addstoppers <word> [...]\n";
		die "\n";
	}

	$search->AddStoppers(@parms);
}



###############################################################################
#
# Action:  COMMONWORDS <min> [max]
#

sub CommonWords {
	if ($flags{help} || @parms < 1 || @parms > 2) {
		print STDERR "\nUse: $0 [...] commonwords <min> [max]\n";
		print STDERR "\nBoth <min> and <max> can be either a number between 0 and 1 to indicate a\n";
		print STDERR "a frequency (eg. 0.90 = 90% of all documents) or a whole number greater than\n";
		print STDERR "1 to indicate an exact number of documents.  Running \"commonwords 0.90\"\n";
		print STDERR "will return a list of words in more than 90% of the documents.  This is useful\n";
		print STDERR "for determining words to be added to the \"stopper\" list.  (see: \"addstoppers\"\n";
		print STDERR "and \"removestoppers\")  Running \"commonwords 0.00 0.10\" will list all the words\n";
		print STDERR "that are in less than 10% of all documents.\n";
		die "\n";
	}

	my $words;

	if (@parms == 1) {
		$words = $search->CommonWords($parms[0]);
	} else {
		$words = $search->CommonWords($parms[0],$parms[1]);
	}

	my @words = sort(split(/\n/,$words));

	print "common words: @words\n";
}



###############################################################################
#
# Action:  QUERY 'query-string'
#

sub Query {
	if ($flags{help} || @parms != 1) {
		print STDERR "\nUse: $0 [...] query [--summaries] 'query-string'\n";
		print STDERR "\nOptions:\n";
		print STDERR "     summaries    display summaries for all matches found\n";
		print STDERR "\nThe query string should be enclosed in single quotes so that double quotes\n";
		print STDERR "can be passed as part of the query.  Any single quotes in the query (for\n";
		print STDERR "apostrophes in contractions) will have to be escaped with a backslash.\n";
		die "\n";
	}

	my @results = $search->Query($parms[0]);
	die "$@\n" if $@;

	print "Score Match\n~~~~~ ~~~~~\n";
	foreach (@results) {
		my($score,$match) = (m/^(\d+) (.*)$/);
		printf " %3d  %s\n",$score,$match;

		if ($flags{summaries}) {
			my $summary = $search->DBGetSummary($match);

			if ($summary) {
				$summary =~ s/(^|\n)/$1      /g;
				print $summary,"\n\n";
			}
		}
	}
}



###############################################################################
#
# Action:  REMOVEFILE <files>
#

sub RemoveFile {
	if ($flags{help} || @parms < 1) {
		print STDERR "\nUse: $0 [...] removefile [--addprefix=<path>] [--stripprefix=<path>]\n";
		print STDERR "     <file-to-remove> [...]\n";
		print STDERR "\nOptions:\n";
		print STDERR "     addprefix    add to each filename after loading but before indexing\n";
		print STDERR "     stripprefix  strip from each filename after loading but before indexing\n";
		die "\n";
	}

	my($filename);

	foreach $filename (@parms) {
		my($filter,$filtfile,$fixed,$removed);
		$filter = Ferret::StdExtFilters($filename,\$filtfile);
		$fixed	= $filename;
		$fixed  =~s!^$opts{stripprefix}!! if $opts{stripprefix};
		$fixed  = $opts{addprefix} . $fixed;

		print "Removing '$filename' ";
		print "as '$fixed' " if $filename ne $fixed;
		print "...\n";
		$removed = $search->RemoveDocument($fixed);
		print "Warning: document '$fixed' was not in index\n" unless $removed;
		$search->DBDelSummary("$fixed");
	}
}



###############################################################################
#
# Action:  REMOVESTOPPERS <words>
#

sub RemoveStoppers {
	if ($flags{help} || @parms < 1) {
		print STDERR "\nUse: $0 [...] removestoppers <words> [...]\n";
		die "\n";
	}

	$search->RemoveStoppers(@parms);
}



###############################################################################
#
# Action:  SETOPTION <options>
#

sub SetOption {
	if ($flags{help} || @parms < 1) {
		print STDERR "\nUse: $0 [...] setoption <option> [...]\n";
		print STDERR "\nOptions:\n";
		print STDERR "     tiny         make index as small as possible (loses proximity searching)\n";
		print STDERR "     nostoppers   don't remove any stopper (non-content) words\n";
		die "\n";
	}

	foreach (@parms) {
		my $option=0;

		/^tiny$/i		and $option=&Ferret::OPT_TINY;
		/^nostoppers$/i	and $option=&Ferret::OPT_NOSTOPPERS;

		print "Setting option '$_' ...\n";
		$search->SetOption($option);
	}
}



###############################################################################
#
# Action:  SHRINK
#

sub Shrink {
	if ($flags{help} || @parms != 0) {
		print STDERR "\nUse: $0 [...] shrink\n";
		die "\n";
	}

	print "Shrinking index...\n";
	$search->Shrink();
}



###############################################################################
#
# Action:  UNSETOPTION <options>
#
sub UnsetOption {
	if ($flags{help} || @parms < 1) {
		print STDERR "\nUse: $0 [...] unsetoption <option> [...]\n";
		print STDERR "\nOptions:\n";
		print STDERR "     tiny         make index as small as possible (cannot be unset)\n";
		print STDERR "     nostoppers   don't remove any stopper (non-content) words\n";
		die "\n";
	}

	foreach (@parms) {
		my $option=0;

		/^tiny$/i		and $option=&Ferret::OPT_TINY;
		/^nostoppers$/i	and $option=&Ferret::OPT_NOSTOPPERS;

		print "Unsetting option '$_' ...\n";
		$search->UnsetOption($option);
	}
}



###############################################################################
#
# Main program follows
#
###############################################################################


###############################################################################
#
# Parse command line arguments
#
die $usage unless @ARGV > 0;

foreach (@ARGV) {
	if (/^--(.*?)=(.*)$/) {
		if (defined $opts{$1}) {
			$opts{$1} = $2;
		} else {
			die "$0: Unknown option '--$1'\n" . $usage;
		}
	} elsif (/^--(.*?)$/) {
		if (defined $flags{$1}) {
			$flags{$1} = 1;
		} else {
			die "$0: Unknown flag '--$1'\n" . $usage;
		}
	} elsif ($action) {
		push @parms,$_;
	} else {
		$action = $_;
		if (/^addfile$/)		{ 			next; }
		if (/^addstoppers$/)	{ 			next; }
		if (/^commonwords$/)	{ $imode=0;	next; }
		if (/^query$/)			{ $imode=0;	next; }
		if (/^removefile$/)		{ 			next; }
		if (/^removestoppers$/)	{ 			next; }
		if (/^setoption$/)		{ 			next; }
		if (/^shrink$/)			{ 			next; }
		if (/^unsetoption$/)	{ 			next; }

		die "$0: Unknown action '$_'\n" . $usage;
	}
}

die $usage unless $action;



###############################################################################
#
# Create a Ferret and open the specified index (only if not in "help" mode)
#
unless ($flags{help}) {
	$search = new Ferret;
	if ($imode) {
		$search->Update($opts{index});
	} else {
		$search->Open($opts{index});
	}
}



###############################################################################
#
# Call the appropriate routine for the requested action
#
AddFile()		if ($action eq "addfile");
AddStoppers()	if ($action eq "addstoppers");
CommonWords()	if ($action eq "commonwords");
Query()			if ($action eq "query");
RemoveFile()	if ($action eq "removefile");
RemoveStoppers()if ($action eq "removestoppers");
SetOption()		if ($action eq "setoption");
Shrink()		if ($action eq "shrink");
UnsetOption()	if ($action eq "unsetoption");



###############################################################################
#
# Put things away when done...
#
print "\nWriting index...\n" if $imode;
$search->Close();



