#! /usr/bin/perl
###############################################################################
#
# WebScan:  A utility to traverse a web site and index all the pages found.
#
# Written by Brian C. White <bcwhite@verisim.com>
# Copyright (c) 1996 by Verisim, Inc.
#
###############################################################################


# I've got the power!
use Ferret;


# Other useful libraries
use FileHandle;


# Don't even look at these extensions!
# (this should be handled by the "Accept" flag in the HTTP options, but most
# servers don't seem to pay any attention to that.  <sigh>)
$badext='jpg|jpeg|gif|png|bmp|xbm|au|wav|mpg|mpeg|qt|mov|dl|gl|fli|zip|gz|Z|exe|com|map|tar|tgz';


# Program-wide variable declarations.  (The bare "@urllist;" style statements
# here were void-context no-ops; assign explicitly so intent is clear.)
$count    = 0;			# Counter for periodic flushing
@urllist  = ();			# List of urls yet to process
%urldone  = ();			# List of processed urls (to avoid looping)
%disallow = ();			# Disallowed URLs (from "robots.txt")
STDOUT->autoflush(1);		# Show progress output immediately



# Get and parse the "robots.txt" file for a site
# Fetch and parse a site's "robots.txt", recording the disallowed path
# prefixes for $site in %disallow as a regex alternation (each prefix is
# quotemeta'd).  An empty pattern means everything is allowed.  Rules are
# honored only from "User-agent" sections that apply to this crawler.
sub GetRobotsTXT {
	my($site) = @_;

	# Mark the site as checked up front (even if the fetch fails below)
	# so CleanURL only triggers one fetch per site.
	$disallow{$site} = "";
	my $data = eval { Ferret::LoadHTTP("http:$site/robots.txt"); };
	if ($@ || !$data) {
		return;
	}

	my $apply = 0;			# true while inside a section that applies to us
	foreach (split(/\n/,$data)) {
		s/^\s+|\s+$//gs;	# trim surrounding whitespace (including CR)

		if (m/^user-agent:\s*(.*?)$/i) {
			$apply = ($1 eq '*' || $1 =~ m/ferret/i || $uname =~ m/\Q$1\E/i);
			next;
		}
		next unless $apply;

		if (m/^disallow:\s*(\S*)/i) {
			# An empty "Disallow:" means "allow everything" per the robots
			# exclusion protocol.  It must be skipped: appending it would
			# build a pattern like "\/foo|" that matches EVERY path and
			# would wrongly block the entire site.
			next unless length $1;
			$disallow{$site} .= "|" if $disallow{$site};
			$disallow{$site} .= "\Q$1\E";
		}
	}
}



# A subroutine to validate URLs
# A subroutine to validate URLs.
# Resolves $to relative to $from (via Ferret::ResolveURL) and returns the
# cleaned absolute URL if it should be crawled, or nothing (empty list) if
# it is rejected: non-http scheme, off-site host, filtered out by the
# --include/--exclude patterns, or disallowed by the site's robots.txt.
sub CleanURL {
	my($from,$to) = @_;

	$to = Ferret::ResolveURL($from,$to);

	# Remove CGI arguments & tags
	$to =~ s/[?\#].*$//;

	# Split both URLs into scheme, //host, :port, and path components.
	# Any component may be missing (captures come back undef).
	my($ftype,$fhost,$fport,$ffile, $ttype,$thost,$tport,$tfile);
	($ftype,$fhost,$fport,$ffile) = ($from =~ m|^(\w+:)?(//[^/:]*)?(:\d+)?(.*)$|i);
	($ttype,$thost,$tport,$tfile) = ($to   =~ m|^(\w+:)?(//[^/:]*)?(:\d+)?(.*)$|i);

#	print "type=$ftype, host=$fhost, port=$fport, file=$ffile -- ";
#	print "type=$ttype, host=$thost, port=$tport, file=$tfile\n";

	# Site key used for the per-site robots.txt cache: host plus any
	# explicit port (e.g. "//example.com:8080").
	my $tsite = $thost;
	$tsite .= ":$tport" if $tport;

	# Only follow http: (or scheme-relative) links
	return unless $ttype =~ m/^(|http:)$/i;
	# Stay on the originating host unless --machines allows the target
	return unless $fhost eq $thost || ($mpatt && $thost =~ m!^$mpatt$!i);
	# Apply the --include / --exclude filename filters
	return if $nclud && $tfile !~ m!$nclud!i;
	return if $xclud && $tfile =~ m!$xclud!i;

	# Fetch robots.txt at most once per site, then honor its rules
	GetRobotsTXT($tsite) unless exists $disallow{$tsite};
	return if $disallow{$tsite} && $tfile =~ m!^($disallow{$tsite})!;

#	print "return: $to\n";
	return $to;
}



###############################################################################
#
# Begin main program
#
###############################################################################



# Parse command line arguments
# Parse command line arguments
my $usage = qq"
Use: $0 [--index=<index-file>] [--include=<allowed-filename-pattern>]
     [--exclude=<disallowed-filename-pattern>] [--delay=<delay-time>]
     [--machines=<allowed-machine-pattern>] [--summary=<max-summary-bytes>]
     [--lines=<max-summary-lines>] [--refresh=<url-pattern>] [--shrink]
     [--name=<user's-name>] [--email=<contact-email-address>] [--random]
     [--timeout=<network timeout>] [--flush=<docs-between-flushes>]
     <new-url> [...]

";

# Option defaults (package globals; some are also read by the subs above)
$index = 'ferret.index';	# --index:    index file to create/update
$mpatt = '';			# --machines: pattern of additional allowed hosts
$xclud = '';			# --exclude:  pattern of filenames to skip
$nclud = '';			# --include:  pattern filenames must match
$ssize = '';			# --summary:  max summary size in bytes
$lines = '';			# --lines:    max summary size in lines
$sleep = 2;			# --delay:    seconds to pause between fetches
$flush = 100;			# --flush:    documents between index flushes
$fresh = '';			# --refresh:  re-crawl indexed urls matching this
$shrnk = 0;			# --shrink:   compact the index on close
$uname = '';			# --name:     crawler operator's name
$email = '';			# --email:    contact address sent with requests
$rndom = 0;			# --random:   shuffle the url queue at each flush
$tmout = 0;			# --timeout:  network timeout in seconds


die $usage unless @ARGV > 0;

# Anything that isn't a recognized --option is treated as a starting URL.
foreach (@ARGV) {
	if (/^--index=(.+)/)	{ $index = $1;	next; }
	if (/^--machines=(.+)/)	{ $mpatt = $1;	next; }
	if (/^--avoid=(.+)/)	{ $xclud = $1;	print '("--avoid" has been deprecated.  Please use "--exclude")',"\n"; next; }
	if (/^--exclude=(.+)/)	{ $xclud = $1;	next; }
	if (/^--include=(.+)/)	{ $nclud = $1;	next; }
	if (/^--summary=(.+)/)	{ $ssize = $1;	next; }
	if (/^--lines=(.+)/)	{ $lines = $1;	next; }
	if (/^--delay=(.+)/)	{ $sleep = $1;	next; }
	if (/^--refresh=(.+)/)	{ $fresh = $1;	next; }
	if (/^--shrink$/)		{ $shrnk =  1;	next; }
	if (/^--name=(.+)/)		{ $uname = $1;	next; }
	if (/^--email=(.+)/)	{ $email = $1;	next; }
	if (/^--flush=(.+)/)	{ $flush = $1;	next; }
	if (/^--random$/)		{ $rndom =  1;	next; }
	if (/^--timeout=(.+)/)	{ $tmout = $1;	next; }

	if (/^--/)				{ print STDERR "Unknown option '$_'\n"; die $usage; }

	push @urllist,$_;
}
# Escape '!' in the user-supplied patterns; they are interpolated into
# m!...! matches later (defensive quoting of the delimiter character)
$xclud =~ s/!/\\!/g;
$nclud =~ s/!/\\!/g;
Ferret::SetNetworkTimeout($tmout);
if ($uname) {
	# Build the full User-agent string sent with every request
	$uname = "Ferret/$Ferret::VERSION ($uname)";
}


# Load the big guns...  (direct method-call syntax; the indirect-object
# form "new Ferret" is deprecated and can misparse)
$search = Ferret->new;
$search->Update($index);


# A friendly reminder
unless ($uname && $email) {
	print "Note: It is considered good netiquette when crawling the web to include both\n";
	print "your name and an email address at which you can be contacted.\n";
}


# Process command-line URL list: resolve and validate each starting URL.
# Rejected entries are left in place (and warned about); the main loop's
# own CleanURL check will drop them from the index.
for (my $i=0; $i < @urllist; $i++) {
	my $url = $urllist[$i];
	$url = "//$url" unless $url =~ m|/|;	# bare hostname -> network-path form
	$url = CleanURL($url);
	if ($url) {
		$urllist[$i] = $url;
	} else {
		print STDERR "Error: Invalid or inaccessible url '$urllist[$i]'\n";
	}
}


# If we're doing a "refresh", load the appropriate urls into the list
push @urllist, grep(/$fresh/,$search->DocumentList()) if $fresh;


# Loop until no more URLs
# Main crawl loop: pull one URL at a time off the queue, fetch it, index
# its content by type, and append any newly-discovered links to the queue.
while (@urllist) {
	my $url  = shift @urllist;
	next unless $url;
	next if $urldone{$url};
	my $lurl = $url;	# pre-redirect URL, kept for resolving Location:
	my $write= 1;		# NOTE(review): never read below -- appears vestigial

	# Remove any not-allowed URLs (changes in cmd line args or robots.txt)
	unless (CleanURL($url)) {
		$search->RemoveDocument($url);
		$search->DBDelSummary($url);
		next;
	}

	# Build GET options (extra HTTP request headers for Ferret::LoadHTTP)
	my $opts = "";
	$opts .= "Connection: Keep-Alive\n";
	$opts .= "User-agent: $uname\n" if $uname;
	$opts .= "From: $email\n" if $email;
	# Ask the server for a 304 if the page is unchanged since last indexed
	$opts .= "If-Modified-Since: " . Ferret::TimetoRFC1123($search->DocumentTimestamp($url)) . "\n";
	$opts .= "Accept: application/*, text/*\n";

	print "\n$url:  ";
	my $data = eval { Ferret::LoadHTTP($url,$opts) };
	my $size = (length $data) / 1024;	# size in KB (headers included)
	if ($@) {
		# Fetch failed: mark done so we don't retry it this run
		$urldone{$url} = 1;
		print STDERR "$@\n";
		next;
	}

	Ferret::FixCRLF(\$data);
	# Split off the header block and extract the interesting fields
	my($head) = ($data =~ m!^(.*?\n)\n!s);
	my($type) = ($head =~ m!^Content-type:\s+(.*)!im);
	my($rslt) = ($head =~ m!^HTTP\S*\s+(\d+)!im);

	$urldone{$url} = 1;

	if ($rslt == 304) {
		# Not modified since the last pass; keep the existing index entry
		print "(unchanged) ";
		sleep $sleep/2 if $sleep >= 2;	# shorter courtesy pause
		next;
	}

	# Drop any stale index data before re-adding (or skipping) this page
	$search->RemoveDocument($url);
	$search->DBDelSummary($url);

	if ($rslt == 301 || $rslt == 302) {
		# Redirect: queue the target, resolved against the original URL
		if ($head =~ m/^Location: (.*)/im) { $url = $1; }
		print "-> $url ";
#		print "[" . CleanURL($lurl,$url) . "] ";
		push @urllist, CleanURL($lurl,$url);
		next;
	} elsif ($rslt != 200 && $rslt != 304) {
		# Any other status is an error (the != 304 test is redundant;
		# 304 was already handled above)
		$head =~ m!^HTTP\S*\s+(.*)!im;
		print STDERR "Error: Could not load '$url' -- $1\n";
		next;
	}

	unless ($type) {
		print STDERR "Error: '$url' return with no content-type\n";
		next;
	}
	printf "(%0.1fk) ",$size;
	$data =~ s/^.*?\n\n//s;		# strip the HTTP headers, leaving the body
	study $data;


	# Extract a title/summary and queue new links, dispatched on type
	my($title,$summary,$newurl);
	if ($type eq "text/html") {
		print '"HTML" ';
		# Collect link targets from <a href=...> and <frame src=...>
		my @hrefs = ($data =~ m/<a\s[^>]*href\s*=\s*\"?([^>\"\s\#\?]+).*?>/gis);
		push @hrefs,($data =~ m/<frame\s[^>]*src\s*=\s*\"?([^>\"\s\#\?]+).*?>/gis);
		if (@hrefs) {
			printf "(%u hrefs", scalar @hrefs;
			@hrefs = Ferret::Uniq(sort(@hrefs));
			printf ", %u unique) ", scalar @hrefs;
			foreach $href (@hrefs) {
				Ferret::UnquoteHTML(\$href);
#				print "($href)\n";
				$newurl = CleanURL($url,$href);
				if ($newurl) {
					# Skip binary/media files by extension (see $badext)
					unless ($newurl =~ m!\.($badext)$!oi) {
						push @urllist, $newurl;
#						print "($newurl)\n";
					}
				}
			}
		}

		Ferret::StripHTML(\$data,\$title,\$summary,$ssize);
		Ferret::ResolveHTMLImages(\$summary,$url,0.75,100);
		Ferret::MakeHTMLSummary(\$summary,"HTML");
		$search->AddDocument($url,$data);
	} elsif ($type =~ m|^text/|) {
		# Plain text: guess a more specific flavor from the name/content.
		# ($lines*400 approximates a byte limit from the --lines limit)
		if ($url =~ m/\.(h|hpp|h\+\+|c|cpp|c\+\+)$/i || $data =~ m/^\#\!/) {
			print '"Code" ';
			Ferret::StripCode(\$data,\$title,\$summary, ($lines ? $lines*400 : $ssize));
		} elsif ($data =~ m/^from /i) {
			# mbox-style "From " line suggests stored mail
			print '"Mail" ';
			Ferret::StripMail(\$data,\$title,\$summary, ($lines ? $lines*400 : $ssize));
		} else {
			print '"Text" ';
			Ferret::StripText(\$data,\$title,\$summary, ($lines ? $lines*400 : $ssize));
		}
		Ferret::LimitLineCount(\$summary,$lines) if $lines;
		Ferret::MakeHTMLSummary(\$summary,"Text");
		$search->AddDocument($url,$data);
	}

	elsif ($type =~ m!^application/x-(sh|shar|csh)!) {
		print '"Shell " ';
		Ferret::StripCode(\$data,\$title,\$summary, ($lines ? $lines*400 : $ssize));
		Ferret::LimitLineCount(\$summary,$lines) if $lines;
		Ferret::MakeHTMLSummary(\$summary,"Code");
		$search->AddDocument($url,$data);
	} elsif ($type eq "application/x-mif") {
		print '"MIF" ';
		Ferret::StripMIF(\$data,\$title,\$summary, ($lines ? $lines*400 : $ssize));
		Ferret::LimitLineCount(\$summary,$lines) if $lines;
		Ferret::MakeHTMLSummary(\$summary,"MIF");
		$search->AddDocument($url,$data);
	}

	else {
		print STDERR "Warning: Unsupported content-type '$type' from $url\n";
		next;
	}

#	print "Title: '$title'\nSummary:\n$summary\n";
	$search->DBPutTitle("$url",$title)		if $title;
	$search->DBPutSummary("$url",$summary)	if $summary;

	# Periodically write the index to disk so a crash loses little work
	if ($count++ == $flush) {
		print "Writing index... ";
		$search->Flush();
		$count = 0;
		if ($rndom) {
			# --random: dedupe then shuffle the queue to spread requests
			# across sites.  (Naive swap shuffle -- statistically biased,
			# but adequate here.)
			@urllist = Ferret::Uniq(sort(@urllist));
			srand(time());
			my $max = @urllist;
			for (my $i=0; $i < $max; $i++) {
				my $j = int(rand($max));
				my $temp	 = $urllist[$i];
				$urllist[$i] = $urllist[$j];
				$urllist[$j] = $temp;
			}
		}
	}

	# Be polite: pause between requests
	sleep $sleep if $sleep > 0;
}
print "\n";



# Time to put it all away...  May take a while to write the database.
print "\nWriting index...\n";
$search->Shrink() if $shrnk;	# --shrink: compact the index before closing
$search->Close();
print "\n";



