# NOTE: Derived from ./blib/lib/Ferret.pm.  Changes made here will be lost.
package Ferret;

# StripHTML(\$htmldata [, \$title [, \$summary [, SUMMARYSIZE]]])
#
# Reduces the HTML document held in $$htmldata to text, in place.
#
#   \$htmldata  - required SCALAR ref; the document, rewritten in place.
#   \$title     - optional SCALAR ref; receives the contents of the first
#                 <title> element, or "" when there is none.
#   \$summary   - optional SCALAR ref; receives roughly the first
#                 SUMMARYSIZE characters of the partly cleaned document.
#                 It is taken before the final tag strip, so it can still
#                 contain markup.
#   SUMMARYSIZE - optional maximum summary length; any false value
#                 (omitted, undef, 0) selects the default of 512.
#
# Croaks with the usage string when the argument count is wrong or when a
# supplied reference argument is not a SCALAR ref.
sub StripHTML {
	my $usage = 'Usage: Ferret::StripHTML(\$htmldata [, \$title [, \$summary [, SUMMARYSIZE]]])';
	@_ >= 1 && @_ <= 4 or croak $usage;
	my($doc,$title,$summary,$amount) = @_;

	croak $usage if (ref $doc ne "SCALAR");
	croak $usage if ($title && ref $title ne "SCALAR");
	croak $usage if ($summary && ref $summary ne "SCALAR");

	$amount = 512 unless $amount;

	if ($title) {
		$$title = "";
		# Accept attributes/whitespace inside the tags, so that every title
		# the removal pattern below deletes is also captured here.
		if ($$doc =~ m!<title(?:\s[^>]*)?>(.*?)</title\s*>!is) {
			$$title = $1;
		}
	}

	# Case-insensitive like every other tag match in this sub, so that
	# <FRAME ...> and <FRAMESET> are detected as well.
	my $frameflag = ($$doc =~ m/<frame/i);

	# Collect the attribute text of every <meta ...> tag and append it to the
	# document, so it survives the removal of <head> and the meta tags below.
	my $extrawords = join(" ", ($$doc =~ m/<meta\s+(.*?)\s*>/gis));
	$extrawords =~ s/\s+/ /g;

	$$doc .= "\n" . $extrawords . "\n";

	# Drop whole elements whose content is not wanted, then selected bare tags.
	$$doc =~ s!<head.*?>.*?</head.*?>!!gis;
	$$doc =~ s!<title.*?>.*?</title.*?>!!gis;
	$$doc =~ s!<script.*?>.*?</script.*?>!!gis;
	$$doc =~ s|<!.*?>||gs;				# comments and declarations
	$$doc =~ s!<map.*?>.*?</map.*?>!!gis;
	$$doc =~ s!</?(html|body|meta|(no)?frame).*?>!!gis;
	$$doc =~ s!</?a(\s+.*?)?>!!gis;			# anchor tags only; link text stays

	# Normalise whitespace: no leading blanks, no empty lines, single spaces.
	$$doc =~ s!^\s+!!;
	$$doc =~ s!\n+!\n!g;
	$$doc =~ s![ \t]+! !gs;

	if ($summary) {
		$$summary = substr($$doc,0,$amount);
		# Remove a tag left unterminated at the end of the summary.
		$$summary =~ s/<[^>]*$//;
		# Only a truncated summary can end in a partial word; when the whole
		# document fits, the last word is complete and must be kept.
		if (length($$doc) > $amount) {
			$$summary =~ s/\s*[^\s>]*$//;
		}

		if ($frameflag) {
			$$summary = "[ This document uses frames -- the summary may be useless ] " . $$summary;
		}
	}

	# Append the title text to the document (its element was removed above).
	$$doc .= $$title if $title;
	# StripBadLines is defined elsewhere in this package; it is handed the
	# document reference before the remaining tags are stripped.
	StripBadLines($doc);
	$$doc =~ s!<.*?>!!gs;				# strip every remaining tag
}



1;
