#!/local/bin/perl

# showdead.pl   Daniel MacKay Daniel.MacKay@Dal.Ca
# 990922 DEM    Scan the log from a "htdig -s" run and produce pages listing
#               all the dead links for your web managers to browse.
# dig_report.pl a variant by Malcolm.Austen@oucs.ox.ac.uk
#                            for use on daneel.ox.ac.uk
# 990929-30 MDA first hack with a view to counting pages at each level
# 991201    MDA tidy up the presentation
# 000119    MDA exclude "not found" and "redirect" urls from the table
# 0103..    MDA reordered server lists by reverse IPname
#               added lists of all pages indexed (+hop counts)

# Typical(?) usage:
#    perl dig_report.pl < htdig.stdout

#use strict;

# Directory that receives the generated files.
$prefix = "db/new/report/";

# Heading text shared by the generated pages: the <H1> of the index page
# and the <H2> of the per-server pages.  Could be better!
$title = "ht://Dig report generated on " . scalar(localtime);
# print "\n$title\n\n";

# State filled in while the log is scanned.
my $maxd = 1;    # greatest depth seen so far; starts at 1
my $addr;        # not referenced anywhere in the visible code
my $totcount;    # pages counted, all servers
my $badtot;      # pages whose recorded depth is 255
my @dcount;      # pages counted per depth, all servers
my %bad;         # server => pages whose recorded depth is 255
my %servtot;     # server => pages counted
my %sites;       # server => array ref of pages counted per depth

# Scan the htdig log (STDIN, or the files named on the command line) and
# collect:
#   $totcount / %servtot - pages counted overall / per server
#   $badtot   / %bad     - pages whose recorded depth is 255
#   @dcount   / %sites   - pages per depth, overall / per server
#   @hops                - "server<TAB>url<TAB>depth" for every page counted
#   @bad                 - "reversed-server<TAB>referrer<TAB>url" for every
#                          "Not found:" line
while (<>) {
    chomp;       # chomp, not chop: a final line that lacks its newline
                 # must not lose its last character
    s/\s+$//;    # strip trailing spaces
    #
    # count pages and depths
    # this pattern matches a "digits:digits:digits:serverURL:" record;
    # we want the third number (depth at which the page was indexed),
    # the scheme, the server name and the path.
    # The captures are copied into named variables straight away so the
    # "not found"/"redirect" test below cannot disturb them.
    if ( my ( $depth, $scheme, $server, $path ) =
        m{^\d+:\d+:(\d+):(https?)://(.*?)/(.*):} )
    {
        next if /(?:not found|redirect)$/;
        $totcount++;
        $servtot{$server}++;
        if ( $depth eq "255" ) {
            # depth 255 is counted separately from the real depths
            $badtot++;
            $bad{$server}++;
        }
        else {
            $maxd = $depth if $depth > $maxd;
            $dcount[$depth]++;
            $sites{$server}->[$depth]++;
        }
        # note away the URL and hopcount
        push @hops,
            join( "\t", $server, "$scheme://$server/$path",
                  sprintf( "%2d", $depth ) );
        next;
    }
    #
    # this notes the pages that were not indexed (needs -s option on htdig)
    if ( my ( $missing, $referrer ) = m/^Not found:\s+(.*) Ref: (.*)$/ ) {
        # Sort key: the referring server's name with its dot-separated
        # parts reversed, so related servers sort next to each other.
        ( my $host = $referrer ) =~ s{^https?://}{};
        $host =~ s{/.*}{};
        my $sortkey = join( ".", reverse split( /\./, $host ) );
        push @bad, "$sortkey\t$referrer\t$missing";
    }
}

# Write the front page ($prefix/index.html) up to the end of the
# pages-per-depth table, and $prefix/server.names (one server per line).
# SERV is deliberately left open: the summary of not-indexed pages is
# appended to it further down the script.
open( SERV, '>', "$prefix/index.html" )
    or die "can't open dead index file: $!";
chmod( 0644, "$prefix/index.html" );

open( SERVERNAMES, '>', "$prefix/server.names" )
    or die "can't open names file: $!";

print SERV <<HTML_1;

<html><head><title>ht://Dig report</title></head>

<body><h1 align=\"center\">$title</h1>

<p><a href="#ni_lists">Jump forward to lists of pages that were not
indexed.</a></p>

<p>This table shows the number of pages indexed at various depths from
each server. <br>The depth is measured from (one of) the server start
points. Some level counts may be misled by, for example, a high level
page having already been indexed at a lower level from another server
start point or by one server having serveral start point entries (with
different start directories). <br>The "?" column gives the count for
pages that had a dubious(unset?) depth in the log record.</p>

<table border=1>

HTML_1

# first line of table gives overall totals
print SERV "<tr><td align=right>Overall</td>\n";
print SERV "<td align=right>$totcount</td><td align=right>$badtot</td>\n";
print SERV "<td align=right>$dcount[$_]</td>\n" for 0 .. $maxd;
print SERV "</tr>\n";

# second line of table is just the headers
print SERV "<tr><th align=right>Server\\Depth</th>\n";
print SERV "<th align=right>Total</th><th align=right>?</th>\n";
print SERV "<th align=right>$_</th>\n" for 0 .. $maxd;
print SERV "</tr>\n";

# Servers are listed in order of their reversed names, so build a
# reversed-name => name lookup table before going on.  It is keyed on
# %servtot rather than %sites so that a server whose pages were all
# counted in the "?" column still gets its row.
foreach my $s ( keys %servtot ) {
    $revsites{ join( ".", reverse split( /\./, $s ) ) } = $s;
}

# then one line per server found in the log
foreach my $rs ( sort keys %revsites ) {
    my $s         = $revsites{$rs};
    my $per_depth = $sites{$s} || [];    # no entry: every cell stays empty
    print SERVERNAMES "$s\n";
    print SERV "<tr><td align=right><a href=\"hl_$s.html\">$s</a></td>\n";
    print SERV "<td align=right>$servtot{$s}</td>\n";
    print SERV "<td align=right>$bad{$s}</td>\n";
    print SERV "<td align=right>$per_depth->[$_]</td>\n" for 0 .. $maxd;
    print SERV "</tr>\n";
}

# buffered write errors only surface at close
close(SERVERNAMES) or die "can't write names file: $!";

print SERV "</table>\n";

# table of pages/depths is done, now present the lists of non-indexed pages

# Second half of the front page: one row per referring server saying how
# many of its links could not be indexed, each row linking to a
# per-server page ($prefix/ni_<server>.html) that lists those links.
# Finishes and closes index.html.
print SERV <<HTML_2;

<a name="ni_lists"></a>

<p>These lists (one per server) show the pages that ht://Dig could not
index. The page may be missing but it may simply be that the ht://Dig
process was forbidden access to the page.</p>

<table>
HTML_2

{
    # Minimal HTML escaping for text copied out of the log.
    my $esc = sub {
        my ($text) = @_;
        $text =~ s/&/&amp;/g;
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;
        $text =~ s/"/&quot;/g;
        return $text;
    };

    # Group the records by server.  Each @bad entry is
    # "reversed-server<TAB>referrer<TAB>url", so a plain sort orders the
    # servers by reversed name; the name is turned back round here.
    my ( @servers, %links_for );
    foreach my $record ( sort @bad ) {
        my ( $revkey, $ref, $missing ) = split /\t/, $record;
        my $server = join( ".", reverse split( /\./, $revkey ) );
        push @servers, $server unless $links_for{$server};
        push @{ $links_for{$server} }, [ $missing, $ref ];
    }

    # One page per server, then its summary row on the front page.
    foreach my $server (@servers) {
        my $links = $links_for{$server};
        my $count = @$links;
        my $file  = "$prefix/ni_$server.html";
#       print "Now writing to $server\n" ;

        open( my $out, '>', $file )
            or die "can't open file for $server: $!";
        chmod( 0644, $file );
        print $out "<html><head><title>Un-Indexed links on $server</title></head>\n";
        print $out "<body><h1 align=\"center\">Un-Indexed links on $server</h1>\n";
        print $out "<h2 align=\"center\">$title</h2><ul>\n";
        foreach my $link (@$links) {
            my ( $missing, $ref ) = map { $esc->($_) } @$link;
            print $out "<li>$missing<br><a href=\"$ref\">$ref</a></li>\n";
        }
        # the list was opened with <ul>, so that is what gets closed
        print $out "</ul></body></html>\n";
        close($out) or die "can't write file for $server: $!";

        print SERV "<tr><td align=right><a href=\"ni_$server.html\">$server</a></td>\n";
        print SERV "<td>- $count pages could not be indexed</td></tr>\n";
    }
}

# closes the <table> opened at the end of the text block above
print SERV "</table></body></html>\n";
close(SERV) or die "can't write dead index file: $!";

##
# that dealt with the main table and the not-indexed lists
# now to dump out the lists of indexed pages, one per server
# and ordered by hopcount
##
    # Write one page per server ($prefix/hl_<server>.html) listing every
    # page counted for that server together with its hop count, lowest
    # hop count first.
    {
        # Minimal HTML escaping for text copied out of the log.
        my $esc = sub {
            my ($text) = @_;
            $text =~ s/&/&amp;/g;
            $text =~ s/</&lt;/g;
            $text =~ s/>/&gt;/g;
            $text =~ s/"/&quot;/g;
            return $text;
        };

        # Each @hops entry is "server<TAB>url<TAB>hops"; group by server.
        my %pages_for;
        foreach my $record (@hops) {
            my ( $server, $url, $hop ) = split /\t/, $record;
            push @{ $pages_for{$server} }, [ $hop, $url ];
        }

        foreach my $server ( sort keys %pages_for ) {
            my $file = "$prefix/hl_$server.html";
            open( my $out, '>', $file )
                or die "can't open file for $server: $!";
            chmod( 0644, $file );
            print $out "<html><head><title>Page & hop count list for $server</title></head>\n";
            print $out "<body><h1 align=\"center\">Page & hop count list for $server</h1>\n";
            print $out "<h2 align=\"center\">$title</h2><table cellspacing=0>\n";
            print $out "<tr><th align=right>HopCount</th><th>&nbsp;</th><th align=left>URL</th></tr>\n";

            # Numeric sort on the hop count, URL as tie-breaker.  A plain
            # string sort of the records would order them by URL instead.
            my @ordered =
                sort { $a->[0] <=> $b->[0] or $a->[1] cmp $b->[1] }
                @{ $pages_for{$server} };
            foreach my $page (@ordered) {
                my $hop = $page->[0];
                my $url = $esc->( $page->[1] );
                print $out "<tr><td align=right>$hop</td><td></td><td><a href=\"$url\">$url</a></td></tr>\n";
            }

            print $out "</table></body></html>\n";
            close($out) or die "can't write file for $server: $!";
        }
    }

# all done

