#!/bin/perl

# WWWreferer
#   - a WWW-log analyzing CGI-script (referrer report)
#
# Lars M. Garshol - larsga@ifi.uio.no - http://www.ifi.uio.no/~larsga/

# This program is freeware. Please let me know if you modify the code,
# and not just the configuration. 

require "cgi.pl";
require "shared.pl";
require $VAR{"config"};

# --- Init

# Search-engine table. Each row pairs a pattern with the name its hits are
# reported under. The log loop matches the patterns against the referrer as
# case-insensitive regular expressions, in the order given here, and stops
# at the first one that matches - so more specific patterns must come first.
my @engine_rows = (
    [ ".excite.com",            "Excite"               ],
    [ ".infoseek.com",          "Infoseek"             ],
    [ "\\.yahoo\\.",            "Yahoo"                ],
    [ "hotbot.com",             "Hotbot"               ],
    [ "search.kvasir.sol.no",   "Kvasir"               ],
    [ ".dejanews.com",          "DejaNews"             ],
    [ "altavista.digital.com",  "AltaVista"            ],
    [ "index.polarsearch.com",  "PolarSearch"          ],
    [ "altavista.telia.com",    "AltaVista"            ],
    [ "www.origo.no",           "Origo"                ],
    [ "netfind.aol.com",        "AOL Netfind"          ],
    [ "www.goo.ne.jp",          "Other search engines" ],
    [ "metacrawler\\.",         "MetaCrawler"          ],
    [ "www.euroseek.net",       "Euroseek"             ],
    [ "www.excite.de",          "Excite"               ],
    [ "nwi.bibsys.no",          "Other search engines" ],
    [ "cgi.feedme.org",         "Other search engines" ],
    [ "\\.search.com",          "Other search engines" ],
    [ "www.highway61.com",      "Other search engines" ],
    [ "webcrawler.com",         "WebCrawler"           ],
    [ "\\.lycos\\.",            "Lycos"                ],
    [ "ahoy.cs.washington.edu", "Other search engines" ],
    [ "infind.inference.com",   "Other search engines" ],
    [ "www.nlsearch.com",       "Other search engines" ],
    [ "www.northernlight.com",  "Other search engines" ],
    [ "dogpile.com",            "Other search engines" ],
    [ "projectcool.com",        "Other search engines" ],
    [ "www.askjeeves.com",      "Ask Jeeves"           ],
    [ "euroseek.net",           "Other search engines" ],
);

# Patterns only, in matching order.
@SearchEngines = map { $_->[0] } @engine_rows;

# Pattern => reported name.
%SENames = map { @$_ } @engine_rows;

# ---------- Initializing ------------------------------

# Zero the seven referral counters. As used further down: 0 = unspecified,
# 2 = news article, 3 = file URL, 4 = search engine, 5 = other HTTP link,
# 6 = internal link. Slot 1 is initialised but never incremented.
@HitFrom[0..6] = (0) x 7;

# ---------- The program itself ------------------------

# Send the content type only when $Redirect is not set.
# NOTE(review): $Redirect presumably means the report is written to a file
# instead of being served live - confirm in the configuration.
&printCType("text/html") unless $Redirect;

# Open the access log; a failure is reported through error().
unless (open(INN,$LogFile)) {
    error("Couldn't open logfile $LogFile: $!");
}

&PrintPageTop("Referrer report $ForString");

# With $Redirect set the page also states when the report was generated.
print "<P>\nReport generated: $LastRun\n</P>\n\n" if $Redirect;

# Read the access log line by line and tally where each accepted hit was
# referred from.
#
# Every line is split on single spaces and its fields are used by position:
#   [0] client host          [5] request method, with its leading quote
#   [6] requested URI        [8] HTTP status code
#   [10] referrer, still wrapped in double quotes
#   [11..] user agent
# NOTE(review): this layout looks like the Apache "combined" log format -
# confirm against the server configuration.
#
# Results:
#   $HitsTo{page}             hits per requested page
#   $HitFrom[n]               hits per referral category
#   $SEReferrals{name}        hits per search-engine name
#   $PageRefs{page}{source}   hits per page and referrer
#   $TotalHits                number of accepted log lines
while (<INN>) {
  next unless /\S/;                     #Just whitespace, skip

  @felter=split(/ /,$_);

  #Check to see if we want this line
  next if $felter[5] eq '"HEAD';
  next if $felter[8] != 200;            # If HTTP code not 200
  next if &SkipHost($felter[0]);
  next if &SkipPage($felter[6]);

  #Need to do some tinkering to ensure we get the entire UA name here.
  #The pieces are glued together without the spaces that split removed.
  for ($ix=12; $ix<=$#felter; $ix++) {
    $felter[11] .= $felter[$ix];
  }
  next if &SkipRobot($felter[11]);

  #Canonize URIs

  # The requested URI carries no quotes, so a fragment is simply cut off.
  $felter[6] =~ s/#.*//;
  $felter[6] =~ s/%7e/~/ig;
  # $DefaultFile is set outside this file and is interpolated as a pattern.
  $felter[6] =~ s/\/$DefaultFile/\//;

  # The referrer keeps its surrounding quotes: whenever its tail is cut off
  # the closing quote has to be put back.
  $felter[10] =~ s/#.+/\"/;
  $felter[10] =~ s/%7e/~/ig;
  $felter[10] =~ s/\/index\.html?/\//;
  $felter[10] =~ s/:80\//\//;
  # Collapse repeated slashes, but keep the character in front of them and
  # leave the "//" that follows the scheme's colon alone.
  $felter[10] =~ s/([^:])\/\/+/$1\//g;
  $felter[10] =~ s/\?.*/"/;

  # If just a server name without trailing slash, add the slash - inside
  # the closing quote, so the result equals referrers that already had one.
  $felter[10] =~ s/^("https?:\/\/[-a-z0-9]+(?:\.[-a-z0-9]+)*)"$/$1\/"/i;

  $HitsTo{$felter[6]}++;

  ## Extract statistics from $felter[10]

  # What kind of referral was this?
  if    ($felter[10] eq "-"   || $felter[10] eq "" ||
         $felter[10] eq '"-"' || $felter[10] eq '""') {
      # No referrer given. The quoted forms are what a log that wraps the
      # referrer field in quotes contains for an empty referrer.
      $HitFrom[0]++;
  }
  elsif ($felter[10] =~ /^\"http/) {
      # What kind of HTTP referral was this?
      if ($felter[10] =~ /^"$LocalSite/) {
          #This hit came from somewhere in the site
          $HitFrom[6]++;
          if ($ReferInternal) {
              $PageRefs{$felter[6]}{$felter[10]}++;
          }
      } else {
          #From an external web site. Was it a search engine?
          $found=0;
          foreach $Key (@SearchEngines) {
              if ($felter[10] =~ /$Key/i) {
                  $HitFrom[4]++; #Yes, search engine
                  $SEReferrals{$SENames{$Key}}++;
                  $PageRefs{$felter[6]}{$SENames{$Key}}++;
                  $found=1;
                  last;          # First matching pattern wins
              }
          }

          if (!$found) {
              $HitFrom[5]++; #Just a random HTTP link
              $PageRefs{$felter[6]}{$felter[10]}++;
          }
      }
  }
  elsif ($felter[10] =~ /^\"news/) {
      $HitFrom[2]++;
      $PageRefs{$felter[6]}{"News article"}++;
  }
  elsif ($felter[10] =~ /^\"file/) {
      $HitFrom[3]++;
      $PageRefs{$felter[6]}{"File URL"}++;
  }
  #else error in log file; the line still counts towards the total

  $TotalHits += 1;
}
close INN;

#------- URL GROUPING ------

# %Aliases maps an old URL to the URL it is reported under:
#   $Aliases{oldurl} eq newurl

# Correcting hits to pages that are to be aliased.
#
# MergeAliasedPages folds every page in %PageRefs that has an entry in
# %Aliases into the page it is aliased to: its per-referrer counts and its
# hit total are added to the target's, then the alias's own entries in
# %PageRefs and %HitsTo are removed. Takes no arguments and returns nothing;
# it works on the global hashes directly.
sub MergeAliasedPages {
    foreach my $hit (keys %PageRefs) {
        my $target = $Aliases{$hit};
        next unless $target;
        # A page aliased to itself needs no merging, and deleting it below
        # would throw its statistics away.
        next if $target eq $hit;

        foreach my $ref (keys %{$PageRefs{$hit}}) {
            $PageRefs{$target}{$ref} += $PageRefs{$hit}{$ref};
        }

        # The page listing is sorted by %HitsTo, so the target has to
        # inherit the alias's hits rather than lose them.
        $HitsTo{$target} += $HitsTo{$hit} if defined $HitsTo{$hit};

        delete $PageRefs{$hit}; # Removes the alias
        delete $HitsTo{$hit};
    }
    return;
}

&MergeAliasedPages();

# Correcting hits from pages that are to be aliased
# COMMENTED OUT UNTIL I KNOW WHAT TO DO ABOUT THIS. MAY NOT WANT THIS FEATURE!
		   
# foreach $hit (keys %PageRefs) {
#     foreach $ref (keys %{$PageRefs{$hit}}) {
# 	if ($Aliases{$ref}) {
# 	    $PageRefs{$hit}{$Aliases{$ref}} += $PageRefs{$hit}{$ref};
# 	    delete $PageRefs{$hit}{$ref}; # Removes the alias
# 	}
#     }
# }
		   
#------- Main header -------

# PercentOf(part, total) returns part as a whole-number percentage of total,
# formatted like "25 %". A zero or undefined total gives "0 %" instead of
# dying with a division by zero, which is what happens when the log is empty
# or every line was filtered out.
sub PercentOf {
    my ($part, $total) = @_;
    return "0 %" unless $total;
    return int($part/$total*100)." %";
}

# Summary table: the total and one row per referral category, each with its
# hit count and its share of all accepted hits.
$TotalHits = 0 unless $TotalHits;       # Show 0, not a blank, for no hits

print "<H3>Referrer statistics</H3>\n";

print "<TABLE>\n";
print "<TR><TD><STRONG>Total:</STRONG>         <TD>$TotalHits\n";
$_=&PercentOf($HitFrom[0],$TotalHits);
print "<TR><TD><STRONG>Unspecified:</STRONG >  <TD>$HitFrom[0] <TD>$_\n";
$_=&PercentOf($HitFrom[2],$TotalHits);
print "<TR><TD><STRONG>News article:</STRONG>  <TD>$HitFrom[2] <TD>$_\n";
$_=&PercentOf($HitFrom[3],$TotalHits);
print "<TR><TD><STRONG>File link:</STRONG>     <TD>$HitFrom[3] <TD>$_\n";
$_=&PercentOf($HitFrom[4],$TotalHits);
print "<TR><TD><STRONG>Search engine:</STRONG> <TD>$HitFrom[4] <TD>$_\n";
$_=&PercentOf($HitFrom[5],$TotalHits);
print "<TR><TD><STRONG>HTTP link:</STRONG>     <TD>$HitFrom[5] <TD>$_\n";
$_=&PercentOf($HitFrom[6],$TotalHits);
print "<TR><TD><STRONG>Internal link:</STRONG> <TD>$HitFrom[6] <TD>$_\n";
print "</TABLE>\n";

#------- Referrals by search engine

# Referrals by search engine: one row per reported engine name with its hit
# count and its share of all search-engine referrals. Rows are ordered by
# hit count; the sign of $SortDir picks ascending or descending.
print "<H3>Referrals by search engine</H3>\n";

print "<TABLE>\n";
print "<TR><TH>Search engine  <TH>Referrals  <TH>Percentage\n";

@SPages = sort { $SortDir * ($SEReferrals{$a} <=> $SEReferrals{$b}) }
          keys %SEReferrals;

foreach $SE (@SPages) {
    my $referrals = $SEReferrals{$SE};
    # $HitFrom[4] is the sum of all %SEReferrals counts, so it is non-zero
    # whenever there is a row to print.
    my $share = int($referrals/$HitFrom[4]*100)." %";
    print "<TR><TD>$SE  <TD>$referrals  <TD>$share\n";
}

print "</TABLE>\n";

#------- Statistics by URL

# Referrers by page: for every page with recorded referrers, a table of the
# referrers and their hit counts.
#
# Settings used (all set outside this file):
#   $SortDir      negative sorts by descending hit count, positive ascending
#   $ReferMinHit  referrers with fewer hits are not listed, and a page whose
#                 best referrer has fewer hits is left out entirely
#   $ReferMaxNo   at most this many referrers - the most frequent ones - are
#                 listed per page
#   $ReferLinks   when true, http/ftp referrers are written as links
print "<H3>Referrers by page</H3>\n";

# Page names and referrers are copied from the log, i.e. supplied by the
# client, so they are escaped before being written into the HTML.
my $escape = sub {
    my ($text) = @_;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
};

@SPages=sort { $SortDir*$HitsTo{$a} <=> $SortDir*$HitsTo{$b} }
        keys %PageRefs;

foreach $page (@SPages) {
    @SRefs =sort { $SortDir*$PageRefs{$page}{$a} <=>
                   $SortDir*$PageRefs{$page}{$b} }
            keys (%{$PageRefs{$page}});

    # The most frequent referrer is the first element when sorting
    # descending and the last one when sorting ascending.
    if (($SortDir<0 && $PageRefs{$page}{$SRefs[0]}>=$ReferMinHit) ||
        ($SortDir>0 && $PageRefs{$page}{$SRefs[$#SRefs]}>=$ReferMinHit)) {

        print "<H4>" . $escape->($page) . "</H4>\n";
        print "<TABLE>\n";

        $count=0;
        foreach my $pos (0 .. $#SRefs) {
            $from = $SRefs[$pos];

            # Descending: take rows until $ReferMaxNo have been printed.
            # Ascending: the wanted rows are the last $ReferMaxNo positions,
            # so the test has to use the position, not the printed count.
            if ((($SortDir<0 && $count<$ReferMaxNo) ||
                 ($SortDir>0 && $#SRefs-$ReferMaxNo < $pos)) &&
                $PageRefs{$page}{$from}>=$ReferMinHit) {

                my $label = $escape->($from);

                if ($ReferLinks && ($from =~ /(ftp|http):\/\//)) {
                    # Referrers are stored with the quotes they had in the
                    # log; drop them and quote the attribute explicitly.
                    my $href = $from;
                    $href =~ tr/"//d;
                    $href = $escape->($href);
                    print "<TR><TD><A HREF=\"$href\">$label</A>\n";
                    print "    <TD>$PageRefs{$page}{$from}\n";
                } else {
                    print "<TR><TD>$label<TD>$PageRefs{$page}{$from}\n";
                }
                $count++;
            }
        }
        print "</TABLE>\n";
    }
}

#------- Finish page

&PrintPageBot;


