#! /usr/bin/perl

# CommPort:/cron/fetch.pl 
#
# Performs the fetching and parcing of external channels.


# The contents of this file are subject to the CommPort Public License 
# Version 1.00 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at  
# (URL:http://www.tc.ca/commport/license.html).
#
# Software distributed under the License is distributed on an "AS IS" 
# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See 
# the License for the specific language governing rights and limitations 
# under the License.  
#
# The Original Code is ${CPHOME}/cron/fetch.pl
#
# The Initial Developer of the Original Code is Telecommunities
# Canada, Inc.  Portions created by Telecommunities Canada, Inc. are
# Copyright (C) 1999-2000 by Telecommunities Canada, Inc.
# All Rights Reserved.
#
# Modifications or contributions of Larger Works should be sent to
# (mailto:commport@tc.ca) for consideration towards inclusion 
# into the root public codebase.
# 
# Contributor(s): Chris Halsall, Ian White, 
#                 Gary and Mae Shearman.
#                 Industry Canada's Community Access Program.


# Changelog:
#
# 1999.08.02 - CH  - Added this copyright/changelog section.
# 1999.08.03 - CH  - Modified XML parcer to handle CBC National.
# 1999.09.15 - CH  - Added image caching functions.
# 1999.09.16 - CH  - GetAndScan - can deal with bad (empty) files.
# 1999.09.21 - CH  - Added special CBC parce type 12.
# 1999.09.22 - CH  - Added special HRDC parce type 13.
# 2000.06.04 - CH  - Added Web-cam type 5.

use LWP::UserAgent;
use HTML::LinkExtor;
use URI::URL;
use DBI;

require "/home/piwcommport/common/common.pl";
require "/home/piwcommport/common/images.pl";

$PID = $$;

$Which = shift;

$ua = new LWP::UserAgent;
$ua->agent("CommPort/${CP_Ver} http://commport.csp.net");

$rslt = `whoami`;
$mask = umask;
print "whoami: $rslt ; mask: $mask\n";

&OpenDB;

srand(time|$$);
$RValue = rand(45);

$cmd = "select ID,Name,FetchURL,F_Time,F_Last,Type"
.",Img_Mode,Img_Cache,Img_Cached,Img_W,Img_H,PlugIn"
.",mod(ID * $RValue,31) as S"
." from Channel where ";

if ($Which > 0) {
$cmd .= "ID=${Which}";
} else {
$cmd .= " F_Time > 0 "
." AND UNIX_TIMESTAMP(F_Last) + F_Time * 60 < UNIX_TIMESTAMP()"
." order by S";
}

print "$cmd\n\n";

$sth = $dbh->prepare($cmd);
$rv = $sth->execute or die "can't execute the query: $sth->errstr\n";

$Cnt = 0;
while (@rec = $sth->fetchrow_array) {

   $Img_Mode = $rec[6];
   $Img_Cache = $rec[7];
   $Img_Cached = $rec[8];
   $Img_W = $rec[9];
   $Img_H = $rec[10];
   $PlugIn = $rec[11];

   print "@rec" . "\n\n";
   $Reslt = &GetAndScan($rec[0],$rec[2],$rec[5]);

   print "piwcommport - $rec[1] - $Reslt\n";
   &DoUpdate($rec[0], $Reslt);

}


exit;


sub DoUpdate {
   local ($ID,$Resp) = @_;
   local ($cmd,$sth);

   $cmd = "update Channel set F_Last=now(),F_Resp=${Resp} where ID=${ID}";

   $sth = $dbh->do($cmd) || &DumpError($sth->errstr);

   return 0;
}


sub DoImg {
   local ($ID,$T,$S,$L) = @_;
   local ($cmd,$sth);
   local ($req,$res,$Data);
   local ($W, $H, $IT) = (0,0,"");
   local ($RawPath);

   print "DI: $Img_Mode - $Img_Cache - $Img_Cached - $Img_W - $Img_H\n";

   $cmd = "update Channel set Img_URL=\"${L}\""
         .",Img_Text=\"${T}\",Img_Src=\"${S}\"";


   $RawPath = "${SYSROOT}/fetched/imgs/${ID}.raw";

   if ($Img_Cached == 0 || $Img_Cache == 2 ||
            $Img_W == 0 || $Img_H==0) {

      print "DoImg: Want to get : $ID, $S, $Type\n";
 
      $req = new HTTP::Request 'GET', $S;

      $res = $ua->request($req);

      if ($res->is_success) {
         $Data = $res->content;

         open (OUT, ">${SYSROOT}/fetched/imgs/${ID}.raw");
         print OUT $Data;
         close OUT;

         ($IT,$W,$H) = ImageTypeAndSize("${SYSROOT}/fetched/imgs/${ID}.raw");

         print "Image Type: $IT ($W x $H)\n";

         $cmd .= ",Img_W=\"$W\",Img_H=\"$H\"";


         if ($Img_Mode >= 2 && $IT ne "" && $W > 0 && $H > 0) {
            open (OUT, ">${HTDOCROOT}/cimgs/${ID}.${IT}");
            print OUT $Data;
            close OUT;

            $cmd .= ",Img_Local=\"/cimgs/${ID}.${IT}\",Img_Cached=1";

         }
      }
   }

   $cmd .= " where ID=${ID}";

   print "About to do: \"$cmd\"\n";

   $sth = $dbh->do($cmd) || &DumpError($sth->errstr);

   return 0;
}


sub RawImport {
   local ($TmpFile, $F) = @_;
   local ($t,@s,$State);

   open (OUT, ">${TmpFile}");

   print OUT $F;
   close OUT;

   return length($F);
}


sub ScanRawCBC {
   local ($TmpFile, $F) = @_;
   local ($t,@s,$State);

   open (OUT, ">${TmpFile}");

   @s = split (/\n/, $F);

   $URL = "";
   foreach $t (@s) {

      $t =~ s/\<A HREF="[^"]*".*\>\<IMG SRC="[^"]*".*\>\<\/a\>//i;

      if ($t =~ /<A HREF="([^"]*)".*/i) {
         $URL = $1;
         next;
      }
      $t =~ s/\<\/a\>//i;

      if ($URL ne "") {
         print OUT "<a href=\"${URL}\">$t</a>\n";
         $URL = "";
      }
   }
   close OUT;

   return length($F);
}


sub ScanBCGovNews {
   local ($TmpFile, $F) = @_;
   local ($t,@s,$State);
   local ($Cnt) = (0);

   open (OUT, ">${TmpFile}");
   $F =~ s/\n/\r/g;
   @s = split (/\r/, $F);

   foreach $t (@s) {

#print STDERR "BCG: $t\n";

      if ($t =~ /^\<td\>\d+\/\d+\/\d+\<\/td\>/) {
         $t =~ s/^\<td\>\d+\/\d+\/\d+\<\/td\>\<td\>\<a href="([^"]*)">([^<]*)<\/a>//;

#print STDERR "BCG: - $1 - $2\n";
         if ($1 ne "" and $2 ne "") {
            print OUT "<a href=\"http://142.36.183.50/${1}\">${2}</a>\n";
            $Cnt++;
         }
      }
   }

   close OUT;

   return $Cnt;
}
   

sub ScanHRDC {
   local ($TmpFile, $F) = @_;
   local ($t,@s,$State);
   local ($Cnt) = (0);

   open (OUT, ">${TmpFile}");
   @s = split (/\n/, $F);

   foreach $t (@s) {

#print "SHRDC0: $t\n";

#      if ($t =~ /^<tr valign=top><td>\d*<td><font size=-1><A HREF="([^"]*)">(.*)<\/a><td>[^<]*<td>([^<])<td>/i) {
      if ($t =~ /^<tr valign=top><td>\d*<td><font size=-1><A HREF="([^"]*)">(.*)<\/a><td>[^<]*<td>([^<]*)<td align=center>/i) {

#print "SHRDC1: $t\n";
#print "SHRDC2: $1 - $2 - $3\n";

         if ($1 ne "" and $2 ne "") {
            print OUT "<a href=\"http://jb-ge.hrdc-drhc.gc.ca${1}\">${2} - ${3}</a>\n";
            $Cnt++;
         }
      }
   }

   close OUT;

   return $Cnt;
}
   

sub ScanCBCNews {
   local ($TmpFile, $F) = @_;
   local ($t,@s,$State);
   local ($Cnt) = (0);

   open (OUT, ">${TmpFile}");
   @s = split (/\n/, $res->content);

   foreach $t (@s) {
      $t =~ s/^<[^>]*>//i;
      print OUT "$t\n";
      $Cnt++;
   }

   close OUT;

   return $Cnt;
}
   

sub Scan32Bit {
   local ($TmpFile, $F) = @_;
   local ($t,@s,$State);
   local ($Cnt) = (0);

   open (OUT, ">${TmpFile}");

   @s = split (/\n/, $res->content);

   $State = 0;
   foreach $t (@s) {
#print "$State - $t\n";

      if ($State == 0) {
         $Title = $t;  $State++;
      } elsif ($State == 1) {
         $State++;
      } elsif ($State == 2) {
         print OUT "<a href=\"${t}\">$Title</a>\n";
         $Cnt++;
         $State++;
      } elsif ($State == 3) {
         $State=0;
      }
   }

   close OUT;

   return $Cnt;
}

sub ScanScienceDaily {
   local ($TmpFile, $F) = @_;
   local ($t,@s,$State);
   local ($Cnt) = (0);

   open (OUT, ">${TmpFile}");

   @s = split (/\n/, $res->content);

   $State = 0;
   foreach $t (@s) {
#print STDERR "SD: State $State - $t\n";
      if ($State == 0) {
        if ($t == "<CENTER><FONT size=\"2\"><I>Click on the banner above to visit one of our advertisers.</I></FONT></CENTER><TABLE width=\"470\" border=\"0\" cellspacing=\"5\" cellpadding=\"0\"><TR><TD></TD></TR></TABLE>") {
#print STDERR "Found click banner etc\n";
         $State++;
        }
      } elsif ($State == 1) {
         $t =~ s/^<LI><FONT size="3" face="Arial, Helvetica"><B><A class=main href="([^"]*)">([^<]*)\s*<\/A><\/B><\/FONT>//i; 
#print STDERR "SD: URL $1 - $2\n";
        if ($2 ne "") {
          $Cnt++;
          print OUT "<a href=\"$1\">$2</a>\n";
        }
      }
   }

   close OUT;

   return $Cnt;
}


sub ScanBBCNews {
   local ($TmpFile, $F) = @_;
   local ($t,@s,$State);
   local ($Cnt) = (0);

   open (OUT, ">${TmpFile}");

   @s = split (/\n/, $res->content);

   $State = 0;
   foreach $t (@s) { 
#print STDERR "BBC: State $State - $t\n";
      if ($State == 0) {
        if ($t == "<!--NewsOnLineQLX-->") { 
# print STDERR "Found <!--NewsOnLineQLX-->\n";
         $State++;
        }
      } elsif ($State == 1) {
         $t =~ s/^\s*<A href="([^"]*)">([^<]*)<\/A>//i; 
# print STDERR "BBC: URL $1 - $2\n";
        if ($2 ne "") {
          $Cnt++;
          print OUT "<a href=\"http://news.bbc.co.uk/$1\">$2</a>\n";
        }
      }
   }

   close OUT;
 
   return $Cnt;
} 

sub ScanBBCWorld {
   local ($TmpFile, $F) = @_;
   local ($t,@s,$State,$URL,$Topic);
   local ($Cnt) = (0);

   open (OUT, ">${TmpFile}");

   @s = split (/\n/, $res->content);

   $State = 0;
   foreach $t (@s) {
#print STDERR "BBCW: State $State - $t\n";
      if ($State == 0) {
        if ($t == "<H2>BBC News Online: <A HREF=\"/low/english/world/default.stm\" CLASS=\"index\"><B>World</B> </A></H2>") {  
#         print STDERR "Found <H2>BBC News Online: <A HREF=\"/low/english/world/default.stm\" CLASS=\"index\"><B>World</B> </A></H2>\n";
         $State++;
        }
      } elsif ($State == 1) {
         $t =~ s/^\s*<A href="([^"]*)">//i;
        $URL = $1;
#         print STDERR "BBCW: URL $URL\n";
        if ($URL ne "") {
         $State++;
        }
      } elsif ($State == 2) {
         $t =~ s/^\s*<H3>(.*)<\/H3><BR>//i;
        $Topic = $1;
#        print STDERR "BBC: TOPIC $Topic\n";
        if ($Topic ne "" and $Topic ne "Back to top") {
          $Cnt++;
          print OUT "<a href=\"http://news.bbc.co.uk/$URL\">$Topic</a>\n";
        $State = 1;
        }
      }
   }

   close OUT;

   return $Cnt;
}



sub ScanNewsHub {
   local ($TmpFile, $F) = @_;
   local ($t,@s,$State);
   local ($Cnt) = (0);
 
   open (OUT, ">${TmpFile}");
 
   @s = split (/\n/, $res->content);
 
   $State = 0;
   foreach $t (@s) {
#print STDERR "Hub: State $State - $t\n";
      if ($State == 0) {
        if ($t == "<!-- End Left Table -->") {
#         print STDERR "Found <!-- End Left Table -->\n";
         $State++;
        }
      } elsif ($State == 1) {
         $t =~ s/^<li><a href=([^=]*)>([^<]*)<\/a>//i;
#         print STDERR "Hub: URL $1 - $2\n";
        if ($2 ne "") {
          $Cnt++;
          print OUT "<a href=\"http://www.newshub.com$1\">$2</a>\n";
        }
      }
   }
 
   close OUT;
 
   return $Cnt;
}

sub ScanYahooEsNews {
   local ($TmpFile, $F) = @_;
   local ($t,@s,$State);
   local ($Cnt) = (0);
 
   open (OUT, ">${TmpFile}");
 
   @s = split (/\n/, $res->content);
 
   $State = 0;
   foreach $t (@s) {
#print STDERR "YahooEs: State $State - $t\n";
      if ($State == 0) {
        if ($t == "<table cellspacing=0 cellpadding=4><tr><td></td>") {
#         print STDERR "Found <table cellspacing=0 cellpadding=4><tr><td></td>\n";
         $State++;
        }
      } elsif ($State == 1) {
         $t =~ s/^<b><a href=([^=]*)>([^<]*)<\/a><\/b>//i;
#         print STDERR "YahooEs: URL $1 - $2\n";
        if ($2 ne "") {
          $Cnt++;
          print OUT "<a href=\"http://es.news.yahoo.com$1\">$2</a>\n";
        }
      }
   }
 
   close OUT;
 
   return $Cnt;
}


 
sub ScanMTVNews {
   local ($TmpFile, $F) = @_;
   local ($t,@s,$State);
   local ($Cnt) = (0);
 
   open (OUT, ">${TmpFile}");
 
   @s = split (/\n/, $res->content);
 
   $State = 0;
   foreach $t (@s) {
#print STDERR "MTV: State $State - $t\n";
      if ($State == 0) {
        if ($t eq "<FONT FACE=\"Helvetica,Arial,sans-serif\" SIZE=\"2\" COLOR=\"Black\"><B>") {
#         print STDERR "Found start of articles\n";
         $State++;
        }
      } elsif ($State == 1) {
         $t =~ s/^<A HREF="([^"]*)">([^<]*)<\/A><P>//i;
#         print STDERR "MTV: URL $1 - $2\n";
        if ($2 ne "") {
          $Cnt++;
          print OUT "<a href=\"http://www.mtv.com$1\">$2</a>\n";
        }
      }
   }
 
   close OUT;
 
   return $Cnt;
}

# Note: this is a hack, and not a true XML parcer. 
sub ScanRDF {
   local ($TmpFile, $F) = @_;
   local ($t,@s,$Title,$Link);
   local ($I_T, $I_S, $I_L);
   local ($Cnt) = (0);

   open (OUT, ">${TmpFile}");

   @s = split (/\n/, $F);

   foreach $t (@s) {
#print "-> $t\n";

      next if ($t =~ /^$/);

      if ($t =~ /<image\>/i) {
         $Image = 1; next;
      }

      if ($Image) {
print "Img: $t\n";
print "Img: $I_T, $I_S, $I_L\n";
         if ($t =~ /.*<title>(.*)<\/title>/i) {
            $I_T = $1; }

         if ($t =~ /.*<url>(.*)<\/url>/i) {
            $I_S = $1; }

         if ($t =~ /.*<link>(.*)<\/link>/i) {
            $I_L = $1; }

         if ($t =~ /<\/image\>/i) { $Image = 0; next; }
      }

      if ($t =~ /<item([^>]*)>/i) {
         $T = $1;
         $T =~ s/.*HREF="([^"]*)".*/\1/;
         $Link = $T;
         $Item = 1; next;
      }

#print "$Item - \"$Title\" - \"$Link\"\n";
#print "\"$t\"\n";

      next if (!$Item);

      if ($t =~ /.*<title>(.*)<\/title>/i) {
         $Title = $1;
         $Title =~ s/&lt;!--[^&]*&gt;//ig;
         $Title =~ s/&apos;/'/g; # This is for MacCentral...
         $Title =~ s/&amp;#34;/"/g; # This is for MacCentral...
         $Title =~ s/&#39;/'/g; # This is for ScienceDaily...
         next;
      }

      if ($t =~ /.*<link>(.*)<\/link>/) {
         $Link = $1;
         $Link =~ s/ww.newshub.com\/tech\/cgibin\/rd.cg/ww.newshub.com\/cgibin\/rd.cg/;
         $Link =~ s/ww.newshub.com\/science\/cgibin\/rd.cg/ww.newshub.com\/cgibin\/rd.cg/;
      }

      if ($t =~ /.*<\/item>/i) {

#print "  -- Have MAtch! \n";
         if ($Title ne "" && $Link ne "") {
            print OUT "<a href=\"${Link}\">$Title</a>\n";
            $Cnt++;
         }
         $Title = $Link = "";
         $Item = 0;
      }
   }

   close OUT;

print "RSS: $I_T, $I_S, $I_L\n";

   if ($I_S ne "") {
      &DoImg($ID, $I_T, $I_S, $I_L);
      $Cnt++;
   }

   return $Cnt;
}


sub GetAndScaleImg {
   local ($TmpFile, $F, $ID) = @_;
   local ($ImgFile);
   local ($Cnt) = (0);
   local ($T,$W,$H);
   local ($cmd,@rec);
 
   $ImgFile = "${SYSROOT}/fetched/imgs/${ID}.raw";
   $ImgThum = "${HTDOCROOT}/cimgs/${ID}.jpg";

   open (OUT, ">${ImgFile}");
   print OUT $F;
   close OUT;

   ($T,$W,$H) = ImageTypeAndSize($ImgFile);

   print "T:$T W:$W H:$H\n";

   $shellcmd = "${Cmd_Loc_Convert} -geometry ${CAM_X_LIMIT}x${CAM_Y_LIMIT}!"
." ${T}:${ImgFile} ${ImgThum}";
   $rslt = `$shellcmd`;

   $cmd = "select URL from Channel where ID=${ID}";
   $sth = $dbh->prepare($cmd);
   $rv = $sth->execute or die "can't execute the query: $sth->errstr\n";

   @rec = $sth->fetchrow_array;
   $URL = @rec[0];
 
   open (OUT, ">${TmpFile}");
   print OUT "<center><a href=\"${URL}\"><img src=\"/cimgs/${ID}.jpg\""
." width=\"${CAM_X_LIMIT}\""
." height=\"${CAM_Y_LIMIT}\" border=\"0\"></a></center>";
   close (OUT);


   return 1;
}


sub DoPlugIn {
   local ($TmpFile, $F, $ID, $PI) = @_;
   local ($RawFile);
   local ($Cnt) = (0);
   local ($T,$W,$H);
   local ($cmd,@rec);
 
   $RawFile = "${SYSROOT}/fetched/${ID}.raw";

   open (OUT, ">${RawFile}");
   print OUT $F;
   close OUT;

   $PI =~ s/^\.*//g; # Get rid of any prefixing "."'s.
   $PI =~ s/\/\.*//g;# Or any "/."'s.
   $PI =~ s/\///g; # Or any "/"'s in general.
   $PI =~ s/^\.*//g; # Get rid of any prefixing "."'s after all of the above.

   $shellcmd = "${SYSROOT}/cron/plugins/${PI} ${ID} ${RawFile} ${TmpFile}";
print "About to do \"${shellcmd}\"\n";
   $rslt = `$shellcmd`;

   print "Result is \"$rslt\"\n";
   return 1;
}


sub GetAndScan {
   local ($ID, $URL, $Type) = @_;
   local ($req,$res,$Data);
   local ($Cnt,$OutStr);
  
print "Want to get : $ID, $URL, $Type\n";
 
   $req = new HTTP::Request 'GET', $URL;

   $res = $ua->request($req);

   # check the outcome
   if ($res->is_success) {

      $Data = $res->content;
      if ($Type != 5 && $Type != 7) {
        $Data =~ s/\r\n/\n/g;
        $Data =~ s/\n\r/\n/g;
        $Data =~ s/\r/\n/g;
      }

      open (OUT, ">${SYSROOT}/fetched/${ID}.raw");
      print OUT $Data;
      close OUT;

      $OutStr = "${SYSROOT}/cached/${ID}${$}.inc";

      if ($Type == 2) {
         $Cnt = &RawImport($OutStr,$Data);
      } elsif ($Type == 5) {
         $Cnt = &GetAndScaleImg($OutStr,$Data,$ID);
      } elsif ($Type == 7) {
        $Cnt = &DoPlugIn($OutStr,$Data,$ID,$PlugIn);
      } elsif ($Type == 10) {
         $Cnt = &Scan32Bit($OutStr,$Data);
      } elsif ($Type == 11) {
         $Cnt = &ScanCBCNews($OutStr,$Data);
      } elsif ($Type == 12) {
         $Cnt = &ScanRawCBC($OutStr,$Data);
      } elsif ($Type == 13) {
         $Cnt = &ScanHRDC($OutStr,$Data);
      } elsif ($Type == 15) {
         $Cnt = &ScanBCGovNews($OutStr,$Data);
      } elsif ($Type == 16) {
         $Cnt = &ScanMTVNews($OutStr,$Data);
      } elsif ($Type == 17) {
         $Cnt = &ScanBBCNews($OutStr,$Data);
      } elsif ($Type == 18) {
         $Cnt = &ScanNewsHub($OutStr,$Data);
      } elsif ($Type == 19) {
         $Cnt = &ScanYahooEsNews($OutStr,$Data);
      } elsif ($Type == 20) {
         $Cnt = &ScanRDF($OutStr,$Data);
      } elsif ($Type == 21) {
        $Cnt = &ScanScienceDaily($OutStr,$Data);
      } elsif ($Type == 22) {
        $Cnt = &ScanBBCWorld($OutStr,$Data);
      }

      if ($Cnt > 0) {
         rename($OutStr, "${SYSROOT}/cached/${ID}.inc");
         return 0;
      } else {
         unlink($OutStr);
         return 111; # Our own code for empty channels.
      }

   } else {
      return $res->code;
   }
}
