#!/usr/bin/perl -w
#
##############################################################################
#                           get_blacklist
# Script to update the squidGuard blacklists from a given url
#                          from Lars Rupp
#############################################################################
# 
# Including some additional modules from  Lars Erik Håland 'sgclean'
#
# sgclean.pl removes redundant entries in domain files and url files
# although sgclean.pl makes a backup of the old files, it's always a
# good idea to make your own backup before running the program
#
# By  Lars Erik Håland 1999 (leh@nimrod.no)
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License (version 2) as
# published by the Free Software Foundation.  It is distributed in the
# hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
# PURPOSE.  See the GNU General Public License (GPL) for more details.
#  
# You should have received a copy of the GNU General Public License
# (GPL) along with this program.
#
#############################################################################

use strict;
use File::Path;
use FileHandle;
use DB_File;
use Fcntl;

# enter some useful values here:
my $DEBUG=0;
my $loglevel=1; # 0 : quiet - no message, no logfile
				# 1 : normal - useful messages for normal usage
				# 2 : verbose - let's fill up the log...
my $sg_config = "/etc/squid/squidguard.conf";
my $squid_config="/etc/squid/squid.conf";
my $blacklist_server_file="/etc/squid/blacklist_servers";
my $tempdir="/tmp/blacklists";
my $logfile= "get_blacklist.log";

# internal variables
my $pid=$$;
my $pidfile="/var/run/get_blacklist.pid";
my $tmpfile = "$tempdir/squidGuard.db";
my $tmpfile_delete = "$tempdir/squidGuard.delete.db";

# define some globel external helper files
my $FILE="/usr/bin/file";
my $TAR="/bin/tar";
my $LS="/bin/ls";

############################################################################
# scripting-area
# you shouldn't do anything below here...
############################################################################

if ($DEBUG) {use Data::Dumper};
my $VERSION = "0.0.2";

# get information about the configuration
my @sgconfig = &ReadSquidGuardConf();
my @squidconfig = &ReadSquidConf();
# dbhome
my $sec=&find_section( 'config' => \@sgconfig,
                      'sectype' => 'dbhome' );
my $dbhome=$sec->{'dbhome'};
# logdir
$sec=&find_section( 'config' => \@sgconfig,
                   'sectype' => 'logdir' );
my $logdir=$sec->{'logdir'};
# user
$sec=&find_section( 'config' => \@squidconfig,
                   'sectype' => 'cache_effective_user' );
my $user=$sec->{'cache_effective_user'};
# group
$sec=&find_section( 'config' => \@squidconfig,
                   'sectype' => 'cache_effective_group' );

my $group=$sec->{'cache_effective_group'};

# debuging output
print STDERR "\nSquidGuard-Config:\n".Data::Dumper->Dump([@sgconfig]) if ($DEBUG);
print STDERR "\nSquid-Config:\n".Data::Dumper->Dump([@squidconfig]) if ($DEBUG);

# initialise
my $log="$logdir/$logfile";
my $downloaddir=$dbhome."/download";

################################################################################
# let's do it... (public)
################################################################################

check($pid);

logfile('open');

makedir($downloaddir,1);

if (defined (@ARGV)){ 
	foreach my $url (@ARGV){
	 LOG("Update startet URL: $url\n") if ($loglevel>=1);
	 getURL($url);
	 Extract($url);
	 my $contentref=getContent($url);
	 includeLists($url,$contentref);
	 removedir($tempdir);
	}
} else { # update from all 'active' servers
	foreach my $url (ReadActiveURLs()){
	LOG("Update startet URL: $url\n") if ($loglevel>=1);
		getURL($url);
    	Extract($url);
    	my $contentref=getContent($url);
     	includeLists($url,$contentref);
		removedir($tempdir);
	}
}

cleanup();

system ("chown -R $user $dbhome");

system ("chgrp -R $group $dbhome");

system ("/usr/sbin/squidGuard -c $sg_config -C all");

system ("chown -R $user $dbhome");

system ("chgrp -R $group $dbhome");

system ("/usr/sbin/rcsquid reload");

LOG("Update ended normaly \n") if ($loglevel>=1);

logfile('close');

################################################################################
# Functions (let's say: private)
################################################################################
sub check {
  my $pid= shift;
  my $fh = new FileHandle($pidfile, O_RDWR | O_CREAT );
  unless ($fh){
	print STDERR "couldn't open $pidfile\n";
	LOG("Warning: couldn't open $pidfile\nAborting!") if ($loglevel>0);
	exit 1;
  }
  binmode($fh);
  $fh->autoflush(1);
 
  my $oldpid=$fh->getline() || '';
  chomp($oldpid);
  if ($oldpid ne ""){
     LOG("get_blacklist is still running. Pid: $oldpid") if ($DEBUG);
	 $fh->close();
	 exit 1;
  } else {
	$fh->print($pid);
  }	
$fh->close();
}

sub makedir {
 my $dir = shift;
 my $remove = shift;
 if (-e "$dir") {
	if ($remove){
		rmtree($dir,1,1);
	}
 }
 my $ret = eval { my $temp=mkpath($dir,1,0750) };
 if ($ret != 1) {
    LOG("Directory $dir exists $@") if ($loglevel>1);
 }
}


sub getURL{
	use  LWP::Simple;
	my $url      = shift;
	my ( $file ) = $url =~ m~^.+/([^/]+)$~ ;
	my $local    = $downloaddir."/".$file;
	my $content  = mirror($url, $local);
	if ( $content == 304 ){
		LOG("File $file hasn't changed since last update.") if ($loglevel>1);
		$content=0;
	} elsif ($content == 200 ){
		LOG("Downloading $url successfully.") if ($loglevel>1);
		$content=0;
	} else {
		LOG("File $file from $url could not be downloaded!") if ($loglevel>0);
		LOG("Errormessage was: $content") if ($loglevel>0);
	}
	return 1 unless defined $content;
}


sub ReadSquidGuardConf {
  open(CONF, $sg_config);
    my @c=<CONF>;
  close(CONF);
  my @config=();

for (my $i=0; $i < @c; $i++) {
    next if (!$c[$i] || ($c[$i] =~ /^#/));

if ($c[$i] =~ /^dbhome\s+(\S+)/) {

      ###### DB Home

      my %section;
         $section{'sectype'}='dbhome';
         $section{'dbhome'}=$1;
         $section{'line'}=$i;

      push(@config, \%section);
		
    } elsif ($c[$i] =~ /^logdir\s+(\S+)/) {

      ###### Log Dir

      my %section;
         $section{'sectype'}='logdir';
         $section{'logdir'}=$1;
         $section{'line'}=$i;

     push(@config, \%section);
	} elsif ($c[$i] =~ /^(dest|destination)\s+([-_.a-zA-Z0-9]+)(\s+(within|outside)\s+([-_.a-zA-Z0-9]+))?\s+\{\s*$/) {

      ###### destination group

      my %section;
         $section{'sectype'}='dest';
         $section{'secname'}=$2;
         $section{'line'}=$i;
	
      while(($i <= scalar(@c)) && ($c[$i] !~ /^\s*\}/)) {
		while ($c[$i] !~ /\}/) {
        if ($c[$i] =~ /\s+domainlist\s+(\S+)/) {
          $section{'domainlist'}=$1;
          $section{'domainlist_line'} = $i;
        } elsif ($c[$i] =~ /\s+urllist\s+(\S+)/) {
          $section{'urllist'}=$1;
          $section{'urllist_line'} = $i;
        } elsif ($c[$i] =~ /\s+expressionlist\s+(\S+)/) {
          $section{'exprlist'}=$1;
          $section{'exprlist_line'} = $i;
        }
		$i++;
	  	}
      push(@config, \%section);
	}}
 } # end for
return wantarray ? @config : \@config;
} # end sub


sub ReadSquidConf {
  open(CONF, $squid_config);
    my @c=<CONF>;
  close(CONF);
  my @config=();

for (my $i=0; $i < @c; $i++) {
    next if (!$c[$i] || ($c[$i] =~ /^#/));

	if ($c[$i] =~ /^cache_effective_user\s+(\S+)/) {
      my %section;
         $section{'sectype'}='cache_effective_user';
         $section{'cache_effective_user'}=$1;
         $section{'line'}=$i;

      push(@config, \%section);
	} elsif ($c[$i] =~ /^cache_effective_group\s+(\S+)/){
      my %section;
         $section{'sectype'}='cache_effective_group';
         $section{'cache_effective_group'}=$1;
         $section{'line'}=$i;

      push(@config, \%section);
	}
 } # end for
return wantarray ? @config : \@config;
} #end sub


sub find_section {
  my %args=@_;
  my $c;
  foreach $c (@{$args{'config'}}) {
    my $ok=1;
    for (keys %args) {
      next if ($_ eq 'config');
      if (defined $c->{$_} && $c->{$_} !~ /^$args{$_}$/) {
        $ok=0;
        last;
      }
    }
    return $c if ($ok);
  }
 return undef;
}

sub logfile {
    my $action = shift;
    if (("$action" eq "open" ) || ("$action" eq "new" )) {
        open (LOGFILE, ">>$log") || warn "Couldn't open $log !\n";
        flock(LOGFILE,2)         || warn "Can't get lock for $log !\n";
        LOG("startet. Pid: $pid") if ($loglevel>0);
    } else {
        close LOGFILE;
    }
}

sub ReadDestinations {
  use Config::IniFiles;
  my $url = shift ;
  if( ! $url )
  {
    return [];
  }
  my @dests;
  my $i=1;
  my $ini = new Config::IniFiles( -file => "$blacklist_server_file",
                                  -allowcontinue =>1
                                #  -nocase=>1 
								);
  if( ! $ini ){	
	LOG("Couldn't open $blacklist_server_file") if ($loglevel>0);;
	return 1;
  }

  foreach my $section ( $ini->Sections()) {
	my $serverurl = $ini->val($section, 'url');
	next if( ! $serverurl );
	if ( "$serverurl" eq "$url"){
		foreach my $line ($ini->val($section,'dests')){
			my ($external,$local)=split /=>/,$line,2;
			my %section;
			$section{'sectype'}='bl_dest';
         	$section{'local'}=$local;
         	$section{'extern'}=$external;
         	$section{'number'}=$i;
			$i++;
		   	push(@dests, \%section);	
		}
		if ($DEBUG){
            print STDERR "My Section: ".$section."\nMy Url: ".$serverurl."\n";
            print STDERR "\nDestinations:\n".Data::Dumper->Dump([@dests]);
        }
	}
  } # end for
return wantarray ? @dests : \@dests;
} # end sub


sub ReadActiveURLs {
my @serverurls;
my $ini = new Config::IniFiles( -file => "$blacklist_server_file",
                                  -allowcontinue =>1
                                #  -nocase=>1 
                                );
	if( ! $ini ){
    	LOG("Couldn't open $blacklist_server_file") if ($loglevel>0);
  	}
	foreach my $section ( $ini->Sections()) {
		if ( defined ($ini->val($section,'active')) && ($ini->val($section,'active'))	eq 'yes' ){
			push(@serverurls,($ini->val($section,'url')));
		}
	}
return wantarray ? @serverurls : \@serverurls;
} # end sub


sub Extract{
	my $url=shift;
	my ( $file ) = $url =~ m~^.+/([^/]+)$~ ;	
	my @type = split / /,qx($FILE $downloaddir/$file);
	
	makedir($tempdir,1);	
	
	if ( "$type[1]" eq "gzip") {
		print STDERR $file." is a gzip\n" if ($DEBUG);
 		eval { `cd $tempdir; $TAR -zxf $downloaddir/$file` };
	} elsif ( "$type[1]" eq "tar") {
        print STDERR $file." is a tar\n" if ($DEBUG);
		eval { `cd $tempdir; $TAR -xf $downloaddir/$file` };
	} elsif ( "$type[1]" eq "bzip2"){
		print STDERR $file." is a bzip2\n" if ($DEBUG);
		eval { `cd $tempdir; $TAR -jxf $downloaddir/$file` };
	} elsif ( "$type[1]" eq "cannot"){
		LOG("WARNING: Could not open: $file") if ($loglevel>0);
		return 1;
	} elsif ( ! $file ) {
		LOG("WARNING: empty file name!") if ($loglevel>0);
		return 1;
	} else {
		LOG("WARNING: Could not recognize the filetype of: $file") if ($loglevel>0);
		return 1;
	}

	if ($@) {
    	LOG("Couldn't unpack: $/$file $@") if ($loglevel>0);
		return 1;
 	}
return;
}

sub getContent {
	my $url = shift;
	my ( $file ) = $url =~ m~^.+/([^/]+)$~ ;
	my @content = `$LS -I $file -I . -I .. $tempdir`;
	return \@content;
}

sub includeLists {
	my $url=shift;
	my $dirref=shift;
	my $destref=ReadDestinations($url);

    foreach my $dir (@$dirref) {
		chomp($dir);
		for (my $i=1; $i <= scalar(@$destref); $i++){
			$sec=&find_section( 'config' => $destref,
    	               		   'number' => $i );
			my $local=$sec->{'local'};
			my $extern=$sec->{'extern'};
			chomp($local);
			chomp($extern);

			my $sec=&find_section( 'config' => \@sgconfig,
             				       'sectype' => 'dest',
								   'secname' => $local );

			if (defined($sec->{'domainlist'})) {
				my $list=$dbhome."/".$sec->{'domainlist'} if (defined($sec->{'domainlist'}));
				my $localdir=(split /\//,$sec->{'domainlist'})[0];
				# check, if local directory exists
				if (! -e $dbhome."/".$localdir ){
					LOG("Warning: creating $dbhome/$localdir") if ($loglevel>0);
					makedir($dbhome."/".$localdir,0);				
				}
				# check, if external file exists
				if ( -e $tempdir."/".$dir."/".$extern."/domains" ) {
					system ("cat $tempdir/$dir/$extern/domains >> $list");
					LOG("Adding: $tempdir/$dir/$extern/domains") if ($loglevel>1);
					sg_clean($list,'domainlist');
				}
			}

			if (defined($sec->{'urllist'})) {
				my $list=$dbhome."/".$sec->{'urllist'};
                my $localdir=(split /\//,$sec->{'urllist'})[0];
                if (! -e $dbhome."/".$localdir ){
                    LOG("Warning: creating $dbhome/$localdir") if ($loglevel>0);
                    makedir($dbhome."/".$localdir,0);
                }
                if ( -e $tempdir."/".$dir."/".$extern."/urls" ) {
					system ("cat $tempdir/$dir/$extern/urls >> $list");
					LOG("Adding: $tempdir/$dir/$extern/urls") if ($loglevel>1);
					sg_clean($list,'urls');
				}
			}
	
			if (defined($sec->{'exprlist'})) {
				my $list=$dbhome."/".$sec->{'exprlist'};			
                my $localdir=(split /\//,$sec->{'exprlist'})[0];
                if (! -e $dbhome."/".$localdir ){
                    LOG("Warning: creating $dbhome/$localdir") if ($loglevel>0);
                    makedir($dbhome."/".$localdir,0);
                }
                if ( -e $tempdir."/".$dir."/".$extern."/exprlist" ) {
					system ("cat $tempdir/$dir/$extern/exprlist >> $list");
					LOG("Adding: $tempdir/$dir/$extern/exprlist") if ($loglevel>1);
#					sg_clean(clean_file($list),'exprlist');
				}
			}
		}
	}
}

sub removedir {
 my $dir = shift;
 if (-e "$dir") {
   my $temp=rmtree($dir,1,1);
 }
}

sub cleanup {
	LOG("Cleaning up...") if ($loglevel>0);
	system("/bin/rm $pidfile");
}

sub clean_file {
 my $filename = shift;
 my $type = shift;
 my @lines;
 open( FILE, "< $filename" ) || die "Can't open $filename : $!" ;
 open( WRITE,"> $filename.$$" ) || die "Can't write to $filename.$$ : $!" ;

 while( <FILE> ) {
	     s/#.*//;            # ignore comments by erasing them
    	 next if /^(\s)*$/;  # skip blank lines
#	     chomp;              # remove trailing newline characters
         print WRITE "$_";   # write into temporary file
      }
 close FILE;
 close WRITE;
print STDERR $filename.$$ if ($DEBUG);
return $filename.$$;
}

sub sg_clean {
  my $file = shift;
  my $type = shift;
  LOG("cleaning $type : $file") if ($loglevel>1);
  open(F,$file)       || die "can't open $type file $file: $!";
  open(W,">$file.$$") || die "can't write to $file.$$: $!";
  sg_clean_dbfiles();
  my(%SG,%SGD);
  tie(%SG, 'DB_File',$tmpfile,O_RDWR|O_CREAT,0640,$DB_BTREE);
  tie(%SGD, 'DB_File',$tmpfile_delete,O_RDWR|O_CREAT,0640,$DB_BTREE);
  my $count = 1;
  my $i = 0; 
  while(<F>){
    chomp;
    my($dburl,$redirect) = split;
    $redirect = "" if(!defined $redirect);
    $SG{$dburl} = $redirect;
    $count++;
  }
  close(F);
  my($dburl,$redirect);
  while (($dburl,$redirect) = each %SG) {
    my $keep = undef;
    if($type eq "domainlist"){
      $keep = sg_clean_domain($dburl,\%SG,1);
    } elsif($type eq "urllist"){
      $keep = sg_clean_url($dburl,\%SG,1);
    }
    if(!defined $keep){
      $SGD{$dburl}++;
    }
    if($i % 100 == 0){
      my $p = ($i * 100)/$count;
      #print STDERR "." if(int($p) % 10 == 0);
    }
    $i++;
  }
  LOG("cleaning complete") if ($loglevel>1);
  LOG("updating file: $file")  if ($loglevel>1);
  $i = 0;
  while (($dburl,$redirect) = each %SG) {
    next if(defined $SGD{$dburl});
    my $line = "$dburl" . ($redirect ? " $redirect\n" : "\n");
    print W "$line";
    if($i % 100 == 0){
      my $p = ($i * 100)/$count;
      #print STDERR "." if(int($p) % 10 == 0);
    }
    $i++;
  }
  LOG("Update complete") if ($loglevel>1);
  close(W);
  sg_update_files($file);
  untie(%SG);
  untie(%SGD);
  sg_clean_dbfiles();
}

sub sg_clean_domain {
  my $domain = shift;
  my $tie = shift;
  my $exists_ok = shift;
  my $parts = [split(/[.]/,$domain)];
  my $d = "";
  for(reverse @$parts){
    $d = "$_$d";
    if(defined $tie->{$d}){
      if($domain eq $d){
        print STDERR "$domain exists, skipping\n"  if ($DEBUG);
        return 1 if($exists_ok);
      } else {
        print STDERR "$domain is subdomain of $d, skipping\n"  if ($DEBUG);
      }
      return undef;
    }
    $d = ".$d";
  }
  return 1;
}

sub sg_clean_url {
  my $dburl = shift;
  my $tie = shift;
  my $exists_ok = shift;
  my $parts = [split(/[\/]/,$dburl)];
  my $d = "";
  for(@$parts){
    $d = "$d$_";
    if(defined $tie->{$d}){
      if($dburl eq $d){
        print "$dburl exists, skipping\n" if ($DEBUG);
        return 1 if($exists_ok);
      } else {
        print "$dburl is part of $d, skipping\n" if ($DEBUG);
      }
      return undef;
    }
    $d = "$d/";
  }
  return 1;
}
sub sg_clean_dbfiles {
  if(-e "$tmpfile"){
    unlink("$tmpfile") || warn "can't remove $tmpfile: $!";
  }
  if(-e "$tmpfile_delete"){
    unlink("$tmpfile_delete")|| warn "can't remove $tmpfile_delete: $!";
  }
}

sub sg_update_files {
  my $file = shift;
  if(-e "$file"){
    system("cp $file $file.old");
  }
  if(-e "$file.$$"){
    rename("$file.$$",$file) || warn "can't rename $file.$$ to $file: $!";
  }
}

sub LOG {
  my $message = shift;
  my $time = localtime(time);
  my $name = 'get_blacklist';
  print LOGFILE "$time $name : $message\n";
} 
