#!/usr/bin/perl
#
# parse_squid_access_log.pl ver.2.3
#  by Phil2k@gmail.com
#

##############################
# User modifiable constants:
##############################
# Inclusive range of HTTP status codes accepted by the -h option.
($min_http_code, $max_http_code) = (200, 599);

# Proxy result codes accepted by the -c option (compared against the
# token that precedes "/<status>" in each log line).
@proxy_result_codes = qw(
  TCP_HIT
  TCP_MISS
  TCP_REFRESH_HIT
  TCP_REF_FAIL_HIT
  TCP_REFRESH_MISS
  TCP_CLIENT_REFRESH_MISS
  TCP_IMS_HIT
  TCP_SWAPFAIL_MISS
  TCP_NEGATIVE_HIT
  TCP_MEM_HIT
  TCP_DENIED
  TCP_OFFLINE_HIT
  UDP_HIT
  UDP_MISS
  UDP_DENIED
  UDP_INVALID
  UDP_MISS_NOFETCH
  NONE
);
##############################

use POSIX qw(mktime);
use Socket;

# Usage text. The command-line checks below print it to STDERR before
# calling exit(1). It interpolates $0 and, on the -h line, $min_http_code
# and $max_http_code, so it must be assigned after those constants.
$syntax = "Syntax: $0 [-a <search_addr>] [-m <http_method>] [-s <min_size>] [-S <max_size>] [-i <date_time_interval>] [-c <proxy_code>] [-h <http_code>] [-u <url>] [-o <output_data_files_suffix>]
Where: -a <search_address> = the IP or address/bitlen or address/netmask to search, can be specified multiple times
       -m <http_method> = HTTP method (GET, POST, PUT, HEADER), can be specified multiple times
       -s <min_size> = minimum bytes of logged access to search, can be sufixed with K for KiloBytes, M for MegaBytes or G for GigaBytes
       -S <max_size> = maximum bytes of logged access to search, can be suffixed with K for KiloBytes, M for MegaBytes or G for GigaBytes
       -i <date_time_interval> = date or date+time interval to search (ex.: 2009/12/30-2010/01/01 or 2009/12/30:22:11:20-2010/01/01:07:54:00)
       -c <proxy_code> = proxy return code (try \"-c help\" to see all available proxy return codes), can be specified multiple times
       -h <http_code> = HTTP code/status number or interval (ex.: 200, or: 200-299), range: $min_http_code-$max_http_code, can be specified multiple times
       -u <url> = URL or part of URL to search, can be specified multiple times
       -o <output_data_files_suffix> = output file prefix, if you want output data for each IP,year,month containing bytes and host on each row; output to stdout will be quiet\n\n";
# Search criteria, filled in from the command line below and read by the
# validation and log-scanning code further down.
%search_addr=();                    # -a value => [] (later [network, mask])
$is_search_addr=0;
@search_http_method=();             # -m values, upper-cased
$search_min_size=undef;             # -s value
$search_max_size=undef;             # -S value
$search_date_time_interval=undef;   # -i value
$time_start = undef;
$time_end = undef;
@search_proxy_code=();              # -c values, upper-cased
@search_http_code=();               # -h values
@search_url=();                     # -u values, lower-cased
$output_data_file_suffix=undef;     # -o value

# Record the value of one option letter in the matching search criterion.
# Returns 1 when the letter is a known option, 0 otherwise.
sub _store_option {
  my ($letter, $value) = @_;
  if    ($letter eq "a") { $search_addr{$value} = []; }
  elsif ($letter eq "m") { push @search_http_method, uc($value); }
  # -s/-S must land in the variables the size filter actually reads;
  # they used to be stored in otherwise unused $search_http_*_size.
  elsif ($letter eq "s") { $search_min_size = $value; }
  elsif ($letter eq "S") { $search_max_size = $value; }
  elsif ($letter eq "i") { $search_date_time_interval = $value; }
  elsif ($letter eq "c") { push @search_proxy_code, uc($value); }
  elsif ($letter eq "h") { push @search_http_code, $value; }
  elsif ($letter eq "u") { push @search_url, lc($value); }
  elsif ($letter eq "o") { $output_data_file_suffix = $value; }
  else  { return 0; }
  return 1;
}

# Walk @ARGV.  Both "-x value" and "-xvalue" are accepted; "--" stops
# option processing and whatever follows it is ignored.
$options = 1;   # still looking at options?
$opt = "";      # option letter that is waiting for its value
while (@ARGV) {
  my $arg = shift @ARGV;
  next unless ($options);

  if (length($opt)) {
    # Previous argument was a bare "-x": this one is its value.
    if (!_store_option($opt, $arg)) {
      print STDERR $syntax;
      exit(1);
    }
    $opt = "";
  }
  elsif ($arg =~ /^-(\S)(.*)/) {
    $opt = $1;
    my $val = $2;
    $val =~ s/^\s+//;
    if ($opt eq "-") {
      $options = 0;
    }
    elsif (index("amsSichuo", $opt) == -1) {
      print STDERR $syntax;
      exit(1);
    }
    if (length($val)) {
      # Value glued to the option letter ("-xvalue").
      if (!_store_option($opt, $val)) {
        print STDERR $syntax;
        exit(1);
      }
      $opt = "";
    }
  }
  # Anything else (a stray word with no pending option) is ignored.
}

# An option letter given as the very last argument has no value at all.
if ($options && length($opt)) {
  print STDERR "option -$opt requires a value !\n".$syntax;
  exit(1);
}

# Validate every -a value and pre-compute what the log filter needs:
#   plain IP          -> entry stays []  (the filter compares strings)
#   address/bitlen    -> entry becomes [network, netmask] as integers
#   address/netmask   -> entry becomes [network, netmask] as integers
# The network is stored already masked, so host bits in the argument
# (e.g. 192.168.1.5/24) do not prevent "(ip & mask) == network" from
# matching.
my $dotted_quad = qr/\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/;
foreach my $addr (keys %search_addr) {
  if ($addr =~ /^$dotted_quad$/) {
    # Round-trip through the converters rejects out-of-range octets.
    if (my_inet_ntoa(my_inet_aton($addr)) ne $addr) {
      print STDERR "$addr isn't an IP address !\n".$syntax;
      exit(1);
    }
  }
  elsif ($addr =~ /^($dotted_quad)\/(\d{1,2})$/) {
    my ($net, $bits) = ($1, $2);
    if (my_inet_ntoa(my_inet_aton($net)) ne $net) {
      print STDERR "address $net from $addr isn't an IP address !\n".$syntax;
      exit(1);
    }
    # The pattern guarantees one or two digits, so only the upper bound
    # needs checking.
    if ($bits > 32) {
      print STDERR "bitlen $bits from $addr must be between 0-32 !\n".$syntax;
      exit(1);
    }
    my $mask = my_inet_aton(bitlen2netmask($bits));
    $search_addr{$addr} = [ my_inet_aton($net) & $mask, $mask ];
  }
  elsif ($addr =~ /^($dotted_quad)\/($dotted_quad)$/) {
    my ($net, $netmask) = ($1, $2);
    if (my_inet_ntoa(my_inet_aton($net)) ne $net) {
      print STDERR "address $net from $addr isn't an IP address !\n".$syntax;
      exit(1);
    }
    # A real netmask survives netmask -> bitlen -> netmask unchanged.
    if ((my_inet_ntoa(my_inet_aton($netmask)) ne $netmask) || (bitlen2netmask(netmask2bitlen($netmask)) ne $netmask)) {
      print STDERR "$netmask from $addr isn't a netmask !\n".$syntax;
      exit(1);
    }
    my $mask = my_inet_aton($netmask);
    $search_addr{$addr} = [ my_inet_aton($net) & $mask, $mask ];
  }
  else {
    print STDERR "$addr isn't an IPv4 or IP/Netmask address !\n".$syntax;
    exit(1);
  }
  $is_search_addr = 1;
}
# Validate the -m values.  The loop variable aliases the array element,
# so the normalisation below is what the log filter later compares with.
# "HEADER" (the spelling used in the usage text) is kept as an alias of
# HEAD, which is the method name that actually appears in requests.
foreach my $method (@search_http_method) {
  $method = uc($method);
  $method = "HEAD" if ($method eq "HEADER");
  if ($method !~ /^(?:GET|POST|PUT|HEAD)$/) {
    print STDERR "http_method(-m) must be GET, POST, PUT, HEAD or HEADER !\n".$syntax;
    exit(1);
  }
}
# Turn the -s / -S values into plain byte counts.  A number followed by
# b, k, m or g (any case, trailing text ignored) is scaled accordingly;
# a bare number is kept as is; anything else is a usage error.
foreach my $limit (\$search_min_size, \$search_max_size) {
  next unless (defined($$limit));
  if ($$limit =~ /^(\d+)([bkmg]).*/i) {
    my ($amount, $unit) = ($1, lc($2));
    if    ($unit eq "k") { $amount = $amount*1024; }
    elsif ($unit eq "m") { $amount = $amount*1048576; }
    elsif ($unit eq "g") { $amount = $amount*1073741824; }
    $$limit = $amount;
  }
  elsif ($$limit !~ /^\d+$/) {
    print STDERR $syntax;
    exit(1);
  }
}
# Convert one end of the -i interval to epoch seconds (local time).
#   $text   - "", "now", "Y?M?D" or "Y?M?D?h?m?s" ("?" = any non-digit)
#   $is_end - true for the end of the interval: a date without a time
#             then means 23:59:59 instead of 00:00:00
# Returns the epoch value, or undef when $text cannot be understood.
sub _parse_interval_endpoint {
  my ($text, $is_end) = @_;
  return time() if (!defined($text) || ($text eq "") || (lc($text) eq "now"));
  return unless ($text =~ /^(\d+)\D(\d+)\D(\d+)(?:\D(\d+)\D(\d+)\D(\d+))?$/);
  my ($year, $month, $day, $hour, $min, $sec) = ($1, $2, $3, $4, $5, $6);
  if (!defined($hour)) {
    ($hour, $min, $sec) = $is_end ? (23, 59, 59) : (0, 0, 0);
  }
  # mktime() would silently normalise month 13 or hour 25; reject instead.
  return if (($month < 1) || ($month > 12) || ($day < 1) || ($day > 31));
  return if (($hour > 23) || ($min > 59) || ($sec > 59));
  my $epoch = mktime($sec, $min, $hour, $day, $month-1, $year-1900);
  return $epoch;
}

# Split the -i value into start and end, then convert both.  The first
# delimiter that yields exactly two parts wins, so the dates themselves
# may contain the other delimiter characters.
if (defined($search_date_time_interval)) {
  my @interval;
  foreach my $delimiter (split(//, '-_,; |>/')) {
    # \Q..\E: the delimiter is a literal character, not a pattern ("|").
    # Limit -1 keeps a trailing empty field, so "2009/12/30-" means
    # "until now".
    @interval = split(/\Q$delimiter\E/, $search_date_time_interval, -1);
    last if (@interval == 2);
  }
  if (@interval == 2) {
    $time_start = _parse_interval_endpoint($interval[0], 0);
    if (defined($time_start)) {
      $time_end = _parse_interval_endpoint($interval[1], 1);
    }
  }
  if ((!defined($time_start)) || (!defined($time_end)) || ($time_start>$time_end)) {
    print STDERR $syntax;
    exit(1);
  }
}
# Every -c value has to appear in @proxy_result_codes.
foreach my $requested (@search_proxy_code) {
  next if (defined(in_array($requested, \@proxy_result_codes)));
  print STDERR "proxy result code (-c) must be one of the ".join(',', @proxy_result_codes)." !\n".$syntax;
  exit(1);
}
# Parse one -h value.
#   $spec       - "N" or "N<delim>M", <delim> being one of - _ , ; space | > /
#   $min, $max  - inclusive bounds both numbers must respect
# Returns [low, high] (low == high for a single code), or undef when the
# value is malformed, out of bounds, or has low > high.
sub _parse_http_code_spec {
  my ($spec, $min, $max) = @_;
  my ($low, $high);
  if ($spec =~ /^(\d+)$/) {
    ($low, $high) = ($1, $1);
  }
  # The delimiter is matched as a literal character class; handing it to
  # split() as a pattern made "|" unusable.
  elsif ($spec =~ /^(\d+)[-_,; |>\/](\d+)$/) {
    ($low, $high) = ($1, $2);
  }
  else {
    return;
  }
  return if (($low < $min) || ($high > $max) || ($low > $high));
  return [ $low + 0, $high + 0 ];
}

# Replace every -h value by its [low, high] pair, as expected by the log
# filter; stop with a usage error on the first bad value.
for my $i (0 .. $#search_http_code) {
  my $code = $search_http_code[$i];
  my $range = _parse_http_code_spec($code, $min_http_code, $max_http_code);
  if (!defined($range)) {
    print STDERR "http code (-h $code) must be number between $min_http_code and $max_http_code, or a interval of this numbers (ex.: 200-399) !\n".$syntax;
    exit(1);
  }
  $search_http_code[$i] = $range;
}

# Bytes per client, date and host, collected only when -o is given:
#   $odata{ip}{year}{month}{day}{host} = bytes
%odata=();

# Reduce a logged URL to its host: drops "scheme://", the path, any
# "user@" / "user:pass@" prefix and a trailing ":port".  Userinfo is
# removed before the port so digits in a password are not mistaken for
# a port number.
sub _url_host {
  my ($url) = @_;
  my $host = $url;
  $host =~ s/^[a-z][a-z0-9+.\-]*:\/\///i;  # scheme://
  $host =~ s/\/.*$//;                      # /path
  $host =~ s/^[^@]*@//;                    # user@ or user:pass@
  $host =~ s/:\d+$//;                      # :port
  return $host;
}

# Read the access log from STDIN, keep the lines that satisfy every
# criterion given on the command line, and either print them or add
# their size to %odata.  Lines that do not look like the sample below
# (for instance lines without a dotted IPv4 client address) are skipped.
LINE: while (defined(my $line = <STDIN>)) {
  chomp($line);
  # 1264062832.636    213 192.168.203.26 TCP_CLIENT_REFRESH_MISS/200 1413 GET http://89.202.157.227/nod_eval/update.ver - DIRECT/89.202.157.227 application/octet-stream
  next LINE unless ($line =~ /^(\d+)(\.\d+)?\s+(\d+)\s+(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s+([^\/]+)\/(\d+)\s+(\d+)\s+(\S+)\s+(\S+)/);
  my ($utime, $ip, $proxy_code, $http_code, $http_size, $http_method) = ($1, $4, $5, $6, $7, $8);
  my $http_url = lc($9);

  # interval
  if (defined($time_start) && defined($time_end)) {
    next LINE if (($utime < $time_start) || ($utime > $time_end));
  }

  # ip: an empty entry is a plain address (string compare), otherwise
  # the entry is [network, netmask].  Both sides are masked so the test
  # also works when the stored network still carries host bits.
  if ($is_search_addr) {
    my $nip = my_inet_aton($ip);
    my $valid = 0;
    foreach my $addr (keys %search_addr) {
      my $entry = $search_addr{$addr};
      if (!@$entry) {
        next unless ($ip eq $addr);
      }
      else {
        my ($network, $mask) = @$entry;
        next unless (($nip & $mask) == ($network & $mask));
      }
      $valid = 1;
      last;
    }
    next LINE if (!$valid);
  }

  # proxy code
  if (@search_proxy_code) {
    next LINE unless (grep { $proxy_code eq $_ } @search_proxy_code);
  }

  # http code/status: each entry is a [low, high] pair
  if (@search_http_code) {
    my $valid = 0;
    foreach my $range (@search_http_code) {
      my ($scode, $ecode) = @$range;
      if (($http_code >= $scode) && ($http_code <= $ecode)) {
        $valid = 1;
        last;
      }
    }
    next LINE if (!$valid);
  }

  # http size
  next LINE if (defined($search_min_size) && ($http_size < $search_min_size));
  next LINE if (defined($search_max_size) && ($http_size > $search_max_size));

  # http method
  if (@search_http_method) {
    next LINE unless (grep { $http_method eq $_ } @search_http_method);
  }

  # http url: index() returns -1 when the substring is absent and 0 when
  # it starts the string, so it must be compared, not tested for truth.
  if (@search_url) {
    next LINE unless (grep { index($http_url, $_) != -1 } @search_url);
  }

  my ($day, $month, $year) = (localtime($utime))[3, 4, 5];
  $year += 1900;
  $month++;

  if (defined($output_data_file_suffix)) {
    $odata{$ip}{$year}{$month}{$day}{_url_host($http_url)} += $http_size;
  }
  else {
    print "$year-$month-$day $ip $http_size $http_url\n";
  }
}

# With -o, write one file per client IP, year and month, named
# "<suffix>_<ip>_<year>_<MM>.txt".  Each row is "<DD>: <bytes> <host>";
# rows are ordered by day, and within a day by bytes, largest first.
if (defined($output_data_file_suffix)) {
  foreach my $ip (keys %odata) {
    foreach my $year (sort { $a <=> $b } keys %{$odata{$ip}}) {
      foreach my $month (sort { $a <=> $b } keys %{$odata{$ip}{$year}}) {
        my $days = $odata{$ip}{$year}{$month};
        my $file = "${output_data_file_suffix}_${ip}_${year}_".sprintf("%02d", $month).".txt";
        # Three-argument open: the user-supplied prefix can never be
        # interpreted as an open mode.  Failures are fatal instead of
        # silently losing the report.
        open(my $out, '>', $file) or die "cannot write $file: $!\n";
        foreach my $day (sort { $a <=> $b } keys %$days) {
          my $bytes_of = $days->{$day};
          foreach my $host (sort { $bytes_of->{$b} <=> $bytes_of->{$a} } keys %$bytes_of) {
            print {$out} sprintf("%02d", $day).": ".$bytes_of->{$host}." $host\n";
          }
        }
        # Buffered write errors only surface at close.
        close($out) or die "cannot finish writing $file: $!\n";
      }
    }
  }
}



# Convert an IPv4 address string to its 32-bit integer value
# (192.168.1.1 -> 3232235777).
# Returns undef (an empty list in list context) when Socket's inet_aton()
# cannot convert the string.
# No prototype: the sub takes one argument, and the former "()" made it
# callable only through the &name(...) form.
sub my_inet_aton {
  my ($address) = @_;
  my $packed = inet_aton($address);
  return unless (defined($packed));
  # "N" = one 32-bit big-endian value, which is all inet_aton() returns.
  return unpack("N", $packed);
}

# Convert a 32-bit integer to dotted-quad notation
# (3232235777 -> "192.168.1.1").
# An undefined argument is treated as 0 and gives "0.0.0.0"; the address
# checks rely on that to reject strings my_inet_aton() could not convert.
# No prototype: the sub takes one argument.
sub my_inet_ntoa {
  my ($number) = @_;
  $number = 0 unless (defined($number));
  return inet_ntoa(pack("N", $number));
}

# Convert a prefix length (0..32) to a dotted-quad netmask
# (24 -> "255.255.255.0").
# The arithmetic starts from an explicit 32-bit all-ones value; "~0" is
# 64 bits wide on most perls and only gave the right answer because
# pack("N") dropped the upper half.
# No prototype: the sub takes one argument.
sub bitlen2netmask {
  my ($bitlen) = @_;
  $bitlen = 0 unless (defined($bitlen));
  # 2**(32-bitlen) addresses share one network; clearing that many low
  # values from all-ones leaves the mask.
  my $mask = 0xFFFFFFFF - (2 ** (32 - $bitlen)) + 1;
  return inet_ntoa(pack("N", $mask));
}

# Convert a dotted-quad netmask to its prefix length
# ("255.255.255.0" -> 24) by counting the leading 1 bits.
# For a mask whose 1 bits are not contiguous the result is only the
# length of the leading run, so converting it back with bitlen2netmask()
# gives a different string; the netmask check uses exactly that
# round-trip.  Returns undef when the string cannot be converted.
# The former log()-based formula used "~0", which is 64 bits wide on
# most perls and produced a negative length for every mask.
# No prototype: the sub takes one argument.
sub netmask2bitlen {
  my ($netmask) = @_;
  my $packed = inet_aton($netmask);
  return unless (defined($packed));
  my ($ones) = unpack("B32", $packed) =~ /^(1*)/;
  return length($ones);
}

# Find a string in an array.
#   $search    - string to look for (compared with eq)
#   $array_ref - reference to the array to search
# Returns the index of the first matching element, or undef when there
# is none.  Index 0 is a valid hit, so test the result with defined().
# No prototype: the sub takes two arguments.
sub in_array {
  my ($search, $array_ref) = @_;
  for my $i (0 .. $#$array_ref) {
    return $i if ($array_ref->[$i] eq $search);
  }
  return;
}