#!/usr/bin/perl
#
# parse_apache_log.pl by Phil2k@gmail.com
#  ver. 0.4
#


use POSIX qw(mktime);

# Three-letter English month abbreviation, as written in Apache log
# timestamps, mapped to its zero-based month number (Jan = 0 ... Dec = 11).
# Zero-based because the values are passed as the "mon" argument of
# POSIX::mktime().
%months = ();
@months{ qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec) } = (0 .. 11);



# Pattern for Apache *error log* lines, e.g.:
#[Sun Oct 11 14:06:57 2009] [error] [client 66.249.65.37] client denied by server configuration: /usr/htdocs  
#
# Captures: $1 weekday, $2 month abbreviation, $3 day of month, $4 hour,
#           $5 minute, $6 second, $7 year, $8 log level, $9 source keyword
#           (e.g. "client"), $10 address, $11 message text.
$regexp1 = '^\[(\S+)\s+(\S+)\s+(\d+)\s+(\d+):(\d+):(\d+)\s+(\d+)\]' # [Sun Oct 11 14:06:57 2009]
         . '\s+\[([^\]]+)\]'                                       # [error]
         . '\s+\[(\S+)\s+(\S+)\]'                                  # [client 66.249.65.37]
         . '\s+(.*)$';                                             # free-form message

# Pattern for Apache *access log* lines (common / combined format, optionally
# followed by extra trailing fields), e.g.:
#67.195.115.26 - - [08/Oct/2009:03:12:41 +0300] "GET /robots.txt HTTP/1.0" 301 306
#67.195.115.26 - - [08/Oct/2009:03:12:41 +0300] "GET /index.html HTTP/1.0" 200 30621 "http://xxx.bla.ro/?a=b" "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.14) Gecko/2009082707 Firefox/3.0.14"
#188.27.109.204 - - [09/Oct/2009:23:53:19 +0300] "GET /images/green_grid4.png HTTP/1.1" 200 3064 "http://phil.ro/h.css" "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1.3) Gecko/20090824 Firefox/3.5.3 (.NET CLR 3.5.30729)"
#188.25.163.57 - - [06/May/2012:03:10:43 +0300] "GET /cache/js/ocazii.js?1333106481 HTTP/1.1" 304 - "http://www.emag.ro/resigilate/p13?&supra_catid=599&catid=75&pret=-1" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19" 0
#10.0.0.1 - alice [08/Oct/2009:03:12:41 +0300] "GET /private/ HTTP/1.1" 200 512
#
# Captures: $1 client host, $2 day of month, $3 month abbreviation, $4 year,
#           $5 hour, $6 minute, $7 second, $8 remainder of the timestamp
#           (timezone offset, including its leading blank), $9 request
#           method, $10 request target, $11 protocol, $12 status code or "-",
#           $13 response size or "-", $14 everything after the size
#           (referer, user agent, extra fields).
#
# The two fields after the host (identd logname and authenticated user) are
# matched as arbitrary non-blank tokens rather than a literal "- -", so
# requests made with HTTP authentication are parsed too. They are
# deliberately not captured, which keeps the capture numbering above stable.
$regexp2 = '^(\S+)\s+\S+\s+\S+'                                        # host ident user
         . '\s+\[(\d+)\/([^\/]+)\/(\d+):(\d+):(\d+):(\d+)(.*?)\]'      # [08/Oct/2009:03:12:41 +0300]
         . '\s+"(\S+)\s+(\S+)\s+(\S+)"'                                # "GET /index.html HTTP/1.0"
         . '\s+(\d+|-)\s+(\d+|-)'                                      # status, size
         . '(.*)$';                                                    # optional trailing fields

# Main loop: read Apache log lines from STDIN, parse each one and print a
# one-line summary per parsed line to STDOUT.
#
# The log flavour is auto-detected from the first line that matches one of
# the two patterns; after that, every line is only tried against the detected
# pattern (this avoids testing both patterns on every line):
#   $use_regexp == 1  -> error log  ($regexp1)
#   $use_regexp == 2  -> access log ($regexp2)
# Lines that cannot be parsed are reported on STDERR and skipped.
#
# The loop reads at the top ("while defined(<STDIN>)") instead of reading
# ahead and testing eof(): this way the last line of the input is processed
# too, and "next" on an unparseable line always advances to the following line.
$use_regexp = 0; # 0 = format not detected yet
while (defined($line = <STDIN>)) {
  chomp $line;

  # Keep trying to detect the format until some line matches, so that a
  # garbage first line does not make the whole input unparseable.
  if (!$use_regexp) {
    if ($line=~/$regexp1/) { $use_regexp=1; }
    elsif ($line=~/$regexp2/) { $use_regexp=2; }
    }

  # Per-line fields; reset on every iteration so nothing leaks from the
  # previous line.
  my $status_code=undef;
  my $ua="";
  my $time=undef;
  my $tz_shift=undef;
  my $source="";
  my $status=undef;
  my $status_text=undef;
  my $ip=undef;
  my $req_method=undef;
  my $size=0;
  my $duration=undef;
  my $req_path=undef;
  my $referer=undef;

  if (($use_regexp==1) && ($line=~/$regexp1/)) {
    #mktime(sec, min, hour, mday, mon, year, wday = 0, yday = 0, isdst = 0)
    $time = mktime($6, $5, $4, $3, $months{$2}, $7-1900);
    $status = $8; # "error"
    $source = $9; # "client"
    $ip = $10; # "66.249.65.37"
    $status_text = $11; # "client denied by ..."
    $status_code = 403; # error-log lines carry no HTTP status; 403 is assumed for all of them
    }
  elsif (($use_regexp==2) && ($line=~/$regexp2/)) {
    # Copy every capture out first: the matches on $rest further down
    # overwrite $1, $2, ...
    $ip=$1; # "67.195.115.26"
    #mktime(sec, min, hour, mday, mon, year, wday = 0, yday = 0, isdst = 0)
    $time = mktime($7, $6, $5, $2, $months{$3}, $4-1900);
    $tz_shift = $8; # " +0300" ... not applied to $time yet... maybe in future versions
    $req_method = $9; # GET/POST
    $req_path = $10; # "/index.html"
    $proto_ver = $11; # "HTTP/1.0"
    $status_code = $12; # 200, 404, etc.
    $size = $13;
    my $rest = $14;

    # Classify by status class; a "-" status compares as 0 and ends up as "?".
    if ($status_code>399) {
      $status="error";
      }
    elsif ($status_code>299) {
      $status="redirect";
      }
    elsif ($status_code>199) {
      $status="ok";
      }
    else {
      $status="?";
      }

    # Apache logs "-" instead of 0 when no response body was sent.
    $size = 0 if $size eq '-';

    $tz_shift=~s/^\s+//; # left trim
    $rest=~s/^\s+//; # left trim

    # Optional trailing fields: "referer", then "user agent", then a number
    # (stored as $duration).
    if ($rest=~/^"([^"]+)"$/) {
      $referer=$1;
      }
    elsif ($rest=~/^"([^"]+)"\s+"([^"]+)"$/) {
      $referer=$1;
      $ua=$2;
      }
    elsif ($rest=~/^"([^"]+)"\s+"([^"]+)"\s+(\d+)$/) {
      $referer=$1;
      $ua=$2;
      $duration=$3;
      }
    else {
      # Unrecognised (or empty) tail: keep it verbatim.
      $referer=$rest;
      }
    $source="client";
    $status_text=$req_path;
    }
  else {
    print STDERR "Cannot parse line: \"$line\" ! Please send this line to phil2k\@gmail.com to resolve it !\n";
    next;
    }
  $now = time();
  ### Here you can enter your code, and use variables:
  ## $time = unix timestamp of the date-time extracted from the log line
  ##         (computed with mktime, i.e. in the local timezone; $tz_shift is not applied)
  ## $now = current timestamp (to compare with $time if you want)
  ## $ip = remote ip address
  ## $source = source of message ( usually = "client" )
  ## $status = short status/error ( "ok", "error", "redirect", "?" for access logs; the log level for error logs )
  ## $status_code = HTTP status code ( 200, 404, etc.. ; always 403 for error-log lines )
  ## $status_text = error string for errors or the request path
  ## $req_method = HTTP request method ( GET, POST, etc... ; access logs only )
  ## $req_path = request target exactly as sent to the server, including any query string ( access logs only )
  ## $proto_ver = HTTP version ( usually = "HTTP/1.0" or "HTTP/1.1" ; access logs only )
  ## $size = response size in bytes ( 0 when the log shows "-" )
  ## $referer, $ua = referer and user agent, when present in the line
  ## $duration = trailing numeric field, when present in the line
  ## $tz_shift = timezone offset text from the timestamp ( e.g. "+0300" ; access logs only )
  ## ( $url is not set by this version of the script )
  print "$ip $time method=$req_method status($status_code)=$status size=$size bytes text=$status_text\n";
  ###
  }