package Lire::Apache;

use Lire::Config;
use Lire::Time;
use Lire::DlfSchema;
use Lire::Logger;

use strict;
use vars qw/ @ISA @EXPORT_OK $combined_dlf_maker $common_dlf_maker 
	     $modgzip_dlf_maker $referer_dlf_maker /;

BEGIN {
    require Exporter;
    @ISA = qw/ Exporter /;
    @EXPORT_OK = qw/ combined2dlf common2dlf modgzip2dlf referer2dlf /;

    Lire::Config->init();

    my $schema	= Lire::DlfSchema::load_schema( "www" );
    my @common_fields = qw/client_host who http_result requested_page_size
			   http_action requested_page http_protocol time/;

    $common_dlf_maker = $schema->make_hashref2asciidlf_func( @common_fields );
    $combined_dlf_maker = 
      $schema->make_hashref2asciidlf_func( @common_fields,
					   qw/referer useragent/);

    $modgzip_dlf_maker = 
      $schema->make_hashref2asciidlf_func( @common_fields,
					   qw/gzip_result compression/ );

    $referer_dlf_maker = 
      $schema->make_hashref2asciidlf_func( qw/requested_page referer/  );
}

sub common2dlf {
    my %dlf = ();
    my $rest = parse_common( $_[0], \%dlf );
    lr_warn("common2dlf ignoring trailing data after the CLF record: '$rest'")
      if length $rest;

    return $common_dlf_maker->( \%dlf );
}

#
# This was rewritten to use a complex regex. Tests shows
# that this performs better than the old one using split and 
# such.
# 
# It has also the advantage of being simpler.
#
# Performance data
#		old-500	    new-500	old-5000	new-5000 
# Real:		 0.91	     1.05	7.59		 7.03
# User:		 0.79	     0.97	6.91		 6.40
# System:	 0.08	     0.06	0.45		 0.39
#
#		old-15000   new-15000	old-30000	new-30000 
# Real:		22.52	    20.25	44.99		40.235
# User:		20.69	    18.50	41.17		36.810
# System:	 1.17	     1.14	 2.45		 2.210
#
# Altough it is faster on small inputs, there is a constant 10% speedup 
# for log files that have more than 5000 lines
sub parse_common {
    my $dlf = $_[1];

    my ($time, $http_request, $rest);
    ( $dlf->{'client_host'},
      $dlf->{'who'},
      $time,
      $http_request,
      $dlf->{'http_result'},
      $dlf->{'requested_page_size'},
      $rest,
    ) = $_[0] =~ m/^
        (\S+)\s+         # client_host
        \S+\s+
        (\S+)\s+         # who
        (\[.*?\])\s+     # time
        "(.*?)"\s+       # http_request
        (\d+|-)\s+       # http_result
        (\d+|-)(.*)      # requested_page_size
      $/x or die "parse_common invalid CLF line '" . $_[0] . "'\n";

    my $l;
    ( $dlf->{'http_action'}, $l ) = $http_request =~ m/^\s*(\w+)\s*(.*)$/;

    if ( $l ) {
	( $dlf->{'requested_page'},
	  $dlf->{'http_protocol'}) = $l =~ m|^(\S+)\s*(HTTP/[\d.]+)\s*$|;

	# In case the protocol made the match failed
	$dlf->{'requested_page'} = $l
	  unless $dlf->{'requested_page'};
    } else {
	# Prevent undefined warnings
	$dlf->{'requested_page'} = "";
    }


    # parse_next_token removes the delimiter
    $dlf->{'time'} = clf2cal( $time );

    # Normalize size field
    $dlf->{'requested_page_size'} = 0
      if ( $dlf->{'requested_page_size'} eq '-' );

    return defined $rest && length $rest ? $rest : '';
}

sub combined2dlf {
    my %dlf = ();
    my $rest = parse_common( $_[0], \%dlf );

    die "combined2dlf line looks like Common Log Format not combined\n"
      unless length $rest;

    ( $dlf{'referer'},
      $dlf{'useragent'}
    ) = $rest =~ /^ "(.*)" "(.*)"$/
      or warn "combined2dlf: can't parse referer and useragent fields: $rest\n";

    return $combined_dlf_maker->( \%dlf );
}

sub modgzip2dlf {
    my %dlf = ();
    my $rest = parse_common( $_[0], \%dlf );
    die "modgzip2dlf line looks like Common Log Format not modgzip"
      unless length $rest;

    ( $dlf{'gzip_result'},
      $dlf{'compression'},
    ) = $rest =~ m!^ (\S+) (\d+|-)$!
      or warn "modgzip2dlf can't parse modgzip specific fields in '$rest'\n";

    return $modgzip_dlf_maker->( \%dlf );
}

sub referer2dlf {
    my %dlf = ();

    ( $dlf{'referer'},
      $dlf{'requested_page'}
    ) = $_[0] =~ /^(.*) -> (.*)$/
      or die "referer2dlf can't parse referer log line\n";
    
    return $referer_dlf_maker->( \%dlf );
}

1

__END__

=pod

=head1 NAME

Lire::Apache - parse various apache logfile lines and generate DlfSchema objects

=head1 SYNOPSIS

 use Lire::Apache qw(common2dlf);

 my $dlf = common2dlf( $logline );
 print join( " ", @$dlf), "\n";

For 'common' one can also read 'combined', 'modgzip' or 'referer'.

=head1 DESCRIPTION

This module offers the subroutines combined2dlf, common2dlf, modgzip2dlf
and referer2dlf.  These routines take a log file line of the specified type
as their argument and return a I<logtype>_dlf_maker->( \%dlf ) object, as
constructed by the appropiate &Lire::DlfSchema::make_hashref2asciidlf_func
call.

=head1 SEE ALSO

combined2dlf(1), common2dlf(1), modgzip2dlf(1), referer2dlf(1)

=head1 VERSION

$Id: Apache.pm,v 1.2 2006/07/23 13:16:36 vanbaal Exp $

=head1 COPYRIGHT

Copyright (C) 2000, 2001 Stichting LogReport Foundation LogReport@LogReport.org

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program (see COPYING); if not, check with
http://www.gnu.org/copyleft/gpl.html.

=head1 AUTHORS

Joost van Baal <joostvb@logreport.org> and Francis J. Lacoste
<flacoste@logreport.org>

=cut

