package Lire::Apache;

use Lire::Time;
use Lire::DlfSchema;
use Lire::Program qw( :msg );

use strict;
use vars qw/ @ISA @EXPORT_OK $combined_dlf_maker $common_dlf_maker 
	     $modgzip_dlf_maker $referer_dlf_maker /;

BEGIN {
    require Exporter;
    @ISA = qw/ Exporter /;
    @EXPORT_OK = qw/ combined2dlf common2dlf modgzip2dlf referer2dlf /;
    
    my $schema	= Lire::DlfSchema::load_schema( "www" );

    my @common_fields = qw/client_host client_domain who http_result 
			   requested_page_size http_action requested_page 
			   requested_page_ext requested_file http_protocol 
			   time/;

    $common_dlf_maker = $schema->make_hashref2asciidlf_func( @common_fields );
    $combined_dlf_maker = 
      $schema->make_hashref2asciidlf_func( @common_fields,
					   qw/referer useragent/);

    $modgzip_dlf_maker = 
      $schema->make_hashref2asciidlf_func( @common_fields,
					   qw/gzip_result compression/ );

    $referer_dlf_maker = 
      $schema->make_hashref2asciidlf_func( qw/requested_page 
					      requested_page_ext 
					      requested_file referer/  );
}

sub common2dlf {
    my %dlf = ();
    my $rest = parse_common( $_[0], \%dlf );
    lr_warn("common2dlf ignoring trailing data after the CLF record: '$rest'")
      if length $rest;

    return $common_dlf_maker->( \%dlf );
}

#
# This was rewritten to use a complex regex. Tests shows
# that this performs better than the old one using split and 
# such.
# 
# It has also the advantage of being simpler.
#
# Performance data
#		old-500	    new-500	old-5000	new-5000 
# Real:		 0.91	     1.05	7.59		 7.03
# User:		 0.79	     0.97	6.91		 6.40
# System:	 0.08	     0.06	0.45		 0.39
#
#		old-15000   new-15000	old-30000	new-30000 
# Real:		22.52	    20.25	44.99		40.235
# User:		20.69	    18.50	41.17		36.810
# System:	 1.17	     1.14	 2.45		 2.210
#
# Altough it is faster on small inputs, there is a constant 10% speedup 
# for log files that have more than 5000 lines
sub parse_common {
    my $dlf = $_[1];

    my ($time, $http_request, $rest);
    ( $dlf->{client_host},
      $dlf->{who},
      $time,
      $http_request,
      $dlf->{http_result},
      $dlf->{requested_page_size},
      $rest,
    ) = $_[0] =~ m/^
        (\S+)\s+         # client_host
        \S+\s+
        (\S+)\s+         # who
        (\[.*?\])\s+     # time
        "(.*?)"\s+       # http_request
        (\d+|-)\s+       # http_result
        (\d+|-)(.*)      # requested_page_size
      $/x or die "parse_common invalid CLF line '" . $_[0] . "'\n";

    # $_[0] =~ m!^(\S+) \S+ (\S+) (\[.*?\]) "(\S*)\s*(.*\S)\s*(HTTP\S*).*" (\d+|-) (\d+|-)(.*)$! 
    #
    # e.g.  GET /index.html HTTP/1.0
    ( $dlf->{http_action},
      $dlf->{requested_page},
      $dlf->{http_protocol}
    ) = $http_request =~ m/^
        (\w+)\s+
        (\S+)\s+
        (HTTP\S+)\s*
      $/x or &lr_info("wacky or no http request '$http_request'");

    # Remove possible uninitialized warnings
    $dlf->{http_action} ||= "-";
    $dlf->{requested_page} ||= "-";
    $dlf->{http_protocol} ||= "-";

    # We need to test IP before name, because the name regex 
    # also match IPs.
    if ($dlf->{client_host} =~ /^(\d+\.\d+\.\d+)\.\d+$/) {
	$dlf->{client_domain} = $1;
    } elsif ( $dlf->{client_host} =~ /^([\w.-]+\.)*([\w.-]+\.[\w.-]+)$/ ) {
	$dlf->{client_domain} = $2;
    } elsif ($dlf->{client_host} =~ /^[\w.-]+$/) {
	# just a hostname
	$dlf->{client_domain} = 'localnet';
    } # other cases handled by default '-'

    # parse_next_token removes the delimiter
    $dlf->{time} = clf2cal( $time );

    parse_url( $dlf );

    # Normalize size field
    $dlf->{requested_page_size} = 0
      if ( $dlf->{requested_page_size} eq '-' );

    return defined $rest && length $rest ? $rest : '';
}

# Fill requested_file and requested_page_ext from
# requested_page 

# TODO : this could better use Lire::WWW::URL
sub parse_url {
    my ( $dlf ) = @_;

    # get rid of http escapes, e.g. map /%7Eedwin/jmj to /~edwin/jmj 
    # use tr or unpack


    # Remove query string to find real file
    my $i = index $dlf->{requested_page}, '?';
    if ( $i < 0 ) {
	$dlf->{requested_file} = $dlf->{requested_page};
    } else {
	$dlf->{requested_file} = substr $dlf->{requested_page}, 0, $i;
    }

    # Parse file extension
    $i = rindex $dlf->{requested_file}, '.';
    $dlf->{requested_page_ext} = substr $dlf->{requested_file}, $i + 1
      unless $i < 0;
}

sub combined2dlf {
    my %dlf = ();
    my $rest = parse_common( $_[0], \%dlf );

    die "combined2dlf line looks like Common Log Format not combined\n"
      unless length $rest;
    my @a=split(/\"/,$rest);

    if ($#a>2) {
	for (my $i=1; $i<$#a-1; $i++) {
	    $dlf{referer}.=$a[$i];
	}
	$dlf{useragent}=$a[$#a];
    } else {
	die "combined2dlf: can't parse referer and useragent fields\n";
    }

    return $combined_dlf_maker->( \%dlf );
}

sub modgzip2dlf {
    my %dlf = ();
    my $rest = parse_common( $_[0], \%dlf );
    die "modgzip2dlf line looks like Common Log Format not modgzip"
      unless length $rest;

    ( $dlf{gzip_result},
      $dlf{compression},
    ) = $rest =~ m!^ (\S+) (\d+|-)$!
      or die "modgzip2dlf can't parse modgzip specific fields in '$rest'\n";

    return $modgzip_dlf_maker->( \%dlf );
}

sub referer2dlf {
    my %dlf = ();

    ( $dlf{referer},
      $dlf{requested_page}
    ) = $_[0] =~ /^(.*) -> (.*)$/
      or die "referer2dlf can't parse referer log line\n";
    
    parse_url( \%dlf );

    return $referer_dlf_maker->( \%dlf );
}

1

__END__

=pod

=head1 NAME

Lire::Apache - parse various apache logfile lines and generate DlfSchema objects

=head1 SYNOPSIS

 use Lire::Apache qw(common2dlf);

 my $dlf = common2dlf( $logline );
 print join( " ", @$dlf), "\n";

For 'common' one can also read 'combined', 'modgzip' or 'referer'.

=head1 DESCRIPTION

This module offers the subroutines combined2dlf, common2dlf, modgzip2dlf
and referer2dlf.  These routines take a log file line of the specified type
as their argument and return a I<logtype>_dlf_maker->( \%dlf ) object, as
constructed by the appropiate &Lire::DlfSchema::make_hashref2asciidlf_func
call.

=head1 SEE ALSO

combined2dlf(1), common2dlf(1), modgzip2dlf(1), referer2dlf(1)

=head1 VERSION

$Id: Apache.pm,v 1.10 2002/02/03 13:19:12 vanbaal Exp $

=head1 COPYRIGHT

Copyright (C) 2000-2001 Stichting LogReport Foundation LogReport@LogReport.org
 
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
 
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
 
You should have received a copy of the GNU General Public License
along with this program (see COPYING); if not, check with
http://www.gnu.org/copyleft/gpl.html or write to the Free Software 
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.

=head1 AUTHORS

Joost van Baal <joostvb@logreport.org> and Francis J. Lacoste
<flacoste@logreport.org>

=cut

