#! @PERL@ -w # vim:syntax=perl use strict; use lib '@LR_PERL5LIBDIR@'; package Lire::Proxy::MS_ISALog; use base qw/Lire::W3CExtendedLog/; use Lire::DlfSchema; use Lire::W3CExtendedLog; use Lire::Program qw( :msg ); use Lire::Utils qw/parse_url/; my $debug = 0; sub debug { $debug and lr_debug($_[0]); } my $schema = Lire::DlfSchema::load_schema( "proxy" ); my %ms_type2regex = ( # MS enhanced string, using tab as field separator ms_string => '([^\t]+)', ); my %ms_identifier2type = ( # MS workaround 'username' => 'ms_string', 'agent' => 'ms_string', 'referred' => 'name', 'host' => 'name', 'port' => 'integer', 'computername' => 'uri', 'object-source' => 'uri', 'operation' => 'uri', 'protocol' => 'uri', 'mime-type' => 'uri', 'rule#1' => 'uri', 'rule#2' => 'uri', ); # see http://www.w3.org/TR/WD-logfile.html my %isa_field2proxy_dlf = ( 'c-ip' => 'client_ip', 'c-host' => 'client_host', 'cs-username' => 'user', 'c-agent' => 'useragent', # e.g. Mozilla/4.0 (compatible; MSIE 5.0; Win32) # or Outlook Express/5.0 (MSIE 5.0; Windows 98; DigExt) # 's-computername' => 'computername', # Proxy name (s-computername) The name of the computer running ISA # Server. This is the computer name that is assigned in Windows # 2000. e.g. GRO1SYX01 # Currently unmapped # 'cs-referred' => 'result_src_host', # Referring server name (cs-referred): If ISA Server is used upstream in # a chained configuration, this indicates the server name of the # downstream server that sent the request. # Unmapped for now. 'r-host' => 'result_src_host', 'r-ip' => 'result_src_ip', 'r-port' => 'result_src_port', # For the Web Proxy service, a hyphen (-) in this field may # indicate that an object was sourced from the Web Proxy server # cache and not from the destination. One exception is negative # caching. In that case, this field indicates a destination IP # address for which a negative-cached object was returned. # 'sc-bytes' => 'bytes', # We'll add up cs-bytes and sc-bytes later on 'sc-status' => 'req_result', 'cs-protocol' => 'protocol', 's-operation' => 'operation', #'cs-uri' => , # This will be parsed for dst_host and requested_url # 's-object-source' => 'result_src_code', # Object source (s-object-source) Indicates the source that was used to # retrieve the current object. This field applies only to the Web Proxy # service log. # This overlaps the cache_result and result_src_result 'time-taken' => 'duration', # This is in millisecond, we need to postprocess it 'rule#1' => 'rule', 'rule#2' => 'rule', # Rule#1 is about protocol rule # Rule#2 is about content filtering # Only one of the two should be defined in each record 'cs-mime-type' => 'type', ); # values and their meanings: # 0 No source information is available. # Cache Source is the cache. Object returned from cache. # Inet Source is the Internet. Object added to cache. # Member Returned from another array member. # NotModified Source is the cache. Client performed an If-Modified-Since # request and object had not been modified. # NVCache Source is the cache. Object could not be verified to source. # Upstream Object returned from an upstream proxy cache. # Vcache Source is the cache. Object was verified to source and had # not been modified. # VFInet Source is the Internet. Cached object was verified to source # and had been modified. my %ms_isa2cache_result = ( 0 => 'NONE', Cache => 'TCP_HIT', Inet => 'TCP_MISS', Member => 'TCP_MISS', NotModified => 'TCP_IMS_HIT', NVCache => 'TCP_REF_FAIL_HIT', Upstream => 'TCP_MISS', VCache => 'TCP_REFRESH_HIT', VFInet => 'TCP_REFRESH_MISS', ); my %ms_isa2result_src_code = ( 0 => 'NONE', Cache => 'NONE', Inet => 'DIRECT', Member => 'SIBLING_HIT', NotModified => 'NONE', NVCache => 'NONE', Upstream => 'PARENT_HIT', VCache => 'NONE', VFInet => 'DIRECT', ); sub build_parser { my ( $self ) = shift; $self->{field2re} = %ms_type2regex; for my $k (keys %ms_identifier2type) { if (defined $ms_identifier2type{$k}) { # possibly overwrite Lire::W3CExtendedLog values $self->{identifier2type}->{$k} = $ms_identifier2type{$k}; } else { if (defined $self->{identifier2type}->{$k}) { lr_warn( "ms_identifier2type undefined in '$k', keeping W3C " . "default '" . $self->{identifier2type}->{$k} . "' for identifier2type" ); } else { lr_warn( "ms_identifier2type undefined in '$k', " . "identifier2type stays undefined" ); } } } $debug and do { debug( "just filled identifier2type hash, dumping it" ); while ((my $k, my $v) = each %{ $self->{identifier2type} }) { if (defined $v) { debug( "identifier2type{'$k'} = '$v'" ); } else { debug( "identifier2type{'$k'} undefined" ); } } debug( "... done" ); }; # Override some types' lexer while ( my ( $type, $rx ) = each %ms_type2regex ) { $self->{type2regex}{$type} = $rx; } $self->SUPER::build_parser( @_ ); my @fields = split /\s+/, $self->{fields}; my %fields = map { $_ => 1 } @fields; my @mapped = (); my @dlf_fields; foreach my $f ( @fields ) { if ( exists $isa_field2proxy_dlf{$f} ) { push @mapped, $f; push @dlf_fields, $isa_field2proxy_dlf{$f}; } } # Create the DLF maker function push @dlf_fields, "time" if $fields{time}; push @dlf_fields, "client_host" if $fields{'c-ip'}; push @dlf_fields, "bytes" if $fields{'cs-bytes'} || $fields{'sc-bytes'}; push @dlf_fields, "dst_host", "requested_url", "dst_port", "protocol" if $fields{'cs-uri'}; push @dlf_fields, "cache_result", "result_src_code" if $fields{'s-object-source'}; # Keep only one of each my %dlf_fields = map { $_ => 1 } @dlf_fields; @dlf_fields = sort keys %dlf_fields; lr_info( "mapped DLF fields: ", join( ", ", @dlf_fields ) ); my $dlf_maker = $schema->make_hashref2asciidlf_func( @dlf_fields ); $self->{proxy_dlf_converter} = sub { my $w3c = $self->{w3c_parser}->( $_[0] ); # Those fields that are mapped directly my %dlf = ( time => $w3c->{lire_time} ); $dlf{bytes} = 0; $dlf{bytes} += $w3c->{'cs-bytes'} if $w3c->{'cs-bytes'} && $w3c->{'cs-bytes'} ne '-'; $dlf{bytes} += $w3c->{'sc-bytes'} if $w3c->{'sc-bytes'} && $w3c->{'sc-bytes'} ne '-'; if ( exists $w3c->{'cs-uri'}) { my $url = parse_url( $w3c->{'cs-uri'} ); $dlf{'dst_host'} = $url->{'host'}; $dlf{'requested_url'} = $url->{'path'}; $dlf{'dst_port'} = $url->{'port'}; $dlf{'protocol'} = $url->{'protocol'}; # That last one may be overriden by cs-protocol } if ( exists $w3c->{'s-object-source'} ) { my $code = $w3c->{'s-object-source'}; if ( $code eq '-' ) { if ( exists $w3c->{'sc-status'} && $w3c->{'sc-status'} == 403 ) { # Special case for denied requests $dlf{cache_result} = 'TCP_DENIED'; $dlf{result_src_code} = 'NONE'; } else { $dlf{cache_result} = 'NONE'; $dlf{result_src_code} = 'NONE'; } } else { $dlf{cache_result} = $ms_isa2cache_result{$code}; $dlf{result_src_code} = $ms_isa2result_src_code{$code}; $dlf{cache_result} = 'TCP_NEGATIVE_HIT' if exists $dlf{req_result} && $dlf{req_result} == 404 && $dlf{cache_result} eq 'TCP_HIT'; } } foreach my $name ( @mapped ) { # Map field only if it has the non-default value '-' $dlf{$isa_field2proxy_dlf{$name}} = $w3c->{$name} if $w3c->{$name} ne '-'; } $dlf{duration} = sprintf '%.3f', $dlf{duration} / 1_000 if exists $dlf{duration}; # Put client_ip into client_host when it is missing. $dlf{client_host} ||= $dlf{client_ip}; return $dlf_maker->( \%dlf ); } } package main; use Lire::Program qw( :msg :dlf ); my $lines = 0; my $dlflines = 0; my $errorlines = 0; init_dlf_converter( "proxy" ); my $parser = new Lire::Proxy::MS_ISALog; # Parse the header my $line; while (defined( $line = <> )) { last unless $line =~ /^#/; $parser->parse_directive( $line ); } lr_err( "invalid W3C extended log file: must start by Version and Fields " . "directives" ) unless defined $parser->{fields} && defined $parser->{version}; my $todlf = $parser->{proxy_dlf_converter}; # Transform into DLF do { $lines++; if ( $line =~ /^#/ ) { eval { $parser->parse_directive( $line ); }; if ( $@ ) { lr_err( $@ ); $errorlines++; last; } } else { eval { my $dlf = $todlf->( $line ); print join( " ", @$dlf), "\n"; $dlflines++; }; if ($@) { lr_warn( $@ ); lr_notice( qq{cannot convert line $. "$line" to proxy dlf, skipping} ); } } $line = <>; } while (defined $line); end_dlf_converter( $lines, $dlflines, $errorlines ); exit 0; __END__ =pod =head1 NAME ms_isa2dlf - convert Microsoft ISA server logs to DLF =head1 SYNOPSIS B B<[>IB<]> =head1 DESCRIPTION B converts Microsoft Internet Security and Acceleration Server log files in the W3C Extended Log Format to the proxy DLF. The ISA log files are documented on http://www.microsoft.com/technet/prodtechnol/isa/proddocs/isadocs/M_S_C_LoggingFields.asp . =head1 DEBUGGING As any Lire 2dlf program, this program needs adjusted LR_DBDIR, LR_DBFILE, LR_ID and PATH variables. These are set in .../etc/lire/defaults and .../etc/lire/profile_lean. After manually source-ing these files, one can run this program as a standalone application, by invoking it as e.g. zcat ms_isa.log.gz | LR_ID=`date +%Y%m%d.%H%M%S` ./ms_isa2dlf > /tmp/dlf . =head1 EXAMPLES To process a log as produced by the Microsoft ISA Server: $ ms_isa2dlf < ms_isa.log ms_isa2dlf will be rarely used on its own, but is more likely called by lr_log2report: $ lr_log2report ms_isa < /var/log/ms_isa.log =head1 THANKS Chainsaw on OPN irc, for supplying log files. =head1 SEE ALSO w3c_extended2dlf(1) =head1 VERSION $Id: ms_isa2dlf.in,v 1.15 2006/07/23 13:16:36 vanbaal Exp $ =head1 COPYRIGHT Copyright (C) 2001 Stichting LogReport Foundation LogReport@LogReport.org This program is part of Lire. Lire is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program (see COPYING); if not, check with http://www.gnu.org/copyleft/gpl.html. =head1 AUTHOR Joost van Baal , heavily inspired by Francis J. Lacoste's w3c_extended2dlf(1) =cut # Local Variables: # mode: cperl # End: