package Lire::Email;

use strict;

use Lire::Syslog;
use Lire::Logger;

use Carp;

use vars qw/ @ISA @EXPORT/;

BEGIN {
    require Exporter;
    @ISA = qw/ Exporter Lire::Syslog /;
    @EXPORT = qw/ print_dlf sanitize sanitize_tos splitemailadress splitstat
      splitrelay/;
}

#
# be vary carefull when setting non-zero
#
my $debug = 0;
sub debug {
    $debug and lr_debug($_[0]);
}

my $undefinedstring = 'UNKNOWN';
my $undefinedint = '0';

sub parse {
    my $subroutine = "Lire::Email::parse";

    my ( $self, $line ) = @_;

    my $rec = $self->SUPER::parse($line);

    die "$subroutine: bogus line '$line': no content or process\n"
      unless defined $rec->{'content'} && defined $rec->{'process'};

    my ($queueid, $flags) = split ":", $rec->{'content'}, 2;

    # May 20 10:00:22 hibou postfix/smtp[11109]: connect to 
    #   subdimension.com[209.150.29.129]: Connection refused (port 25)
    # Dec  1 08:16:11 myhost postfix/smtpd[15922]: connect from 
    #   duh.duh.com[1.4.2.2]
    # May 22 05:43:09 hibou postfix/smtpd[24585]: reject: RCPT from
    #   pool-63.49.34.7.mmph.grid.net[63.49.34.7]: 554 <franksluck2@aol.com>:
    #   Recipient address rejected: Relay access denied;
    #   from=<secret_agent12@hotmail.com> to=<franksluck2@aol.com>
    # Jan  2 22:52:09 topaz postfix/local[22496]: warning: biff_notify: 
    #   Connection refused
    if ($rec->{'process'} =~ /^postfix/) {
        if ($rec->{'content'} !~ /=/ and
          $rec->{'content'} =~ /enabling PIX <CRLF>\.<CRLF> workaround for/) {
            # Mar  3 04:23:54 ohno.example.net postfix/smtp[6181]: [ID 197553
            #  mail.info] 4C986AB436: enabling PIX <CRLF>.<CRLF> workaround
            #  for www.example.com[100.76.64.17]
            lr_notice( "$subroutine: skipping PIX workaround line: we do not use " .
		       "this info (yet)\n" );
            return $rec;
        }

        if ($rec->{'content'} =~ /gethostbyaddr: [^ ]+ != [\d\.]+$/) {
            # Mar  3 03:50:40 ohno.example.net postfix/smtpd[5801]: [ID 484914
            #  mail.notice] gethostbyaddr: lycoseumailbox.example.com. !=
            #  100.193.12.51
            lr_notice( "$subroutine: skipping nonmatching gethostbyaddr line: we " .
		       "do no use this info (yet)\n" );
            return $rec;
        }

	# Postfix queueid has 9-12 digits or letters
	return $rec unless $queueid =~ /^[0-9a-zA-Z]{8,14}$/;
    } elsif ($rec->{'process'} =~ /^sendmail/) {
	# Sendmail queueid has 8-14 digits or letters
	return $rec unless $queueid =~ /^[0-9a-zA-Z]{8,14}$/;
    }

    unless (defined $flags) {
        # bogus line
	die "$subroutine: skipping line '$line': no ':' in content\n"
	  if $rec->{'process'} =~ /^sendmail/ || $rec->{'process'} =~ /^postfix/;

	return $rec;
    }
    my %flags = (
        map { 
            ( m/^\s*([^=]+)=(.+)$/ )
        } ( split ", ", $flags )
    );
    $rec->{'queueid'} = $queueid;
    $rec->{'flags'} = \%flags;

    debug("$subroutine: starting assigning, queueid is '$queueid'");
    for my $k (keys %flags) {
        debug("$subroutine: assigning '" . $flags{$k} .  "' to '$k'");
        $rec->{$k} = $flags{$k};
    }
    debug("$subroutine: finished assigning");

    return $rec;
}

#
# returns nof printed lines
#
sub print_dlf {
    my ($c, $dlf_maker, $msg, $only_to ) = @_;

    my $dlflines = 0;
    my $subroutine = 'print_dlf';

    # Check for guard value
    return 0 if $only_to && $c->{'nrcpts'} == -1;

    my %dlf = map { $_ => $c->{$_} } 
      qw/time from_user from_domain from_relay_host from_relay_ip
         logrelay queueid msgid size stat xstat/;

    my $dlfid = $c->{'logrelay'} . $c->{'queueid'};
    if (! defined $c->{'deliveries'}) {
        # Message rejected before the sender was known
        my $dlf = $dlf_maker->( \%dlf );
        print join( " ", @$dlf ), "\n";
        $dlflines++;
        debug("$subroutine printed '", join( " ", @$dlf), "'");
        delete $msg->{$dlfid};
    } else {
        foreach my $to ( keys %{$c->{'deliveries'}} ) {
            next if defined $only_to && $only_to ne $to;

            my $to_infos = $c->{'deliveries'}{$to};
            foreach my $f ( qw/delay to_user to_domain to_relay_host
                               to_relay_ip stat xstat/ )
            {
                $dlf{$f} = $to_infos->{$f};
            }
            my $dlf = $dlf_maker->( \%dlf );
            print join( " ", @$dlf ), "\n";
            debug("$subroutine printed '", join( " ", @$dlf), "'");
            $dlflines++;

            $c->{'nrcpts'}--;
            delete $c->{'deliveries'}{$to};
        }
        delete $msg->{$dlfid} unless $c->{'nrcpts'};
    }

    return $dlflines;
}



########################

sub sanitize {
    my $subroutine = "sanitize";
    croak "$subroutine: give 3 args"
      unless @_ == 3;

    my $what = shift;
    my $it = shift;
    # we are gonna write $_[0], the last arg.

    croak "$subroutine: called with undef what arg"
      unless defined $what;
    croak "$subroutine: called with undef it arg"
      unless defined $it;

    $it ||= "-";

    # get rid of leading and trailing whitespace
    $it =~ s/\s+$//g;
    $it =~ s/^\s+//g;

    # sanitize internal whitespace
    $it =~ s/ /_/g;

    if ($what eq 'emailadress') {
        if ($it eq "<>") {
            $it = "MAILER-DAEMON";
        } else {
            # we should deal with from=John Kennedy <joe@example.com>
            # and from=<BATCH1@skn.example.com    >
            $it =~ s/^[^<]*<(\S*?)\s*>$/$1/;
            # strip possible trailing dot
            $it =~ s/\.$//;
        }
    } elsif ($what eq 'relay') {
        # if we encouter foo.example.com._[10.1.7.2], we wanna save.... uhh...
        # strip trailing dot. sendmail is reported to have these things.
        $it =~ s/\.$//;
    } elsif ($what eq 'relayhost') {
        # a fqdn
        # strip trailing dot. sendmail is reported to have these things.
        $it =~ s/\.$//;
        # we might have found an ip address instead of a 'real' fqdn,
        #  handle [ and ]
        $it =~ s/^\[?([^\[\]]+)\]?$/$1/;
    } elsif ($what eq 'relayip') {
        # an ip address
        # strip leading [ and trailing ]
        $it =~ s/^\[?([^\[\]]+)\]?$/$1/;
    } elsif ($what eq 'size') {
        # some postfix logs have lines like
        # Dec 3 04:05:29 host postfix/qmgr[25829]: 7F3FC1758: 
        #   from=<root@example.com>, size=986865 (queue active)
        # we wanna strip the '(queue active)'
        my ($size, $tmp);
        if (($size, $tmp) = $it =~ /^(\d+)([^\d]*)$/) {
            $it = $size;
        } elsif ($it eq '-') {
            # don't warn when we were using default anyway
            debug("keeping size - as is");
        } else {
            lr_notice("cannot sanitize size '$it', giving default");
            $it = '-';
        }
    } elsif ($what eq 'stat') {
        ;
    } elsif ($what eq 'msgid') {
        ;
    } elsif ($what eq 'delay') {
        # convert dd+hh:mm:ss to s
        my ($tmp, $dd, $hh, $mm, $ss);
        if (($tmp, $dd, $hh, $mm, $ss) = 
            $it =~ /^(([\+\-]?[0-9]*)\+?)??([\+\-]?[0-9]+):([\+\-]?[0-9]+):([\+\-]?[0-9]+)$/) {
            $dd = 0 unless ($dd);
        } else {
	    die "$subroutine: cannot sanitize delay '$it': ",
	      "invalid format. should look like [dd+]hh:mm:ss\n";
        }
        $it = $dd * 24 * 3600 + $hh * 3600 + 60 * $mm + $ss;
        $it = $undefinedint if ($it < 0);      # this occurs, fields like
            # delay=00:-3:-36 are found in sendmail logs. we do not want 
            # to find such evil stuff in our dlf
    } else {
        croak "$subroutine: cannot sanitize a '$what', not even '$it'";
    }

    #
    # force unicity
    #
    $_[0] = lc $it;

    1;
}

sub sanitize_tos
{
    my $subroutine = 'sanitize_tos';

    my $what = shift;
    my @return;

    for my $to (split /,/, $what) {
        my $new;
        sanitize('emailadress', $to, $new);
	
        # push @return, { 'to' => $new };
        push @return, $new;
        debug("$subroutine: pushed $new from $what");
    }

    return \@return;
}

sub splitemailadress
{
    my $subroutine = 'splitemailadress';

    croak "$subroutine: give 1 arg" 
      unless @_ == 1;

    my $m = shift;
    croak "$subroutine: called with undef arg" 
      unless defined $m;

    my $u = '';
    my $d = '';
    if ($m =~ /@/) {
        $m =~ /^(.*)@([^@]*)$/;
        defined $1 and $u = $1; 
        defined $2 and $d = $2;
    } else {
        $u = $m;
        $d = 'localhost';
    }

    $u eq '' and $u = $undefinedstring;
    $d eq '' and $d = $undefinedstring;

    return ($u, $d);
}


sub splitstat
{
    my $subroutine = 'splitstat';

    croak "$subroutine: give 1 arg" 
      unless @_ == 1;

    my $s = shift;

    croak "$subroutine: called with undef arg" 
      unless defined $s;

    my $a = '';
    my $b = '';
    if ($s =~ /_/) {
        $s =~ /^([^_:\(]*)(_|:|\()(.*)$/;
        defined $1 and $a = $1;
        defined $3 and $b = $3;
    } else {
        # e.g. stat=queued
        $a = $s;
        $b = '-';
    }

    $a eq '' and $a = $undefinedstring;
    $b eq '' and $b = $undefinedstring;

    return ($a, $b);
}

# split host.example.nl._[150.0.0.45] in host.example.nl. and [150.0.0.45]
# split foo.com[209.54.94.99] in foo.com and [209.54.94.99]

sub splitrelay
{
    my $subroutine = 'splitrelay';

    croak "$subroutine: give 1 arg" 
      unless @_ == 1;

    my $s = shift;
    croak "$subroutine: called with undef arg"
      unless defined $s;

    my ($a, $b);

    if ($s =~ /[\[_]/) {
        $s =~ /([^_\[]+)[_]?(\[[^\]]+\])/ and return ($1, $2);
        # handle 'relay=[1.2.3.4]'. Tnx schr!
        $s =~ /^\[[^\]]+\]$/ and return ($undefinedstring, $s);
        # bogus:
        return ($s, $undefinedstring);
    } elsif ($s =~ /localhost$/) {
        return ($s, '127.0.0.1');
    } elsif ($s eq 'local') {
        return ('localhost', '127.0.0.1');
    } else {
        return ($s, $undefinedstring);
    }
}

# return a true value to keep perl happy
1;


__END__

=pod

=head1 NAME

Lire::Email - parse and tidy objects commonly found in email logfiles

=head1 SYNOPSIS

 use Lire::Email;

 my $parser = new Lire::Email();

 my $log = $parser->parser( $line );

 sanitize('emailadress', $logfrom, $curfrom);
 $tos = sanitize_tos($log->{'to'})
 ($user, $domain) = splitemailadress($address);
 ($stat, $xstat) = splitstat($status);

 my $dlf_maker = $schema->make_hashref2asciidlf_func( qw/time logrelay ...
   stat xstat /);
 $dlflines += print_dlf( $cur, $dlf_maker, \%msg, $to );

See also the sendmail2dlf code for usage.

=head1 DESCRIPTION

Lire::Email supplies an object oriented interface to store email logfile
lines, similar to the one supplied by Lire::Syslog: it ISA Lire::Syslog.

Furthermore, it supplies routines to manipulate email addresses and
status messages, as found in email logfiles.

All dlf convertors generating email dlf should use this module.

All functions will die on error.

=head2 sanitize

B<sanitize> takes three arguments: B<sanitize>(I<what>, I<it>, I<sane>).
I<what> is one of 'emailaddress', 'relay', 'relayhost', 'relayip'. 'size',
'stat', 'msgid', 'delay'.  I<it> is the current value, in I<sane> the
"sanitized" value gets written.  E.g. if I<what> is emailadddress and I<it>
is 'Joe User <joe@example.com.>', I<sane> gets assigned 'joe@example.com'.

=head2 sanitize_tos

B<sanitize_tos> takes a comma separated list of email addresses as argument.
It returns an array of B<sanitize>d email addresses.

=head2 splitemailadress

B<splitemailadress> takes a sanitized email address as argument.  It returns
an array, which contains the userpart and the domainpart as elements.

=head2 splitstat

B<splitstat> takes a sanitized stat, like
'sent_(baa28063_message_accepted_for_delivery)' or 'sent_(ok)' or
'sent_(whew!_done!_was_it_as_good_for_you_as_it_was_for_me?)' or 'queued'
as argument.  It splits this in the first word and the rest:  It returns
an array like ('sent', '(ok)').

=head2 splitrelay

B<splitrelay> splits host.example.nl._[100.0.0.45] in host.example.nl. and
[100.0.0.45] and splits foo.com[209.54.94.99] in foo.com and
[209.54.94.99].


=head1 SEE ALSO

Lire::Syslog(3pm), sendmail2dlf(1)

=head1 VERSION

$Id: Email.pm,v 1.2 2006/07/23 13:16:33 vanbaal Exp $

=head1 COPYRIGHT

Copyright (C) 2000-2001 Stichting LogReport Foundation LogReport@LogReport.org

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program (see COPYING); if not, check with
http://www.gnu.org/copyleft/gpl.html. 

=head1 THANKS

Edward Eldred, for reporting a bug.

=head1 AUTHOR

Joost van Baal <joostvb@logreport.org>, based on an idea by Joost Kooij
<joost@topaz.mdcc.cx>

=cut

