processlog.pl

#!/usr/bin/perl

use strict;

use warnings;

use lib qw(/data/perl/lib/perl5/site_perl);

use Getopt::Long;

use POSIX;

use Date::Calc qw(:all);

use XML::Simple;

use Compress::Zlib;

use File::Find;

use Log::Log4perl qw(:easy);

use Net::hostent;

use Socket;

use FileHandle;

use Net::DNS;

use Data::Dumper;
# ---------------------------------------------------------------------------
# Main script: combine the per-proxy compressed logs for one site and one day
# into a single gzip file, then produce the Yahoo message board report.
# ---------------------------------------------------------------------------

my $site;                  # --site value (key under <site> in the config)
my $date;                  # --date value, later normalised via Date::Calc
my $options;               # GetOptions() status
my @proxyLogFiles = ();    # filled by newFile() through File::Find
my %ipFound;               # proxy address => 1 once a log for it was found

#config file must be in same directory as this file
#(XMLin() is called without arguments, so XML::Simple picks the file itself)
my $config = eval { XMLin() }
    or die("Could not load the XML configuration: $@\n");

#configure logging
Log::Log4perl->init( $config->{log4perl} );

my $log = Log::Log4perl->get_logger();

# GetOptions needs REFERENCES to the destination variables; passing the
# (undefined) values themselves means the options are never stored.
$options = GetOptions(
    "site=s" => \$site,
    "date:s" => \$date,
);

$options      or usage();
defined($site) or usage();

#initialise %ipFound
foreach my $ipArray ( $config->{site}->{$site}->{address} ) {
    foreach my $ip (@$ipArray) {
        $ipFound{$ip} = 0;
    }
}

# Normalise the run date to a "-"-joined year-month-day string. The offset
# is 0 here; newFile() applies the -1 day shift when matching proxy logs.
if ( defined($date) ) {
    $date = join( "-", Add_Delta_Days( verifyDateFormat($date), 0 ) );
}
else {
    $date = join( "-", Add_Delta_Days( Today(), 0 ) );
}

runInfo( $site, $date, $config );

my $dateMM_DD_YYYY = join( "-", ( split( /-/, $date ) )[ 1, 2, 0 ] );

my $combinedLogFileName =
    sprintf( "%s/%s.%s.gz", $config->{logdir}, $site, $dateMM_DD_YYYY );

#sets the @proxyLogFiles variable with list of compressed files to process
find( \&newFile, $config->{ccelog} );

if ( -e $combinedLogFileName && -s $combinedLogFileName ) {
    $log->info( sprintf( "%s already exists", $combinedLogFileName ) );
}
else {

    #make sure we have something to do
    if ( scalar(@proxyLogFiles) > 0 ) {

        #combine the logs
        combineLogs( $site, $date, @proxyLogFiles );
    }
    else {
        $log->logdie("No files found");
    }

    #making sure we record what logs files were found
    foreach my $ip ( keys %ipFound ) {
        if ( !$ipFound{$ip} ) {
            $log->info( sprintf( "Log for %s not found", $ip ) );
        }
    }
}

cleanUp(@proxyLogFiles);

yahooMessageBoards( $site, $date );

exit 0;

# Print the command-line synopsis to STDOUT and terminate the script with
# exit status 1. Never returns.
sub usage {

    # Each line ends with a real newline escape ("\n").
    printf( "Usage: %s --site=bmssite [--date=YYYY-MM-DD]\n", $0 );

    print("... where site is either usevv, ushpw, jptok, ukchs, aumlb\n");

    exit(1);

}    #date needs to be in YYYY-MM-DD format on the command line

#we also want to verify that the date is correct
#and we want to return a format that the Date::Calc functions can use

# Validate a date given on the command line.
#
# Parameters:
#   $date - string expected in YYYY-MM-DD form
#
# Returns the list (year, month, day) so the result can be passed straight
# to the Date::Calc functions.
#
# Calls usage() (which exits) when the string is not shaped like
# YYYY-MM-DD, and logdie()s when Date::Calc's check_date() rejects it.
sub verifyDateFormat {

    my ($date) = @_;

    # \d (digit) rather than a literal "d"; \A and \z pin the whole string.
    usage() if ( $date !~ /\A\d{4}-\d{2}-\d{2}\z/ );

    my @ymd = split( /-/, $date );

    if ( !check_date(@ymd) ) {

        $log->logdie("Invalid date range");

    }

    return @ymd;

}

# File::Find "wanted" callback.
#
# For every file visited ($_ holds the file name, $File::Find::name the full
# path) check whether it is a proxy log for one of the configured addresses
# of the current site, dated one day before the run date. Matching files are
# appended to @proxyLogFiles and their address is flagged in %ipFound.
#
# Reads the file-level $date, $site and $config; takes no arguments.
sub newFile {

    my $fileName = $_;

    my $fullName = $File::Find::name;

    # Date::Calc returns plain integers, so month and day must be zero-padded
    # explicitly to obtain an 8-digit YYYYMMDD string.
    my $dateYYYYMMDD =
        sprintf( "%04d%02d%02d", Add_Delta_Days( split( /-/, $date ), -1 ) );

    foreach my $ipArray ( $config->{site}->{$site}->{address} ) {

        foreach my $ip (@$ipArray) {

            # \Q..\E quotes the dots of the address, and ${...} keeps the
            # following "_" from being parsed as part of the variable name.
            if ( $fileName =~
                /^celog_\Q$ip\E_${dateYYYYMMDD}_\d{6}\.txt\.gz$/ )
            {

                push( @proxyLogFiles, $fullName );

                $log->info( sprintf( "Found %s", $fullName ) );

                $ipFound{$ip} = 1;

            }

        }

    }

    return;

}

# Concatenate a list of gzip-compressed proxy logs into one combined gzip
# file named <logdir>/<site>.<MM-DD-YYYY>.gz.
#
# Parameters:
#   $site     - site key, used in the output file name
#   $date     - "-"-joined year-month-day string
#   @fileList - full paths of the compressed logs to combine
#
# Returns 0 (after removing the inputs through cleanUp) when the combined
# file already exists, 1 after a successful combine. Any open, read, write
# or close failure ends the script through logdie(), and in that case the
# input files are left in place.
sub combineLogs {

    my ( $site, $date, @fileList ) = @_;

    my $dateMM_DD_YYYY = join( "-", ( split( /-/, $date ) )[ 1, 2, 0 ] );

    my $combinedLogFileName =
        sprintf( "%s/%s.%s.gz", $config->{logdir}, $site, $dateMM_DD_YYYY );

    if ( -e $combinedLogFileName ) {

        $log->info( sprintf( "%s already exists", $combinedLogFileName ) );

        cleanUp(@fileList);

        return 0;

    }

    my $combinedLogFile = eval { gzopen( $combinedLogFileName, 'wb' ) }
        or $log->logdie(
        sprintf( "Could not open %s for writing", $combinedLogFileName ) );

    foreach my $file (@fileList) {

        my ( $bytesread, $byteswritten ) = ( 0, 0 );

        my $buffer;

        my $proxyLogFile = eval { gzopen( $file, 'rb' ) }
            or $log->logdie(
            sprintf( "Could not open %s for reading", $file ) );

        #open each log file and concatenate to combined log file
        while (1) {

            # gzreadline: bytes read, 0 at end of file, -1 on error.
            my $count = $proxyLogFile->gzreadline($buffer);

            $log->logdie( sprintf( "%s", $proxyLogFile->gzerror() ) )
                if $count < 0;

            last if $count == 0;

            $bytesread += $count;

            # gzwrite: bytes written, 0 on error.
            my $written = $combinedLogFile->gzwrite($buffer);

            $log->logdie( sprintf( "%s", $combinedLogFile->gzerror() ) )
                if !$written;

            $byteswritten += $written;

        }

        $log->info( sprintf( "Added %s to %s", $file, $combinedLogFileName ) );

        #want to be sure we did not lose any info
        $log->debug( sprintf( "%s bytes read from %s\n", $bytesread, $file ) );

        $log->debug(
            sprintf(
                "%s bytes written to %s\n",
                $byteswritten, $combinedLogFileName
            )
        );

        $proxyLogFile->gzclose();

    }

    # Pending data is flushed on close; a non-zero status means the combined
    # file is incomplete, so stop before the source logs are deleted.
    if ( $combinedLogFile->gzclose() != 0 ) {

        $log->logdie(
            sprintf( "Could not close %s cleanly", $combinedLogFileName ) );

    }

    cleanUp(@fileList);

    return 1;

}    #what parameters are we running with

# Write the settings of the current run to the log at info level: the date,
# the logdir / bindir / ccelog entries of the configuration, and one line per
# address configured for the site.
#
# Parameters:
#   $site   - site key looked up under $config->{site}
#   $date   - date string, logged as given
#   $config - configuration hash reference
sub runInfo {

    my ( $site, $date, $config ) = @_;

    # Format / value pairs, logged in this order.
    my @settings = (
        [ "DATE = %s",                $date ],
        [ "DOWNLOADDIR = %s",         $config->{logdir} ],
        [ "BINDIR = %s",              $config->{bindir} ],
        [ "CCEDIR = %s",              $config->{ccelog} ],
        [ "Host info for %s proxies", $site ],
    );

    for my $setting (@settings) {

        my ( $format, $value ) = @$setting;

        $log->info( sprintf( $format, $value ) );

    }

    for my $host ( @{ $config->{site}->{$site}->{address} } ) {

        $log->info( sprintf( "HOST = %s", $host ) );

    }

}

# Delete the given files and log the outcome of each deletion.
#
# Parameters:
#   @fileList - paths of the files to remove
#
# A file that no longer exists is only noted at debug level, because this
# sub is called again at the end of the script for files that combineLogs
# has already removed. Any other failure is logged as a warning and the
# remaining files are still processed.
sub cleanUp {

    my (@fileList) = @_;

    foreach my $file (@fileList) {

        if ( unlink($file) ) {

            $log->info( sprintf( "Deleted %s", $file ) );

        }
        elsif ( $!{ENOENT} ) {

            $log->debug( sprintf( "%s was already removed", $file ) );

        }
        else {

            $log->warn( sprintf( "Could not delete %s: %s", $file, $! ) );

        }

    }

    return;

}

# Build the Yahoo message board report for one site and day.
#
# Reads the combined log <logdir>/<site>.<MM-DD-YYYY>.gz, keeps the lines
# that mention messages.yahoo.com and contain "tid=bmy", and writes one CSV
# row (timestamp, workstation, url) per kept line to
# <reportdir>/yahoo/<site>.<MM-DD-YYYY>.csv.
#
# Parameters:
#   $site - site key, used in both file names
#   $date - "-"-joined year-month-day string
#
# Ends the script through logdie() when a file cannot be opened, read or
# closed, or when the DNS resolver cannot be created.
sub yahooMessageBoards {

    my ( $site, $date ) = @_;

    my $dateMM_DD_YYYY = join( "-", ( split( /-/, $date ) )[ 1, 2, 0 ] );

    my $combinedLogFileName =
        sprintf( "%s/%s.%s.gz", $config->{logdir}, $site, $dateMM_DD_YYYY );

    my $log = Log::Log4perl->get_logger();

    my @messageURL = ();

    my $buffer;

    my $bytesread = 0;

    my $combinedLogFile = eval { gzopen( $combinedLogFileName, 'rb' ) }
        or $log->logdie(
        sprintf( "Could not open %s for reading", $combinedLogFileName ) );

    my $file = sprintf( "%s/yahoo/%s.%s.csv",
        $config->{reportdir}, $site, $dateMM_DD_YYYY );

    my $yahooReport = FileHandle->new( $file, 'w' )
        or $log->logdie( sprintf( "Could not open %s for writing", $file ) );

    my $dns = eval { Net::DNS::Resolver->new }
        or $log->logdie( sprintf( "Could not create DNS resolver: %s", $@ ) );

    while (1) {

        # gzreadline: bytes read, 0 at end of file, -1 on error.
        my $count = $combinedLogFile->gzreadline($buffer);

        $log->logdie( sprintf( "%s", $combinedLogFile->gzerror() ) )
            if $count < 0;

        last if $count == 0;

        $bytesread += $count;

        next if ( $buffer !~ /messages\.yahoo\.com/ );

        next if ( $buffer !~ /tid=bmy/ );

        push( @messageURL, $buffer );

    }

    $combinedLogFile->gzclose();

    $log->info(
        sprintf(
            "Read %s MB from %s",
            $bytesread / ( 1024 * 1024 ),
            $combinedLogFileName
        )
    );

    # Today's year-month-day, computed once for the whole report.
    my $today = join( "-", ( Time_to_Date(time) )[ 0, 1, 2 ] );

    foreach my $line (@messageURL) {

        my ( $timestamp, $workstation, $url ) =
            ( split( ' ', $line ) )[ 0, 2, 6 ];

        # Resolve the workstation name only for entries logged the day
        # before today. The two dates are compared as joined strings because
        # "==" on two lists does not compare them element by element.
        # NOTE(review): assumes field 0 is an epoch timestamp - confirm.
        my $dayAfterEntry = join( "-",
            Add_Delta_Days( ( Time_to_Date($timestamp) )[ 0, 1, 2 ], 1 ) );

        if ( $today eq $dayAfterEntry ) {

            my $query = $dns->search($workstation);

            if ($query) {

                foreach my $entries ( $query->answer ) {

                    # ptrdname() only exists on PTR records.
                    next if ( $entries->type ne 'PTR' );

                    $workstation = $entries->ptrdname();

                }

            }

            else {

                $log->logwarn(
                    sprintf( "query failed: %s", $dns->errorstring ) );

            }

        }

        $yahooReport->printf( "%s,%s,%s\n", $timestamp, $workstation, $url );

    }

    # Buffered write errors only surface when the handle is closed.
    $yahooReport->close()
        or $log->logdie( sprintf( "Could not close %s: %s", $file, $! ) );

    $log->info( sprintf( "Created report for %s in %s", $date, $file ) );

    return;

}

# Walk the combined log of one site and day and tally hits and bytes per
# domain.
#
# Parameters:
#   $site - site key, used in the combined log file name
#   $date - "-"-joined year-month-day string
#
# Every processed line is echoed to STDOUT together with the extracted
# (timestamp, bytes, domain, category) tuple. The tallies are kept in local
# variables only; nothing is returned or reported in the code shown here.
#
# Ends the script through logdie() when the log cannot be opened or read.
sub topDomains {

    my ( $site, $date ) = @_;

    my $dateMM_DD_YYYY = join( "-", ( split( /-/, $date ) )[ 1, 2, 0 ] );

    my $combinedLogFileName =
        sprintf( "%s/%s.%s.gz", $config->{logdir}, $site, $dateMM_DD_YYYY );

    my $bytesread = 0;

    my $buffer;

    my $log = Log::Log4perl->get_logger();

    my ( %category, %urls, %size );

    my $total_bytes = 0;

    my $hits = 0;

    my $combinedLogFile = eval { gzopen( $combinedLogFileName, 'rb' ) }
        or $log->logdie(
        sprintf( "Could not open %s for reading", $combinedLogFileName ) );

    while (1) {

        # gzreadline: bytes read, 0 at end of file, -1 on error.
        my $count = $combinedLogFile->gzreadline($buffer);

        $log->logdie( sprintf( "%s", $combinedLogFile->gzerror() ) )
            if $count < 0;

        last if $count == 0;

        $bytesread += $count;

        chomp($buffer);

        my ( $timestamp, $bytes, $url, $cat ) =
            ( split( ' ', $buffer, 12 ) )[ 0, 4, 6, 11 ];

        # Lines without a URL field cannot be attributed to a domain.
        next if ( !defined($url) );

        # Copy the URL first, then strip scheme and path from the copy so
        # that only the host part remains.
        ( my $domain = $url ) =~ s{\w+://([^/]+).*$}{$1};

        # Names are reduced to their last two labels; hosts that contain a
        # dotted-quad address are kept as they are.
        if ( $domain !~ /\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/ ) {

            my @pieces = split( /\./, $domain );

            $domain = sprintf( "%s.%s", $pieces[-2], $pieces[-1] )
                if ( @pieces >= 2 );

        }

        if ( !defined($cat) ) { $cat = 'NONE'; }

        print $buffer, "\n";

        printf( "%s, %s, %s, %s\n", $timestamp, $bytes, $domain, $cat );

        $category{$domain} = $cat;

        $urls{$domain}++;

        $size{$domain} += $bytes;

        $total_bytes += $bytes;

        $hits++;

    }

    $combinedLogFile->gzclose();

    return;

}

Liked this post? Follow this blog to get more. 

Leave a Reply

Your email address will not be published. Required fields are marked *

This site uses Akismet to reduce spam. Learn how your comment data is processed.

Feel free to comment on this story directly above, but you can also go to copies posted to social media on the left, and reply to or comment on them there. Your responses via Twitter, Facebook, and Google+ will appear below.

To respond on your own website, enter the URL of your response which should contain a link to this post's permalink URL. Your response will then appear (possibly after moderation) on this page. Want to update or remove your response? Update or delete your post and re-enter your post's URL again. (Learn More)