/*
 * ufdbanalyse.c
 *
 * read a Squid log file and produce a report with a table showing
 * percentages for categories.
 *
 * This program is meant to be used as a tool to find out what types
 * of websites are visited.
 *
 * ufdbGuard is copyrighted (C) 2005,2006,2007 by URLfilterDB with all rights reserved.
 *
 * RCS: $Id: ufdbAnalyse.c,v 1.3 2007/11/28 16:42:14 root Exp root $
 */

#include "ufdb.h"
#include "ufdblib.h"

#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>


/*
 * ufdbFree - release memory obtained from one of the ufdb allocation wrappers.
 *
 * ptr : block to release; a NULL pointer is accepted and ignored.
 */
void ufdbFree( void * ptr )
{
   /* free() itself ignores a NULL pointer, so no explicit guard is needed */
   free( ptr );
}

/*
 * ufdbMalloc - allocate n bytes of uninitialised memory.
 *
 * Thin wrapper around malloc(); the result of malloc() (NULL on failure)
 * is passed back unchanged.
 */
void * ufdbMalloc( size_t n )
{
   return malloc( n );
}

/*
 * ufdbRealloc - resize the block `ptr` to n bytes.
 *
 * Thin wrapper around realloc(); the result of realloc() (NULL on failure,
 * in which case the original block is untouched) is passed back unchanged.
 */
void * ufdbRealloc( void * ptr, size_t n )
{
   return realloc( ptr, n );
}

/*
 * ufdbCalloc - allocate zero-filled memory.
 *
 * Both arguments are handed to calloc() in the order received; the result
 * of calloc() (NULL on failure) is passed back unchanged.
 */
void * ufdbCalloc( size_t n, size_t num )
{
   return calloc( n, num );
}

/*
 * ufdbGetMallocMutex - intentionally empty in this program.
 *
 * fn : ignored.
 *
 * NOTE(review): presumably a stand-in for a locking hook expected by the
 * ufdb library; confirm against ufdblib.h.
 */
void ufdbGetMallocMutex( char * fn )
{
   (void) fn;           /* nothing to lock here */
}

/*
 * ufdbReleaseMallocMutex - intentionally empty in this program.
 *
 * fn : ignored.
 *
 * NOTE(review): presumably a stand-in for an unlocking hook expected by the
 * ufdb library; confirm against ufdblib.h.
 */
void ufdbReleaseMallocMutex( char * fn )
{
   (void) fn;           /* nothing to unlock here */
}

/*
 * ufdbStrdup - return a newly allocated copy of the string s.
 *
 * s : NUL-terminated string to duplicate.
 *
 * Returns a pointer to the copy (allocated with ufdbMalloc(), to be released
 * with ufdbFree()), or NULL when s is NULL or the allocation fails.
 */
char * ufdbStrdup( const char * s )
{
   size_t size;
   char * copy;

   if (s == NULL)
      return NULL;

   /* keep the length in a size_t: an int would truncate very long strings */
   size = strlen( s ) + 1;
   copy = ufdbMalloc( size );
   if (copy != NULL)
      memcpy( copy, s, size );          /* size includes the terminating NUL */

   return copy;
}

/*
 * ufdbLogMessage - print a printf-style message on stderr.
 *
 * line : printf-compatible format string; a NULL pointer is ignored.
 * ...  : arguments consumed by the conversions in `line`.
 *
 * The variable arguments are expanded with vfprintf() and the message is
 * terminated by exactly one newline.
 */
void ufdbLogMessage( char * line, ... )
{
   va_list args;

   if (line == NULL)
      return;

   va_start( args, line );
   vfprintf( stderr, line, args );
   va_end( args );
   fputc( '\n', stderr );
}


/*
 * ufdbLogError - print a printf-style error message on stderr.
 *
 * line : printf-compatible format string; a NULL pointer is ignored.
 * ...  : arguments consumed by the conversions in `line`.
 *
 * The variable arguments are expanded with vfprintf() and the message is
 * terminated by exactly one newline.
 */
void ufdbLogError( char * line, ... )
{
   va_list args;

   if (line == NULL)
      return;

   va_start( args, line );
   vfprintf( stderr, line, args );
   va_end( args );
   fputc( '\n', stderr );
}

/*
 * usage - print the command line synopsis on stderr and terminate the
 * program with exit status 1.  This function does not return.
 */
static void usage( void )
{
   static const char * const helpLines[] =
   {
      "usage: ufdbanalyse -l <squid-log-file> -d <domainname> -e <email-address> -n <full-name>\n",
      "flags: -l  Squid log file (parameter in squid.conf: cache_access_log)\n",
      "       -d  your domainname, e.g. example.com\n",
      "       -e  your email address, e.g. joe@example.com\n",
      "       -n  your full name\n"
   };
   size_t i;

   for (i = 0;  i < sizeof(helpLines) / sizeof(helpLines[0]);  i++)
      fputs( helpLines[i], stderr );

   exit( 1 );
}


/*
 * myrealloc - realloc() that grows the block in steps of MY_CHUNK bytes.
 *
 * buffer  : block to resize (NULL allocates a new block).
 * newsize : minimum number of bytes that is needed.
 *
 * The requested size is rounded up to the first multiple of MY_CHUNK that
 * is strictly larger than newsize.  Returns whatever realloc() returns.
 */
static void * myrealloc( void * buffer, size_t newsize )
{
#define MY_CHUNK (64*1024)

   size_t chunks;

   /* number of whole chunks needed, always at least one more than fits exactly */
   chunks = newsize / MY_CHUNK + 1;

   return realloc( buffer, chunks * MY_CHUNK );
}


/*
 * sendAllBytes - write exactly `count` bytes from `data` to descriptor `fd`.
 *
 * write() may transfer fewer bytes than requested or may be interrupted by
 * a signal, so the call is repeated until everything has been written.
 *
 * Returns 0 on success, -1 on failure (errno describes the failure).
 */
static int sendAllBytes( int fd, const char * data, size_t count )
{
   ssize_t written;

   while (count > 0)
   {
      written = write( fd, data, count );
      if (written < 0)
      {
         if (errno == EINTR)
            continue;
         return -1;
      }
      if (written == 0)
      {
         /* no progress: give up instead of looping forever */
         errno = EIO;
         return -1;
      }
      data += written;
      count -= (size_t) written;
   }

   return 0;
}


/*
 * sendLogfile - extract the relevant fields from a Squid access log and
 * upload them as the body of the HTTP request on serverSocket.
 *
 * serverSocket : connected descriptor on which the HTTP header, except for
 *                its final Content-Length line, has already been written.
 * logfile      : Squid access log, opened for reading.
 * email        : not used by this function.
 *
 * For every log line the HTTP status code, the number of bytes and the URL
 * are collected as "<code> <bytes> <url>\n".  The URL is stripped for
 * privacy: query string, scheme and user credentials are removed.
 * Collection stops after 100 MB.  If there are more than 2 parse errors or
 * fewer than 50 URLs, an empty body (Content-Length: 0) is sent instead.
 */
static void sendLogfile( int serverSocket, FILE * logfile, char * email )
{
   static const char emptyBody[] = "Content-Length: 0\r\n\r\n";
   int    errors;
   int    n;
   int    c;
   long   nbytes_sent;
   int    nlines_sent;
   size_t len;
   char * p;
   char * slash;
   char * code;
   char * nbytes;
   char * url;
   char * buffer;
   char * newbuffer;
   char   linebuf[4192];
   char   output[4192];

   (void) email;

   errors = 0;
   nbytes_sent = 0;
   nlines_sent = 0;
   buffer = NULL;			/* myrealloc() allocates it on first use */

   while (fgets( linebuf, sizeof(linebuf), logfile ) != NULL)
   {
      len = strlen( linebuf );
      if (len > 0  &&  linebuf[len-1] == '\n')
      {
	 /* remove the line terminator so that it never ends up in a field */
	 linebuf[--len] = '\0';
	 if (len > 0  &&  linebuf[len-1] == '\r')
	    linebuf[--len] = '\0';
      }
      else
      {
	 /* The line did not fit in linebuf (or is the unterminated last line).
	  * Skip the remainder so that it is not parsed as a separate log line.
	  */
	 while ((c = getc( logfile )) != EOF  &&  c != '\n')
	    ;
      }

      /* a line in a Squid log file looks like this:
       * 1195621581.452     74 10.1.1.2 TCP_MISS/200 1169 GET http://example.com - DIRECT/194.46.8.130 text/html
       */
      if (strtok( linebuf, " \t" ) == NULL) 	/* time */
      {
         errors++;
	 continue;
      }

      if (strtok( NULL, " \t" ) == NULL)	/* dummy */
      {
         errors++;
	 continue;
      }

      if (strtok( NULL, " \t" ) == NULL)	/* IP */
      {
         errors++;
	 continue;
      }

      code = strtok( NULL, " \t" );		/* Squid code / HTTP code */
      if (code == NULL)
      {
         errors++;
	 continue;
      }
      p = strchr( code, '/' );
      if (p != NULL)
         code = p + 1;				/* keep the HTTP code only */

      nbytes = strtok( NULL, " \t" );		/* #bytes */
      if (nbytes == NULL)
      {
         errors++;
	 continue;
      }

      if (strtok( NULL, " \t" ) == NULL)	/* HTTP command */
      {
         errors++;
	 continue;
      }

      url = strtok( NULL, " \t" );		/* URL */
      if (url == NULL)
      {
         errors++;
	 continue;
      }
      /* ignore the rest of the input line */

      /* strip the URL for privacy: parameters, scheme and credentials */
      p = strchr( url, '?' );
      if (p != NULL)
         *p = '\0';
      p = strchr( url, '&' );
      if (p != NULL)
         *p = '\0';
      p = strstr( url, "://" );
      if (p != NULL)
         url = p + 3;
      p = strchr( url, '@' );
      if (p != NULL)
      {
	 /* an '@' only separates credentials when it is part of the host */
	 slash = strchr( url, '/' );
	 if (slash == NULL  ||  p < slash)
	    url = p + 1;
      }

      n = snprintf( output, sizeof(output), "%s %s %s\n", code, nbytes, url );
      if (n < 0  ||  (size_t) n >= sizeof(output))
      {
         errors++;
	 continue;
      }

      /* we assume that realloc is really fast... */
      newbuffer = myrealloc( buffer, (size_t) nbytes_sent + (size_t) n + 1 );
      if (newbuffer == NULL)
      {
         /* buffer is still valid: upload what has been collected so far */
         fprintf( stderr, "Oops: cannot allocate more than %ld KB... truncating input to %d lines...\n", 
	          nbytes_sent/1024, nlines_sent );
	 break;
      }
      buffer = newbuffer;

      nlines_sent++;
      if (nlines_sent % 10000 == 0)
      {
         putchar( '.' );
	 fflush( stdout );
      }
      memcpy( &buffer[nbytes_sent], output, (size_t) n + 1 );
      nbytes_sent += (long) n;
      if (nbytes_sent > 100 * 1024 * 1024L)
      {
         fprintf( stderr, "\n" );
         fprintf( stderr, "Over 100 MB (%d URLs) will be sent to be analysed.\n", nlines_sent );
         fprintf( stderr, "To save bandwidth and processing power, the rest of the input file is ignored.\n" );
	 break;
      }
   }
   printf( "\n" );

   if (errors > 2)
   {
      fprintf( stderr, "There were %d parse errors.\n", errors );
      fprintf( stderr, "Are you sure that a Squid log file (e.g. access.log) is specified ?\n" );
      fprintf( stderr, "No analysis can be performed.\n" );
      (void) sendAllBytes( serverSocket, emptyBody, sizeof(emptyBody) - 1 );
      goto done;
   }

   if (nlines_sent < 50)
   {
      fprintf( stderr, "The Squid logfile has only %d URLs which is not sufficient for an analysis.\n", nlines_sent );
      fprintf( stderr, "No analysis can be performed.\n" );
      (void) sendAllBytes( serverSocket, emptyBody, sizeof(emptyBody) - 1 );
      goto done;
   }

   /* send the last line of the header: content length and then the content */
   n = sprintf( output, "Content-Length: %ld\r\n"
                    "\r\n",
		    nbytes_sent );

   if (sendAllBytes( serverSocket, output, (size_t) n ) != 0)
   {
      fprintf( stderr, "\n" );
      fprintf( stderr, "cannot write Content-Length to server: %s\n", strerror(errno) );
      goto done;
   }

   printf( "I am going to upload %d URLs (%ld KB) ...\n", nlines_sent, nbytes_sent/1024 );

   if (sendAllBytes( serverSocket, buffer, (size_t) nbytes_sent ) != 0)
   {
      fprintf( stderr, "cannot write to server: %s\n", strerror(errno) );
      goto done;
   }

   printf( "The upload is finished.\n" );

done:
   free( buffer );
}


int main( int argc, char * argv[] )
{
   char   opt;
   int    n;
   int    s;
   FILE * fp;
   char   answer[32];
   char   domain[128];
   char   email[128];
   char   fullname[128];
   char   logfile[1024];
   char   header[2048];

   logfile[0] = email[0] = domain[0] = fullname[0] = '\0';
   while ((opt  = getopt( argc, argv, "?l:d:e:n:" )) > 0)
   {
      switch (opt)
      {
      case 'd':
         strcpy( domain, optarg );
	 break;
      case 'e':
         strcpy( email, optarg );
	 break;
      case 'n':
         strcpy( fullname, optarg );
	 break;
      case 'l':
         strcpy( logfile, optarg );
	 break;
      case '?':
      default:
	 usage();
	 break;
      }
   }

   if (fullname[0] == '\0' || logfile[0] == '\0' || email[0] == '\0' || domain[0] == '\0')
      usage();

   if (!isatty(fileno(stdin)))
   {
      fprintf( stderr, "ufdbAnalyse requires that standard input is a tty.\n" );
      fprintf( stderr, "Do not use pipes or file redirection.\n" );
      exit( 1 );
   }

   fp = fopen( logfile, "r" );
   if (fp == NULL)
   {
      fprintf( stderr, "cannot read Squid logfile \"%s\"\n", logfile );
      exit( 1 );
   }

   printf( "Please do not upload more than 1 file and wait for the results.\n" );
   printf( "The results will be sent by the support desk via email to %s\n", email );
   printf( "Type \"yes\" to analyse %s ", logfile );
   answer[0] = '\0';
   fgets( answer, 30, stdin );
   if (strncmp( answer, "yes", 3 ) != 0)
   {
      fprintf( stderr, "Answer \"yes\" to upload the file.  The upload is aborted.\n" );
      exit( 0 );
   }

   s = UFDBopenSocket( UFDB_UPLOAD_UNCATEGORISED_URLS_WEBSITE, 80 );
   if (s < 0)
   {
      fprintf( stderr, "cannot open communication socket with http://%s\n", UFDB_UPLOAD_UNCATEGORISED_URLS_WEBSITE );
      exit( 1 );
   }

   sprintf( header, "POST " UFDB_UPLOAD_ANALYSE_SQUID_LOG_CGI " HTTP/1.1\r\n"
		    "Host: " UFDB_UPLOAD_UNCATEGORISED_URLS_WEBSITE "\r\n"
   	 	    "User-Agent: ufdbAnalyse-" VERSION "\r\n"
		    "Content-Type: text/plain\r\n"
		    "Connection: close\r\n"
		    "X-mydomain: %s\r\n"
		    "X-myemail: %s\r\n"
		    "X-fullname: %s\r\n",
		    domain, email, fullname );
   n = strlen( header );
   if (write( s, header, n  ) != n )
   {
      fprintf( stderr, "cannot write header to http://%s\n", UFDB_UPLOAD_UNCATEGORISED_URLS_WEBSITE );
      exit( 1 );
   }

#if 0
   printf( "header sent to %s\n", UFDB_UPLOAD_UNCATEGORISED_URLS_WEBSITE );
   printf( "%s", header );
#endif

   sendLogfile( s, fp, email );

   close( s );
   fclose( fp );

   exit( 0 );
   /*NOTREACHED*/
   return 0;
}

