/*
 * genTable.c - URLfilterDB
 *
 * ufdbGuard is copyrighted (C) 2005,2006,2007 by URLfilterDB with all rights reserved.
 * ufdbGuard is based on squidGuard.
 *
 * squidGuard is copyrighted (C) 1998 by
 * ElTele st AS, Oslo, Norway, with all rights reserved.
 *
 * Generate a binary table file (.ufdb) from unordered ASCII files 
 * with domains and urls.
 *
 * usage: ufdbGenTable [-n] [-q] [-C] [-k <key>] -t <tableName> -d <domains> [-u <urls>]
 *
 * RCS $Id: genTable.c,v 1.40 2007/11/09 20:39:26 root Exp root $
 */

#if defined(__OPTIMIZE__) && defined(__GNUC__)  && defined(GCC_INLINE_STRING_FUNCTIONS_ARE_FASTER)
#define __USE_STRING_INLINES
#endif

#include "ufdb.h"
#include "ufdblib.h"

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <ctype.h>
#include <time.h>
#include <sys/types.h>
#include <unistd.h>
#include <pthread.h>

#include "bzlib.h"


/* input stream currently being read: first the domains file, then the urls file */
static FILE * fin;
/* name of the domains file (-d); the output file name is derived from it */
static char * inFileName;
/* name of the optional urls file (-u), NULL when not given */
static char * urlsFileName;
/* table name (-t); main() defaults it to "table" */
static char * tableName;

/* root of the in-memory tree that is filled by UFDBinsertURL() */
static struct UFDBtable * table;

/* number of lines handed to addDomain(); written into the file header */
static int numEntries = 0;
/* cleared by -n: do not encrypt the table */
static int doCrypt = 1;
/* set by -C: compress the table with bz2 */
static int doCompress = 0;
/* set by -P: puts a 'P' in the header flags */
static int doProd = 0;
/* cleared by -q: suppress warnings about long or "www."-prefixed entries */
static int doWarnings = 1;

/* DEBUG((args)) hands its parenthesised argument list to fprintf when
 * UFDB_DO_DEBUG is defined; otherwise it expands to nothing.
 */
#if defined(UFDB_DO_DEBUG) || 0
#define DEBUG(x) fprintf x 
#else
#define DEBUG(x) 
#endif

/* ROUNDUP(i) yields the next multiple of ROUNDUPBY that is strictly greater
 * than i: ROUNDUP(3) is 4 but ROUNDUP(4) is 8.  The argument is evaluated twice.
 */
#define ROUNDUPBY      4
#define ROUNDUP(i)     ( (i) + (ROUNDUPBY - ((i)%ROUNDUPBY) ) )

/* same as ROUNDUP but in steps of BIGROUNDUPBY: BIGROUNDUP(64) is 128 */
#define BIGROUNDUPBY   64
#define BIGROUNDUP(i)  ( (i) + (BIGROUNDUPBY - ((i)%BIGROUNDUPBY) ) )


/* genTable.c is part of UFDB but it is a stand-alone single-threaded utility
 * that does not need the thread-safe version of ufdbMalloc/Realloc/Free
 * so we provide our own.
 */
/* NOTE(review): not referenced in this file; presumably defined here to satisfy
 * a reference from the shared UFDB library code - confirm before removing.
 */
pthread_mutex_t ufdb_malloc_mutex = UFDB_STATIC_MUTEX_INIT;

/*
 * Release a block of heap memory.
 * A NULL pointer is accepted and ignored.
 */
void ufdbFree( void * ptr )
{
   if (ptr == NULL)
      return;

   free( ptr );
}

/*
 * Allocate n bytes with malloc(), without any locking.
 * Returns NULL when the allocation fails.
 */
void * ufdbMalloc( size_t n )
{
   return malloc( n );
}

/*
 * Resize the block at ptr to n bytes with realloc(), without any locking.
 * Returns the (possibly moved) block, or NULL when the reallocation fails.
 */
void * ufdbRealloc( void * ptr, size_t n )
{
   return realloc( ptr, n );
}

/*
 * Allocate zero-filled memory with calloc(), without any locking.
 * Both arguments are passed to calloc() unchanged and in the same order.
 * Returns NULL when the allocation fails.
 */
void * ufdbCalloc( size_t n, size_t num )
{
   return calloc( n, num );
}

/*
 * No-op: this utility has no malloc mutex to acquire.
 * fn is not used.
 */
void ufdbGetMallocMutex( char * fn )
{
   (void) fn;
}

/*
 * No-op: this utility has no malloc mutex to release.
 * fn is not used.
 */
void ufdbReleaseMallocMutex( char * fn )
{
   (void) fn;
}

/*
 * Return a heap-allocated copy of the string s.
 * The copy is obtained from ufdbMalloc() and must be released with ufdbFree().
 * Returns NULL when s is NULL or when the allocation fails.
 */
char * ufdbStrdup( const char * s )
{
   size_t size;
   char * copy;

   if (s == NULL)
      return NULL;

   size = strlen( s ) + 1;		/* include the terminating '\0' */
   copy = ufdbMalloc( size );
   if (copy == NULL)
      return NULL;

   return memcpy( copy, s, size );
}



/*
 * Print the command line synopsis to stderr and terminate with exit status 1.
 * This function does not return.
 */
static void usage( void )
{
   static const char * const helpLines[] =
   {
      "usage: ufdbGenTable [-n] [-q] [-C] [-k <key>] -t <tableName> -d <domains> [-u <urls>]\n",
      "flags: -n  no encryption\n",
      "       -k  16-char encryption key\n",
      "       -q  be quiet (suppress warnings)\n",
      "       -C  use bz2 compression\n",
      "       -t  tablename\n",
      "       -d  domains\n",
      "       -u  urls\n"
   };
   size_t k;

   for (k = 0; k < sizeof(helpLines) / sizeof(helpLines[0]); k++)
      fputs( helpLines[k], stderr );

   exit( 1 );
}


/*
 * Write a printf-style formatted message, followed by a newline, to stderr.
 *
 * line  printf-style format string; a NULL pointer is ignored
 * ...   the arguments referenced by the format string
 */
void ufdbLogMessage( char * line, ... )
{
   va_list args;

   if (line == NULL)
      return;

   va_start( args, line );
   vfprintf( stderr, line, args );
   va_end( args );
   fputc( '\n', stderr );
}


/*
 * Write a printf-style formatted error message, followed by a newline, to stderr.
 *
 * line  printf-style format string; a NULL pointer is ignored
 * ...   the arguments referenced by the format string
 */
void ufdbLogError( char * line, ... )
{
   va_list args;

   if (line == NULL)
      return;

   va_start( args, line );
   vfprintf( stderr, line, args );
   va_end( args );
   fputc( '\n', stderr );
}


/*
 * Create the empty root node of the global table and reset the entry counter.
 * The root node gets a private copy of tableName as its tag.
 * The program exits with status 1 when memory cannot be allocated.
 */
void initTable( char * tableName )
{
   /* NOTE(review): the extra pointer-sized slack is kept from the original
    * allocation; nothing in this file uses it - confirm before removing.
    */
   table = (struct UFDBtable *) malloc( sizeof( struct UFDBtable ) + sizeof(struct UFDBtable*) );
   if (table == NULL)
   {
      fprintf( stderr, "cannot allocate memory for the table\n" );
      exit( 1 );
   }

   table->tag = (unsigned char *) strdup( tableName );
   if (table->tag == NULL)
   {
      fprintf( stderr, "cannot allocate memory for the table name\n" );
      exit( 1 );
   }
   table->nNextLevels = 0;
   table->nextLevel = NULL;

   numEntries = 0;
}


/*
 * Grow an array of struct UFDBtable so that it can hold n elements.
 *
 * The array grows in steps of ROUNDUPBY elements while n < BIGROUNDUPBY and in
 * steps of BIGROUNDUPBY elements after that.  realloc() is only called when n
 * crosses a step boundary; in all other cases p is returned unchanged.
 * Consequently a NULL p is returned as NULL for e.g. n = 1: the caller must
 * allocate the first ROUNDUP(1) elements itself.  When it does, the capacity
 * after this call is always larger than n.
 *
 * The program exits with status 1 when realloc() fails, so the return value
 * is never NULL after a reallocation.
 */
static __inline__ void * _trealloc( void * p, int n )
{
   int    nup;
   void * grown;

   if (n < BIGROUNDUPBY)
   {
      nup = ROUNDUP(n);
      if (nup == ROUNDUP(n-1))
         return p;			/* still inside the current step */
   }
   else
   {
      nup = BIGROUNDUP(n);
      if (nup == BIGROUNDUP(n-1))
         return p;			/* still inside the current step */
   }

   grown = realloc( p, (size_t) nup * sizeof(struct UFDBtable) );
   if (grown == NULL)
   {
      fprintf( stderr, "cannot allocate memory for %d table entries\n", nup );
      exit( 1 );
   }

   return grown;
}

#include "strcmpurlpart.static.c"

int UFDBinsertURL( struct UFDBtable * t, UFDBrevURL * revURL, UFDBurlType type )
{
   /*
    * find the index where our URL has to be inserted before or is equal to
    * e.g. the level "net" is either "< nl" or "= net".
    */
   int b, e, i;
   int cmp;
   int rv = 0;

   DEBUG(( stderr, "      UFDBinsertURL( 0x%08x, 0x%08x )\n", t, revURL ));

   if (revURL == NULL)
   {
      if (t != NULL)
      {
         DEBUG(( stderr, "        revURL=NULL t: nNextLevels=%d,tag=%s\n", t->nNextLevels, t->tag ));
	 if (t->nNextLevels > 0)
	 {
	    /* interesting... we are trying to insert "xxx.com" while the tree already
	     * has one or more members with subdomains like yyy.xxx.com.
	     * Lets optimise this  and get rid of the subdomains !
	     */
	    rv = 1;
	    t->nNextLevels = 0;
	    free( t->nextLevel );	/* TO-DO: should free() a tree ! */
	    t->nextLevel = NULL;
	 }
      }
      else
         DEBUG(( stderr, "        revURL=NULL t=NULL\n" ));
      return rv;
   }

   /* binary search */
   cmp = -999;
   i = b = 0;
   e = t->nNextLevels - 1;
   while (b <= e)
   {
      i = (b + e) / 2;
      cmp = strcmpURLpart( (char *) revURL->part, (char *) t->nextLevel[i].tag );
      /* DEBUG(( stderr, "       %d = strcmp( %s, %s )\n", cmp, revURL->part, t->nextLevel[i]->tag )); */
      if (cmp == 0)
         break;		/* found an exact match */
      if (cmp < 0)
         e = i - 1;
      else
         b = i + 1;
   }

   DEBUG(( stderr, "      UFDBinsertURL after bsearch: part=%s, cmp=%d, i=%d, b=%d, e=%d, nNextLevels=%d\n", 
           (revURL==NULL ? (unsigned char *)"NULL" : revURL->part), cmp, i, b, e, t->nNextLevels ));
   
   /* implemented optimisations: 
    * do not add subdom.abc.com/aurl if abc.com is already in the tree
    * do not add subdom.abc.com if abc.com is already in the tree
    * remove subdom.abc.com from tree if abc.com is being inserted
    */

   if (t->nNextLevels == 0)		/* the very first entry at this level */
   {
      t->nNextLevels = 1;
      t->nextLevel = malloc( ROUNDUP(1) * sizeof(struct UFDBtable) );
      t->nextLevel[0].tag = (unsigned char *) strdup( (char *) revURL->part );
      t->nextLevel[0].nNextLevels = 0;
      t->nextLevel[0].nextLevel = NULL;

      rv = UFDBinsertURL( &(t->nextLevel[0]), revURL->next, type );
   }
   else if (cmp == 0)				/* an exact match at this level */
   {
      /* optimisation: do not add site.com/blah if site.com is in the table */
      if (type == UFDBurl)
      {
         if (t->nextLevel[i].nNextLevels != 0)
	    rv = UFDBinsertURL( &(t->nextLevel[i]), revURL->next, type );
      }
      else
      {
	 rv = UFDBinsertURL( &(t->nextLevel[i]), revURL->next, type );
      }
   }
   else if (cmp < 0)				/* this entry < nextLevel[i] */
   {
      t->nNextLevels++;
      t->nextLevel = _trealloc( t->nextLevel, t->nNextLevels );

      /* make space in the array */
      if (t->nNextLevels >= i)
      {
	 memmove( &(t->nextLevel[i+1]), &(t->nextLevel[i]), (t->nNextLevels-i) * sizeof(struct UFDBtable) );
      }

      /* insert the current revURL into the array */
      t->nextLevel[i].nNextLevels = 0;
      t->nextLevel[i].tag = (unsigned char *) strdup( (char *) revURL->part );
      t->nextLevel[i].nextLevel = NULL;

      /* process the tail of revURL */
      rv = UFDBinsertURL( &(t->nextLevel[i]), revURL->next, type );
   }
   else 					/* this entry > nextLevel[i] */
   {
      i++;
      
      t->nNextLevels++;
      t->nextLevel = _trealloc( t->nextLevel, t->nNextLevels );

      /* make space in the array */
      if (t->nNextLevels > i)
      {
	 memmove( &(t->nextLevel[i+1]), &(t->nextLevel[i]), (t->nNextLevels-i) * sizeof(struct UFDBtable) );
      }

      /* insert the current revURL into the array */
      t->nextLevel[i].nNextLevels = 0;
      t->nextLevel[i].tag = (unsigned char *) strdup( (char *) revURL->part );
      t->nextLevel[i].nextLevel = NULL;

      /* process the tail of revURL */
      rv = UFDBinsertURL( &(t->nextLevel[i]), revURL->next, type );
   }

   return rv;
}


/* generate a binary table file
 */
/*
 * Write node t and, recursively, everything below it to output.
 *
 * The tag of t is written first, followed by UFDBsubLevel when t has children.
 * After each child a separator is written: UFDBprevLevel when that child has
 * children of its own, otherwise UFDBsameLevel unless it is the last child.
 */
void writeTableToFile( struct UFDBtable * t, FILE * output )
{
   int                idx;
   int                last;
   struct UFDBtable * child;

   fputs( (char *) t->tag, output );
   if (t->nNextLevels > 0)
      fputc( UFDBsubLevel, output );

   last = t->nNextLevels - 1;
   for (idx = 0; idx <= last; idx++)
   {
      child = &(t->nextLevel[idx]);
      writeTableToFile( child, output );

      if (child->nNextLevels != 0)
         fputc( UFDBprevLevel, output );
      else if (idx < last)
         fputc( UFDBsameLevel, output );
   }
}


/*
 * Count one entry and add the domain or URL to the global table.
 *
 * The entry is first looked up in the table: when it is already matched it is
 * not inserted and a message is printed.  Otherwise it is inserted, and a
 * message is printed when UFDBinsertURL() reports a non-zero result.
 *
 * admin   thread admin that is passed to UFDBgenRevURL() and UFDBfreeRevURL()
 * domain  the domain or URL
 * type    UFDBdomain or UFDBurl, passed on to UFDBinsertURL()
 */
static __inline__ void addDomain( 
   UFDBthreadAdmin * admin,
   unsigned char *   domain, 
   UFDBurlType       type )
{
   UFDBrevURL * reversed;

   numEntries++;
   reversed = UFDBgenRevURL( admin, domain );

   if (UFDBlookupRevUrl( table, reversed ))
   {
      fprintf( stderr, "url/domain %s is not added because it was already matched.\n", domain );
   }
   else if (UFDBinsertURL( table, reversed, type ))
   {
      fprintf( stderr, "domain %s has optimised subdomains.\n", domain );
   }

   UFDBfreeRevURL( admin, reversed );
}


/*
 * Return one character picked with random() from the 62 digits,
 * lowercase letters and uppercase letters.
 */
static char randomChar( void )
{
   static const char alphabet[] = "1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
   long pick;

   pick = random() % (long) (sizeof(alphabet) - 1);	/* 62 characters */
   return alphabet[pick];
}


/*
 * Fill encryptKey with 16 random characters and a terminating '\0'.
 * encryptKey must have room for 17 bytes.
 * The random generator is seeded with the process id multiplied by the
 * current time, so the key is only as unpredictable as those two values.
 */
static void generateRandomKey( char * encryptKey )
{
   int pos;

   srandom( getpid() * time(NULL) );

   for (pos = 0; pos < 16; pos++)
      encryptKey[pos] = randomChar();
   encryptKey[16] = '\0';
}


/*
 * Format the 16-character encryptKey as four groups of four characters
 * separated by dashes: "abcdefghijklmnop" becomes "abcd-efgh-ijkl-mnop".
 * key must have room for 20 bytes (19 characters and the terminating '\0').
 */
static void copyKey( char * key, char * encryptKey )
{
   int src;
   int dst = 0;

   for (src = 0; src < 16; src++)
   {
      if (src > 0  &&  src % 4 == 0)
         key[dst++] = '-';		/* group separator */
      key[dst++] = encryptKey[src];
   }
   key[dst] = '\0';
}


/* Disabled code (compiled out by "#if 0"): encrypts the stream f in place.
 * It reads blocks of up to N (8192) bytes, encrypts each block with
 * ufdbEncryptText() using the 16-byte key, seeks back to the start of the
 * block and overwrites it, and stops after the first short block.
 * doCryptCompress() encrypts in memory with encryptMemory() instead.
 */
#if 0
static void encryptFile( FILE * f, unsigned char * key )
{
   ufdbCrypt uc;
   long pos;
   int n;
#define N 8192
   unsigned char iblock[N];
   unsigned char oblock[N];

   ufdbCryptInit( &uc, key, 16 );
   
   while (!feof(f))
   {
      pos = ftell( f );
      n = fread( iblock, 1, N, f );
      ufdbEncryptText( &uc, oblock, iblock, n );
      fseek( f, pos, SEEK_SET );
      fwrite( oblock, 1, n, f );
      if (n < N)
         break;
   }
}
#endif


/*
 * Encrypt n bytes of from into to with the 16-byte key.
 * A fresh ufdbCrypt state is initialised with ufdbCryptInit() for every call
 * and the data is encrypted with a single call to ufdbEncryptText().
 */
static void encryptMemory( char * to, char * from, long n, unsigned char * key )
{
   ufdbCrypt       state;
   unsigned char * dst = (unsigned char *) to;
   unsigned char * src = (unsigned char *) from;

   ufdbCryptInit( &state, key, 16 );
   ufdbEncryptText( &state, dst, src, n );
}


/*
 * Compress size bytes of from into to with BZ2_bzBuffToBuffCompress()
 * (arguments 6, 0 and 40 for block size, verbosity and work factor).
 *
 * The capacity of to is reported to bzip2 as size + 2048 bytes, so the caller
 * must provide a buffer of at least that size.
 *
 * Returns the number of compressed bytes.  When bzip2 does not return BZ_OK
 * a message is printed and the program exits with status 1.
 */
static long compressMemory( char * to, char * from, long size )
{
   unsigned int outLen = (unsigned int) (size + 2048);
   int          status;

   status = BZ2_bzBuffToBuffCompress( to, &outLen, from, size, 6, 0, 40 );
   if (status != BZ_OK)
   {
      fprintf( stderr, "compression failed.\n" );
      exit( 1 );
   }

   return outLen;
}


/*
 * Rewrite the header at the start of f with size in place of the number
 * field; all other header fields are read back from the file and kept.
 *
 * The new header text ends with " \n" and is padded with '\0' bytes up to
 * sizeof(header.string) bytes.  On return the stream is positioned at the end
 * of the file.
 *
 * The program exits with status 1 when the existing header cannot be parsed
 * or when the new header does not fit.
 */
static void updateSizeInHeader( FILE * f, long size )
{
   int  n;
   int  nfields;
   char version[8];
   char prefix[32];
   char tableName[32];
   char key[32];
   char date[32];
   char flags[8+1];
   struct UFDBfileHeader header;

   fseek( f, 0, SEEK_SET );
   nfields = fscanf( f, "%5s %7s %20s %11d key=%30s date=%20s %8s",
                     prefix, version, tableName, &n, key, date, flags );
   if (nfields != 7)
   {
      /* without this check the buffers below would be used uninitialised */
      fprintf( stderr, "cannot parse the table header (parsed %d of 7 fields).\n", nfields );
      exit( 1 );
   }

   fseek( f, 0, SEEK_SET );
   n = snprintf( header.string, sizeof(header.string), "%s %s %s %ld key=%s date=%s %8s \n",
                 "UFDB", version, tableName, size, key, date, flags );
   if (n < 0  ||  (size_t) n >= sizeof(header.string))
   {
      fprintf( stderr, "the table header is too long.\n" );
      exit( 1 );
   }
   fprintf( f, "%s", header.string );
   for (n = sizeof(header.string) - strlen(header.string); n > 0; n--)
      fputc( '\0', f );
   fflush( f );

   fseek( f, 0, SEEK_END );
}


/*
 * Compress and/or encrypt, in place, everything in f that follows the first
 * sizeof(struct UFDBfileHeader) bytes.
 *
 * The data is read into memory, compressed when doCompress is set, encrypted
 * with encryptKey when doCrypt is set, and written back at the same offset.
 * The header is rewritten with the original (uncompressed) data size, and the
 * file is truncated when compression made the data smaller.
 *
 * The program exits with status 1 on any I/O or allocation failure.
 */
static void doCryptCompress( FILE * f, char * encryptKey )
{
   long   headerSize = (long) sizeof(struct UFDBfileHeader);
   long   endPos;
   long   size;
   long   orig_size;
   char * buffer1;
   char * buffer2;

   fflush( f );
   if (fseek( f, 0, SEEK_END ) != 0  ||  (endPos = ftell( f )) < 0)
   {
      fprintf( stderr, "cannot determine the size of the table file.\n" );
      exit( 1 );
   }
   if (endPos <= headerSize)
   {
      fprintf( stderr, "cannot read file for encryption and/or compression.\n" );
      exit( 1 );
   }
   orig_size = size = endPos - headerSize;

   /* compressMemory() needs room for size + 2048 bytes in its output buffer */
   buffer1 = malloc( size + 2048 );
   buffer2 = malloc( size + 2048 );
   if (buffer1 == NULL || buffer2 == NULL)
   {
      fprintf( stderr, "cannot allocate memory for encryption and/or compression (size=%ld)\n", size );
      exit( 1 );
   }

   /* read file into buffer1 */
   if (fseek( f, headerSize, SEEK_SET ) != 0  ||
       1 != fread( buffer1, size, 1, f ))
   {
      fprintf( stderr, "cannot read file for encryption and/or compression.\n" );
      exit( 1 );
   }

   /* make sure the 'result' is in buffer2 */
   if (doCompress)
      size = compressMemory( buffer2, buffer1, size );
   else
      memcpy( buffer2, buffer1, size );

   /* crypt from buffer2 into buffer1 */
   if (doCrypt)
      encryptMemory( buffer1, buffer2, size, (unsigned char *) encryptKey );
   else
      memcpy( buffer1, buffer2, size );

   /* rewrite the file header and put the original size in there for the decompression function */
   updateSizeInHeader( f, orig_size );

   /* write buffer1 to the file */
   if (fseek( f, headerSize, SEEK_SET ) != 0  ||
       1 != fwrite( buffer1, size, 1, f ))
   {
      fprintf( stderr, "fwrite failed.\n" );
      exit( 1 );
   }
   if (fflush( f ) != 0)
   {
      fprintf( stderr, "fwrite failed.\n" );
      exit( 1 );
   }

   /* truncate the file (if we did compression) */
   if (doCompress && size < orig_size)
   {
      if (ftruncate( fileno(f), size + headerSize ) != 0)
      {
         fprintf( stderr, "cannot truncate the table file.\n" );
         exit( 1 );
      }
   }

   free( buffer1 );
   free( buffer2 );
}


int main( int argc, char * argv[] )
{
   int    n;
   int    opt;
   time_t t;
   struct tm * tm;
   char   encryptKey[16+1];
   char   key[16+3+1];
   char * version;
   char   flags[8+1];
   FILE * fout;
   char * fout_buffer;
   struct UFDBfileHeader header;
   UFDBthreadAdmin * admin;
   char   date[64];
   char   outFileName[512];
   unsigned char   domain[4096];

   UFDBappInit();
   admin = UFDBallocThreadAdmin();
   inFileName = NULL;
   urlsFileName = NULL;
   tableName = "table";
   encryptKey[0] = '\0';

   while ((opt = getopt( argc, argv, "?k:t:d:u:nCqP" )) > 0)
   {
      switch (opt)
      {
      case '?':
         usage();
	 break;
      case 't':
         tableName = optarg;
	 break;
      case 'd':
         inFileName = optarg;
	 break;
      case 'u':
         urlsFileName = optarg;
	 break;
      case 'k':
         strncpy( encryptKey, optarg, 16 );
	 encryptKey[16] = '\0';
	 if (strlen( encryptKey ) != 16)
	 {
	    fprintf( stderr, "key \"%s\" is not valid.\n", encryptKey );
	    usage();
	 }
	 break;
      case 'n':
         doCrypt = 0;
	 break;
      case 'P':
         doProd = 1;
	 break;
      case 'C':
         doCompress = 1;
	 break;
      case 'q':
         doWarnings = 0;
	 break;
      default:
         usage();
	 break;
      }
   }

   if (strlen(tableName) > 15)
   {
      fprintf( stderr, "the tableName must be shorter than 15 characters" );
      usage();
   }

   if (inFileName == NULL)
      usage();

   fin = fopen( inFileName, "r" );
   if (fin == NULL)
   {
      fprintf( stderr, "cannot read from \"%s\"\n", inFileName );
      usage();
   }

   strcpy( outFileName, inFileName );
   strcat( outFileName, UFDBfileSuffix );
   fout = fopen( outFileName, "w+" );
   if (fout == NULL)
   {
      fprintf( stderr, "cannot write to \"%s\"\n", outFileName );
      usage();
   }
   fout_buffer = malloc( 16384 );
   setvbuf( fout, fout_buffer, _IOFBF, 16384 );

   /* setlinebuf( stderr ); */
   initTable( tableName );


   /* process the domains ********************************************/
   n = 0;
readdomains:
   while (!feof(fin))
   {
      unsigned char * ptr;
      ptr = domain;

      while ((*ptr = fgetc(fin)) != '\n')
      {
         if (feof(fin))
	    goto eof;
	 if (*ptr >= 'A'  &&  *ptr <= 'Z')
	    *ptr = *ptr + 'a' - 'A';
	 ptr++;
	 if (ptr > &domain[4090])
	 {
	    *ptr = '\0';
	    fprintf( stderr, "URL too long: %s\n", domain );
	    while (!feof(fin) && fgetc(fin) != '\n')
	       ;
	    goto readdomains;
	 }
      }
      *ptr = '\0';

      if (domain[0] != '#')
      {
	 if (doWarnings)
	 {
	    if (ptr - domain > 66)
	       fprintf( stderr, "warning: long domain name: %s\n", domain );
	    if (strncmp( (char *) domain, "www.", 4 ) == 0)
	       fprintf( stderr, "warning: domain name starts with \"www.\": %s (strip www. ?)\n", domain );
	 }

	 addDomain( admin, domain, UFDBdomain );
      }
   }
eof:
   fclose( fin );

   /* process the urls ***********************************************/
   if (urlsFileName != NULL)
   {
      fin = fopen( urlsFileName, "r" );
      if (fin == NULL)
      {
	 fprintf( stderr, "cannot read from \"%s\"\n", urlsFileName );
	 usage();
      }

readurls:
      while (!feof(fin))
      {
	 unsigned char * ptr;
	 unsigned char * first_slash;

	 ptr = domain;

	 while ((*ptr = fgetc(fin)) != '\n')
	 {
	    if (feof(fin))
	       goto eof2;
	    if (*ptr >= 'A'  &&  *ptr <= 'Z')
	       *ptr = *ptr + 'a' - 'A';
	    ptr++;
	    if (ptr > &domain[4090])
	    {
	       *ptr = '\0';
	       fprintf( stderr, "URL too long: %s\n", domain );
	       while (!feof(fin) && fgetc(fin) != '\n')
		  ;
	       goto readurls;
	    }
	 }
	 *ptr = '\0';

	 if (domain[0] != '#')
	 {
	    if (doWarnings)
	    {
	       if (ptr - domain > 120)
		  fprintf( stderr, "warning: long url: %s\n", domain );
	       else
	       {
		  first_slash = (unsigned char *) strrchr( (char *) domain, '/' );
		  if (first_slash != NULL)
		     if (first_slash - &domain[0] > 66  ||  strlen((char *)first_slash) > 65)
			fprintf( stderr, "warning: long url: %s\n", domain );
	       }
	       if (strncmp( (char *) domain, "www.", 4 ) == 0)
		  fprintf( stderr, "warning: URL starts with \"www.\": %s (strip www. ?)\n", domain );
	    }

	    addDomain( admin, domain, UFDBurl );
	 }
      }
eof2:
      fclose( fin );
   }

   if (encryptKey[0] == '\0')
      generateRandomKey( encryptKey );
   copyKey( key, encryptKey );
   if (doCrypt)
      version = UFDBdbVersion;
   else
      version = "1.1";

   /* write the table header to the output file */
   strcpy( flags, "--------" );
   if (doCompress)
      flags[0] = 'C';
   if (doProd)
      flags[1] = 'P';
   if (doCrypt)
      flags[2] = 'Q';
   t = time( NULL );
   tm = gmtime( &t );
   sprintf( date, "%4d%02d%02d.%02d%02d", 
            tm->tm_year+1900, tm->tm_mon+1, tm->tm_mday, tm->tm_hour, tm->tm_min );
   sprintf( header.string, "%s %s %s %d key=%s date=%s %8s", 
            "UFDB", version, tableName, numEntries, key, date, flags );
   fprintf( fout, "%s \n", header.string );
   for (n = sizeof(header.string) - strlen(header.string); n > 0; n--)
      fputc( '\0', fout );

   /* write the table in binary format to the output file */
   writeTableToFile( table, fout );
   fputc( UFDBendTable, fout );

   /* encrypt and compress the table: rewind, read, compress, crypt and write */
   if (doCrypt || doCompress)
   {
      doCryptCompress( fout, encryptKey );
   }

   fclose( fout );
   free( fout_buffer );

   return 0;
}

/* since ufdbguard (single-threaded) and ufdbguardd (multi-threaded)
 * share source code, we put some pthread dummys here since we don't need/want pthreads.
 */
/*
 * Dummy replacement: does nothing with the mutex and always returns 0.
 */
int pthread_mutex_lock( pthread_mutex_t * mutex )
{
   (void) mutex;

   return 0;
}

/*
 * Dummy replacement: does nothing with the mutex and always returns 0.
 */
int pthread_mutex_trylock( pthread_mutex_t * mutex )
{
   (void) mutex;

   return 0;
}

/*
 * Dummy replacement: does nothing with the mutex and always returns 0.
 */
int pthread_mutex_unlock( pthread_mutex_t * mutex )
{
   (void) mutex;

   return 0;
}


/*
 * Dummy replacement: does nothing with the condition variable and always
 * returns 0.
 */
int pthread_cond_signal(pthread_cond_t *cond)
{
   (void) cond;

   return 0;
}

/*
 * Dummy replacement: returns 0 immediately without waiting and without
 * touching the condition variable or the mutex.
 */
int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex)
{
   (void) cond;
   (void) mutex;

   return 0;
}

