/*
 * uncat.c - URLfilterDB
 *
 * ufdbGuard API is copyrighted (C) 2005,2006,2007 by URLfilterDB with all rights reserved.
 *
 * ufdbGuard API is used to integrate the functionality of ufdbGuard into
 * programs of 3rd parties.
 *
 * This module deals with management of uncategorised URLs
 *
 * RCS $Id: uncat.c,v 1.10 2007/11/28 16:43:25 root Exp root $
 */

#include "ufdb.h"
#include "ufdb-api.h"
#include "version.h"

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <ctype.h>
#include <netdb.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/utsname.h>
#include <sys/time.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <errno.h>


/*
 * Open a TCP connection to serverName:port and tune the socket.
 *
 * The host name is resolved with gethostbyname() and only an IPv4 result
 * is accepted.  After a successful connect the socket gets SO_REUSEADDR,
 * SO_LINGER (1 second), TCP_NODELAY, a 65500-byte send buffer and
 * 20-second send/receive timeouts.  Failures of the individual
 * setsockopt() calls are ignored: they are tuning only.
 *
 * Returns the connected socket descriptor, or -1 when the name cannot be
 * resolved to an IPv4 address, the socket cannot be created, or the
 * connect fails.
 */
static int openSocket( char * serverName, int port )
{
   int                 s;
   int                 sock_parm;
   int                 saved_errno;
   struct hostent *    server;
   struct sockaddr_in  addr;
   struct linger       linger;
   struct timeval      tv;

   errno = 0;

   /* 
    * check the server name.
    * BEWARE: gethostbyname is NOT thread-safe TODO: replace by alternatives
    */
   server = gethostbyname( serverName );
   if (server == NULL)
      return -1;

   /*
    * Only an IPv4 address fits in a sockaddr_in; refuse anything else
    * instead of copying the first bytes of a different address type.
    */
   if (server->h_addrtype != AF_INET  ||
       server->h_length != (int) sizeof(addr.sin_addr)  ||
       server->h_addr_list[0] == NULL)
   {
      return -1;
   }

   /*
    * Build the address right away (the resolver result lives in static
    * storage) and zero the whole structure so that sin_zero is defined.
    */
   memset( &addr, 0, sizeof(addr) );
   addr.sin_family = AF_INET;
   memcpy( (void *) &addr.sin_addr, (void *) server->h_addr_list[0], sizeof(addr.sin_addr) );
   addr.sin_port = htons( (unsigned short) port );

   /*
    * create the socket to connect to the daemon.
    */
   s = socket( AF_INET, SOCK_STREAM, 0 );
   if (s < 0)
      return -1;

   if (connect( s, (struct sockaddr *) &addr, sizeof(addr) ) < 0)
   {
      /* keep the errno of connect(); close() may overwrite it */
      saved_errno = errno;
      close( s );
      errno = saved_errno;
      return -1;
   }

   /*
    * Allow server-side addresses to be reused (don't have to wait for timeout).
    * Turn off NAGLE.
    */
   sock_parm = 1;
   setsockopt( s, SOL_SOCKET, SO_REUSEADDR, (void *) &sock_parm, sizeof(sock_parm) );

   linger.l_onoff = 1;
   linger.l_linger = 1;
   setsockopt( s, SOL_SOCKET, SO_LINGER, (void *) &linger, sizeof(linger) );

   sock_parm = 1;
   setsockopt( s, IPPROTO_TCP, TCP_NODELAY, (void *) &sock_parm, sizeof(sock_parm) );

   sock_parm = 65500;
   setsockopt( s, SOL_SOCKET, SO_SNDBUF, (void *) &sock_parm, sizeof(sock_parm) );

   /*
    *  Prevent excessive blocking on communication with the URLfilterDB webserver.
    */
   tv.tv_sec = 20;
   tv.tv_usec = 0;
   setsockopt( s, SOL_SOCKET, SO_RCVTIMEO, (void *) &tv, sizeof(tv) );

   tv.tv_sec = 20;
   tv.tv_usec = 0;
   setsockopt( s, SOL_SOCKET, SO_SNDTIMEO, (void *) &tv, sizeof(tv) );

   return s;
}


/*
 * API function: UFDBuploadUncategorisedURLs
 *
 * Any saved uncategorised URLs are uploaded to URLfilterDB for categorisation.
 *
 * The saved URLs are sent as the body of an HTTP POST to
 * UFDB_UPLOAD_UNCATEGORISED_URLS_WEBSITE on port 80.  'agent' is appended
 * to the User-Agent header; a NULL agent is sent as "unknown".
 *
 * Return values:
 *    UFDB_API_OK          nothing to upload, or the upload was attempted.
 *                         The transfer is best effort: a failed or partial
 *                         write is not reported and the server response is
 *                         not read.
 *    UFDB_API_ERR_NOMEM   the message buffer could not be allocated or was
 *                         too small to hold the request.
 *    UFDB_API_ERR_SOCKET  no connection to the upload server could be made.
 *
 * The URL buffer is reset on every path except a failed allocation.
 */
int UFDBuploadUncategorisedURLs( char * agent )
{
   const char *   agentName;
   char *         URLs;
   char *         message;
   size_t         size;
   int            length;
   int            s;
   int            nbytes;
   int            written;
   struct utsname sysinfo;

   URLs = UFDBretrieveUncategorisedURLs();
   length = strlen( URLs );
   if (length == 0)
   {
      UFDBresetUncategorisedURLs();
      return UFDB_API_OK;
   }

   /* passing NULL to a %s conversion is undefined: use a placeholder */
   agentName = (agent != NULL) ? agent : "unknown";

   /*
    * 1024 bytes for the fixed headers and system information, plus room
    * for the two variable-length parts: the URL list and the agent name.
    */
   size = 1024 + (size_t) length + strlen( agentName );
   message = ufdbMalloc( size );
   if (message == NULL)
      return UFDB_API_ERR_NOMEM;

   if (uname( &sysinfo ) != 0)
   {
      strcpy( sysinfo.nodename, "unknown" );
      strcpy( sysinfo.sysname, "unknown" );
      strcpy( sysinfo.release, "0" );
   }
   else
   {
      sysinfo.nodename[ sizeof(sysinfo.nodename)-1 ] ='\0';
      sysinfo.sysname[ sizeof(sysinfo.sysname)-1 ] ='\0';
      sysinfo.release[ sizeof(sysinfo.release)-1 ] ='\0';
   }

   /*
    * Build the complete request before connecting, bounded by the size
    * of the buffer so that unexpectedly long fields cannot overflow it.
    */
   nbytes = snprintf( message, size,
                      "POST %s HTTP/1.1\r\n"
                      "Host: %s\r\n"
                      "User-Agent: API-" VERSION "-%s\r\n"
                      "Content-Type: text/plain\r\n"
                      "Content-Length: %d\r\n"
                      "Connection: close\r\n"
                      "X-SiteInfo: API 0 %s %s\r\n"
                      "X-NodeName: %s\r\n"
                      "\r\n"
                      "%s\r\n",
                      UFDB_UPLOAD_UNCATEGORISED_URLS_CGI,
                      UFDB_UPLOAD_UNCATEGORISED_URLS_WEBSITE,
                      agentName,
                      length,
                      sysinfo.sysname, sysinfo.release,
                      sysinfo.nodename,
                      URLs );
   if (nbytes < 0  ||  (size_t) nbytes >= size)
   {
      /* a truncated request must not be sent: drop this batch */
      ufdbFree( message );
      UFDBresetUncategorisedURLs();
      return UFDB_API_ERR_NOMEM;
   }
   length = nbytes;

   s = openSocket( UFDB_UPLOAD_UNCATEGORISED_URLS_WEBSITE, 80 );
   if (s < 0)
   {
      ufdbFree( message );              /* do not leak the request buffer */
      UFDBresetUncategorisedURLs();
      return UFDB_API_ERR_SOCKET;
   }

   /*
    * Send the request; write() may accept fewer bytes than asked for.
    * Only an interrupted call (EINTR) is retried, any other error or a
    * zero-length write ends the transfer.
    */
   written = 0;
   while (length > 0)
   {
      nbytes = write( s, message+written, length );
      if (nbytes == 0)
         break;
      if (nbytes < 0)
      {
         if (errno != EINTR)
            break;
      }
      else
      {
         written += nbytes;
         length -= nbytes;
      }
   }

   close( s );

   ufdbFree( message );

   UFDBresetUncategorisedURLs();
   return UFDB_API_OK;
}


/* Serialises updates of url_history and url_history_index. */
static pthread_mutex_t mutex_url_history = UFDB_STATIC_MUTEX_INIT;

/*
 * url_history is a NUL-terminated string of saved domain names, each one
 * followed by a '|'.
 * url_history_index is the offset at which the next domain is stored.
 * It is set to sizeof(url_history) or more to mark the buffer as full,
 * which blocks further additions until the buffer is reset.
 */
static char url_history[120000] = "";
static int  url_history_index = 0;


/*
 * API function: UFDBsaveUncategorisedURL
 *
 * The domainname of an uncategorised URL is stored in a buffer.
 * this buffer is uploaded to URLfilterDB with UFDBuploadUncategorisedURLs().
 *
 * A NULL domain or a domain without a '.' is ignored.  A domain of at
 * most 196 characters that is already in the buffer is not stored again.
 * A domain that does not fit marks the buffer as full.
 *
 * Return 1 if the buffer is full, 0 otherwise.
 */
int UFDBsaveUncategorisedURL( char * domain )
{
   size_t length;
   int    valid;
   int    result;
   char   b[200];

   /* inspect the argument before taking the lock to keep the locked section short */
   valid = (domain != NULL  &&  strchr( domain, '.' ) != NULL);
   length = valid ? strlen( domain ) : 0;

   result = 0;
   pthread_mutex_lock( &mutex_url_history );

   if ((size_t) url_history_index >= sizeof(url_history) - 2)
   {
      /* the buffer is full or blocked: reported even for an invalid domain */
      result = 1;
   }
   else if (!valid)
   {
      /* nothing to store */
   }
   else if ((size_t) url_history_index + length >= sizeof(url_history) - 2)
   {
      /* the domain, its '|' and the NUL do not fit: mark the buffer as full */
      url_history_index = (int) sizeof(url_history);
      result = 1;
   }
   else
   {
      int duplicate = 0;

      /* duplicates are only detected for domains that fit in b as "|domain|" */
      if (length <= sizeof(b) - 4)
      {
         /* the first entry of the buffer has no leading '|' */
         if (strncmp( url_history, domain, length ) == 0  &&
             url_history[length] == '|')
         {
            duplicate = 1;
         }
         else
         {
            snprintf( b, sizeof(b), "|%s|", domain );
            if (strstr( url_history, b ) != NULL)
               duplicate = 1;
         }
      }

      if (!duplicate)
      {
         /* append "domain|" and keep the buffer NUL-terminated */
         memcpy( &url_history[url_history_index], domain, length );
         url_history_index += (int) length;
         url_history[url_history_index] = '|';
         url_history_index++;
         url_history[url_history_index] = '\0';
      }
   }

   pthread_mutex_unlock( &mutex_url_history );

   return result;
}


/*
 * API function: UFDBretrieveUncategorisedURLs
 *
 * Retrieve the content of the buffer that contains the uncategorised URLs.
 * This must be followed by a UFDBresetUncategorisedURLs().
 */
char * UFDBretrieveUncategorisedURLs( void )
{
   pthread_mutex_lock( &mutex_url_history );
   url_history_index = sizeof(url_history) + 2;		/* prevent additions until reset */
   pthread_mutex_unlock( &mutex_url_history );

   return url_history;
}


/*
 * API function: UFDBresetUncategorisedURLs
 *
 * Discard all saved uncategorised URLs: the write index goes back to the
 * start of the buffer and the buffer becomes the empty string.
 */
void UFDBresetUncategorisedURLs( void )
{
   pthread_mutex_lock( &mutex_url_history );

   url_history_index = 0;
   *url_history = '\0';

   pthread_mutex_unlock( &mutex_url_history );
}


/*
 * API function: UFDBVerifyURLisUncategorised
 *
 * A URL is checked against *all* categories (including "checked")
 *
 * URL           the URL as text; only used for the private address test.
 * revURL        the same URL in the form that UFDBlookupRevUrl() expects.
 * category      the categories to search.
 * n_categories  the number of elements of category.
 *
 * A URL that starts with "10.", "127." or "192.168." is never reported
 * as uncategorised.
 *
 * Return 1 if the URL is uncategorised, 0 otherwise.
 */
int UFDBVerifyURLisUncategorised( 
   char *             URL,
   UFDBrevURL *       revURL,  
   UFDBusedCategory   category[],
   int                n_categories  )
{
   int i;

   /*
    * skip private IP addresses 10.*, 127.* and 192.168.*
    * The cast is required: isdigit() is undefined for a negative char value.
    */
   if (isdigit( (unsigned char) *URL ))
   {
      if (strncmp( URL, "10.", 3 ) == 0  ||
          strncmp( URL, "127.", 4 ) == 0 ||
          strncmp( URL, "192.168.", 8 ) == 0)
      {
         return 0;
      }
   }

   /* a match in the table of any category means that the URL is categorised */
   for (i = 0; i < n_categories; i++)
   {
      UFDBcategory *  cat;

      cat = &(category[i].handle);
      if (UFDBlookupRevUrl( &(cat->c_table.table.nextLevel[0]), revURL ))
         return 0;
   }

   return 1;
}

