/*************************************************************************
 *
 *  $RCSfile: Crawler.java,v $
 *
 *  $Revision: 1.1 $
 *
 *  last change: $Author: abi $ $Date: 2000/11/30 18:03:07 $
 *
 *  The Contents of this file are made available subject to the terms of
 *  either of the following licenses
 *
 *         - GNU Lesser General Public License Version 2.1
 *         - Sun Industry Standards Source License Version 1.1
 *
 *  Sun Microsystems Inc., October, 2000
 *
 *  GNU Lesser General Public License Version 2.1
 *  =============================================
 *  Copyright 2000 by Sun Microsystems, Inc.
 *  901 San Antonio Road, Palo Alto, CA 94303, USA
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License version 2.1, as published by the Free Software Foundation.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 *  MA  02111-1307  USA
 *
 *
 *  Sun Industry Standards Source License Version 1.1
 *  =================================================
 *  The contents of this file are subject to the Sun Industry Standards
 *  Source License Version 1.1 (the "License"); You may not use this file
 *  except in compliance with the License. You may obtain a copy of the
 *  License at http://www.openoffice.org/license.html.
 *
 *  Software provided under this License is provided on an "AS IS" basis,
 *  WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING,
 *  WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
 *  MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
 *  See the License for the specific provisions governing your rights and
 *  obligations concerning the Software.
 *
 *  The Initial Developer of the Original Code is: Sun Microsystems, Inc.
 *
 *  Copyright: 2000 by Sun Microsystems, Inc.
 *
 *  All Rights Reserved.
 *
 *  Contributor(s): _______________________________________
 *
 *
 ************************************************************************/
package com.sun.xmlsearch.indexer;

import sunw.html.*;
import java.net.*;
import java.io.*;
import java.text.*;
import java.util.*;

/**
 * Worker thread that fetches a single URL.  If the response is an HTML
 * page, the raw bytes are stored in a file below
 * <code>crawlDir/PAGES/&lt;random 0..63&gt;/&lt;hash 0..63&gt;/</code> and
 * the page is handed to {@link Crawler#readDocument} to extract its links.
 * Whatever happens, {@link Status#threadFinished} is called exactly once
 * when the thread is done.
 */
class CrawlerThread extends Thread {
  private Crawler crawler;
  private Status status;
  private String urlString;
  private String crawlDir;

  /**
   * @param argCrawler crawler used as the lock for file storage and as the
   *                   link extractor
   * @param argStatus  shared crawl state that is notified on completion
   * @param url        the URL to fetch
   * @param dir        root directory under which pages are stored
   */
  public CrawlerThread(Crawler argCrawler, Status argStatus,
                       String url, String dir) {
    crawler = argCrawler;
    status = argStatus;
    urlString = url;
    crawlDir = dir;
  }

  /**
   * Fetches the URL, stores the page and extracts its links.  Any
   * exception is reported on <code>System.err</code> and results in
   * <code>threadFinished(null, null, null)</code>.
   */
  public void run() {
    Vector links = null;
    String path = null;
    InputStream in = null;
    boolean errors = false;
    try {
      URL url = new URL(urlString);
      URLConnection conn = url.openConnection();
      conn.setDoInput(true);
      conn.connect();
      String contentType = conn.getContentType();
      in = conn.getInputStream();
      if (isHtml(contentType)) {
        byte[] contents = readFully(in);
        if (contents.length > 0) {
          // compute file name
          String dirName = crawlDir + "/PAGES/" +
            String.valueOf((int)(Math.random() * 64.0));
          // Take the remainder before the absolute value:
          // Math.abs(Integer.MIN_VALUE) is still negative, whereas the
          // remainder is always within -63..63.
          dirName += "/" +
            String.valueOf(Math.abs(urlString.hashCode() % 64));
          String fileName =
            String.valueOf(System.currentTimeMillis()) + ".html";

          synchronized (crawler) {
            File directory = new File(dirName);
            if (directory.exists() || directory.mkdirs()) {
              path = dirName + "/" + fileName;
              FileOutputStream outs = new FileOutputStream(path);
              try {
                outs.write(contents);
              }
              finally {
                // release the file handle even when the write fails
                outs.close();
              }

              Reader reader =
                new InputStreamReader(new ByteArrayInputStream(contents));
              // compete for this synchronized method
              links = crawler.readDocument(urlString, reader);
            }
            else
              System.err.println("couldn't open " + dirName + "/" + fileName);
          } // end synchronized
        }
      }
      else
        System.out.println(urlString + " not html");
    }
    catch (Exception e) {
      System.err.println(urlString + ' ' + e);
      errors = true;
    }
    finally {
      if (in != null) {
        try {
          in.close();
        } catch (IOException e) {
          System.err.println(e);
        }
      }
      if (errors)
        status.threadFinished(null, null, null);
      else
        status.threadFinished(links, urlString, path);
    }
  }

  /**
   * Tells whether a Content-Type header value denotes an HTML document.
   * Only the media type is compared, so parameters such as
   * <code>; charset=ISO-8859-1</code> and differences in case are ignored.
   *
   * @param contentType header value, may be <code>null</code>
   * @return <code>true</code> if the media type is <code>text/html</code>
   */
  private static boolean isHtml(String contentType) {
    if (contentType == null)
      return false;
    int semicolon = contentType.indexOf(';');
    String mediaType = (semicolon == -1)
      ? contentType
      : contentType.substring(0, semicolon);
    return mediaType.trim().equalsIgnoreCase("text/html");
  }

  /**
   * Reads the stream up to its end.
   *
   * @param in stream to drain; it is not closed here
   * @return all bytes read, an empty array if the stream was already at
   *         its end
   * @throws IOException if reading fails
   */
  private static byte[] readFully(InputStream in) throws IOException {
    byte[] buffer = new byte[2048 * 8];
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    int bytesRead;
    while ((bytesRead = in.read(buffer)) != -1)
      out.write(buffer, 0, bytesRead);
    return out.toByteArray();
  }
}

/**
 * Shared state of a crawl: the queue of URLs still to be fetched, the
 * number of fetches currently in flight, and the table-of-contents writer.
 * All methods synchronize on this object.
 *
 * Intended protocol for the scheduling thread: call
 * {@link #somethingToDo()} and, while it returns <code>true</code>, call
 * {@link #nextURL()} and start a worker for the returned URL.  Every
 * worker must call {@link #threadFinished} exactly once.
 */
class Status {
  /** Upper bound on the number of fetches in flight at the same time. */
  private static final int MAX_THREAD_COUNT = 30;

  private Vector urlsToIndex = new Vector(100000);
  private int threadCount = 0;
  private int docsCounter = 0;
  private FileWriter _toc;

  /**
   * @param toc writer that receives one "url, newline, path, newline"
   *            record per stored page
   */
  public Status(FileWriter toc) {
    _toc = toc;
  }

  /**
   * Queues a URL for fetching and wakes up any thread waiting for work.
   *
   * @param urlString the URL to queue
   */
  public synchronized void addURL(String urlString) {
    urlsToIndex.addElement(urlString);
    notifyAll();
  }

  /**
   * Tells whether there is a URL to hand out.  While the queue is empty
   * but fetches are still in flight the answer is not known yet (they may
   * add links), so the call blocks until either a URL is queued or the
   * last fetch has finished.  Answering <code>true</code> merely because
   * fetches are in flight would send the caller into {@link #nextURL()},
   * where it would wait forever if those fetches yield no links.
   *
   * If the calling thread is interrupted while waiting, its interrupt
   * flag is restored and the current state is reported without further
   * waiting.
   *
   * @return <code>true</code> if URLs are queued (or, after an interrupt,
   *         if fetches are still in flight); <code>false</code> once the
   *         queue is empty and no fetch is running
   */
  public synchronized boolean somethingToDo() {
    try {
      while (urlsToIndex.isEmpty() && threadCount > 0)
        wait();
    }
    catch (InterruptedException e) {
      // no checked exception in the signature: hand the interrupt on
      Thread.currentThread().interrupt();
    }
    return (urlsToIndex.size() > 0) || (threadCount > 0);
  }

  /**
   * Removes and returns a randomly chosen queued URL and counts one more
   * fetch in flight.  Blocks while the queue is empty or while
   * {@link #MAX_THREAD_COUNT} fetches are already running.
   *
   * @return the URL to fetch next
   * @throws InterruptedException if interrupted while waiting
   */
  public synchronized String nextURL() throws InterruptedException {
    while (threadCount >= MAX_THREAD_COUNT || urlsToIndex.isEmpty())
      wait();
    ++threadCount;
    System.out.println(threadCount + " threads running (+)");
    // Math.random() is < 1.0, so scaling by size() yields 0 .. size()-1
    // and every queued URL, including the last one, can be picked.
    int n = (int)(Math.random() * urlsToIndex.size());
    System.out.println("picking " + n);
    return (String)urlsToIndex.remove(n);
  }

  /**
   * Records the end of one fetch: decrements the in-flight count, queues
   * the links found, and appends a record to the table of contents if the
   * page was stored.
   *
   * @param links URLs (strings) discovered by the fetch, or
   *              <code>null</code> if there are none
   * @param url   the URL that was fetched; only used when
   *              <code>path</code> is not <code>null</code>
   * @param path  file the page was stored in, or <code>null</code> if
   *              nothing was stored
   */
  public synchronized void threadFinished(Vector links, String url, String path) {
    --threadCount;
    // waiters re-check their condition once this method releases the lock
    notifyAll();
    if (links != null)
      urlsToIndex.addAll(links);
    ++docsCounter;
    System.out.println(docsCounter + " documents read\n" +
                       threadCount + " threads running (-)\n" +
                       urlsToIndex.size() + " documents to fetch");
    if (path != null) {
      try {
        _toc.write(url + '\n' + path + '\n');
        _toc.flush();
      }
      catch (IOException e) {
        System.err.println("failed to toc " + url);
      }
    }
  }
}

/**
 * Simple multi-threaded crawler.  Starting from a seed URL it starts one
 * {@link CrawlerThread} per URL handed out by {@link Status}; each thread
 * stores the page under the crawl directory and calls back into
 * {@link #readDocument} to extract further links.  Only URLs accepted by
 * {@link #indexable} are followed.
 */
public class Crawler {

  /**
   * HTML parser that remembers the <code>href</code> attribute of the
   * tags passed to {@link #handleEndTag} whose element is named "a".
   */
  class Parser2 extends sunw.html.Parser {
    private Vector hrefs = new Vector();

    /**
     * Turns the hrefs collected since the last call into crawlable URLs
     * and clears the collection.
     *
     * @param urlString URL of the document that was just parsed
     * @return the result of {@link Crawler#computeLinks}
     */
    public Vector updateLinks(String urlString) {
      Vector result = computeLinks(urlString, hrefs);
      hrefs.clear();
      return result;
    }

    /**
     * Records the href of an "a" tag, if it has one.
     * NOTE(review): presumably invoked by sunw.html.Parser for each end
     * tag -- confirm against that class.
     */
    protected void handleEndTag(Tag tag) {
      if (tag.getElement().getName().equals("a")) {
        Attributes attrs = tag.getAttributes();
        if (attrs != null) {
          String value = attrs.get("href");
          if (value != null)
            hrefs.addElement(value);
        }
      }
    }
  }

  private DTD dtd = null;
  private Parser2 parser;
  private PrintStream verbose = null;
  /** Every URL (as a string) that has been queued for fetching. */
  private Hashtable urlsSeen = new Hashtable(100000);
  private Status status;

  // for storing pages in files
  private String crawlDir = null;
  private FileWriter toc;

  /**
   * Resolves the hrefs of a page against the page's URL and returns those
   * that should be crawled: indexable, not the page itself, and not
   * queued before.  Returned URLs are recorded in <code>urlsSeen</code>.
   * Fragments ("#...") and a trailing "/" are removed from each href
   * before it is resolved.
   *
   * @param urlString URL of the page the hrefs were found in
   * @param hrefs     href attribute values (strings) in document order
   * @return the new URLs as strings, or <code>null</code> if
   *         <code>urlString</code> is not a valid URL
   */
  public Vector computeLinks(String urlString, Vector hrefs) {
    URL base = null;
    try {
      base = new URL(urlString);
    }
    catch (MalformedURLException e) {
      System.err.println(e);
      return null;
    }
    Vector result = new Vector();
    // Keyed by the string form of the URL: hashing or comparing
    // java.net.URL objects resolves host names, which would block every
    // crawler thread because this method runs under the crawler lock.
    Hashtable uniq = new Hashtable(hrefs.size() + 1);
    String self = base.toString();
    uniq.put(self, self);       // get rid of self-reference
    for (int i = 0; i < hrefs.size(); i++) {
      String link = (String)hrefs.elementAt(i);
      int anchor = link.lastIndexOf('#');
      if (anchor != -1)
        link = link.substring(0, anchor);
      if (link.endsWith("/"))
        link = link.substring(0, link.length() - 1);

      try {
        String resolved = new URL(base, link).toString();
        if (uniq.get(resolved) == null) {   // not seen on this page yet
          uniq.put(resolved, resolved);     // the value is irrelevant
          if (indexable(resolved) && urlsSeen.get(resolved) == null) {
            result.addElement(resolved);
            urlsSeen.put(resolved, resolved);
          }
        }
      }
      catch (MalformedURLException e) {
        System.err.println(e);
      }
    }
    return result;
  }

  /**
   * Creates a crawler.  If the file <code>resources/properties</code> can
   * be read, its entries are layered over the current system properties
   * and installed as the new system properties.
   */
  public Crawler() {
    Properties props = new Properties(System.getProperties());
    InputStream in = null;
    try {
      in = new FileInputStream("resources/properties");
      props.load(in);
      System.setProperties(props);
    } catch (IOException e) {
      System.err.println("No system properties file");
    } finally {
      if (in != null) {
        try {
          in.close();
        } catch (IOException e) {
          System.err.println(e);
        }
      }
    }
  }

  /**
   * Crawls starting at <code>urlString</code>, storing pages below
   * <code>dir</code> and writing the table of contents to
   * <code>dir/TOC</code>.  Failures are reported with a stack trace.
   *
   * @param urlString seed URL
   * @param dir       existing directory that receives TOC and pages
   */
  public void crawlFrom(String urlString, String dir) {
    crawlDir = dir;
    try {
      toc = new FileWriter(crawlDir + "/TOC");
      try {
        status = new Status(toc);
        dtd = DTD.getDTD("html32");
        parser = new Parser2();
        crawl(urlString);
      }
      finally {
        // close the table of contents even if the crawl was aborted
        toc.close();
      }
    }
    catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * Queues the seed and keeps starting worker threads for as long as
   * {@link Status#somethingToDo()} reports work.  Whether the loop ends,
   * and whether workers are still running when it does, is decided
   * entirely by {@link Status}.
   */
  private void crawl(String urlString) throws Exception {
    // remember the seed so that a link back to it is not fetched again
    urlsSeen.put(urlString, urlString);
    status.addURL(urlString);
    while (status.somethingToDo())
      tryStartingThread();
    System.out.println("done");
  }

  /**
   * Takes the next URL from {@link Status} (blocking until one may be
   * started) and starts a worker thread for it.
   */
  private void tryStartingThread() throws InterruptedException {
    String urlString = status.nextURL();
    CrawlerThread thread =
      new CrawlerThread(this, status, urlString, crawlDir);
    thread.start();
  }

  /**
   * Parses an HTML document and returns the links to crawl next.
   * Synchronized because the single parser instance and
   * <code>urlsSeen</code> are shared by all worker threads.
   *
   * @param urlString URL the document was fetched from
   * @param in        document text; closed after parsing succeeds
   * @return see {@link #computeLinks}
   * @throws Exception whatever the parser or the reader throws
   */
  public synchronized Vector readDocument(String urlString, Reader in)
    throws Exception {
      System.out.println("reading: " + urlString);
      parser.parse(in, dtd);
      in.close();
      return parser.updateLinks(urlString);
  }


  /** Host-name suffixes that are crawled. */
  private String[] domains =
  {".central", ".corp", ".eng", ".east", ".ebay", ".west", ".sun.com"};

  /** URL suffixes (lower case) that are never fetched. */
  private String[] excludedExtensions =
  {".ps", ".doc", ".ppt", ".gif", ".jpg", ".jpeg", ".mpeg", ".z", ".zip",
   ".tar", ".pdf", ".txt", ".au", ".aw", ".tiff", ".bin"};

  /**
   * Prunes links to non-indexable data.  A URL is accepted if it is at
   * most 80 characters long, uses <code>http://</code>, contains neither
   * a query ('?') nor "sessionid", does not end in an excluded extension,
   * and its host name ends in one of the accepted domains.  The
   * comparison is done on the lower-cased URL.
   *
   * @param url absolute URL
   * @return <code>true</code> if the URL should be crawled
   */
  public boolean indexable(String url) {
    if (url.length() > 80)
      return false;

    String http = "http://";
    String lower = url.toLowerCase();
    // only http
    if (lower.startsWith(http) == false)
      return false;
    if (lower.indexOf('?') != -1)
      return false;
    if (lower.indexOf("sessionid") != -1)
      return false;

    for (int i = 0; i < excludedExtensions.length; i++)
      if (lower.endsWith(excludedExtensions[i]))
        return false;

    // Isolate the host: cut off the path first, then the port.  Looking
    // for ':' first would treat a colon inside the path as the port
    // separator and leave part of the path in the "host", so that e.g.
    // http://other.org/x.sun.com:80 would pass the domain check.
    String stripped = lower.substring(http.length());
    int slash = stripped.indexOf('/');
    if (slash != -1)
      stripped = stripped.substring(0, slash);
    int colon = stripped.indexOf(':');
    if (colon != -1)
      stripped = stripped.substring(0, colon);
    System.out.println(stripped);

    for (int i = 0; i < domains.length; i++)
      if (stripped.endsWith(domains[i]))
        return true;

    System.out.println("pruned");
    return false;
  }

  /**
   * Command-line entry point.
   *
   * @param args <code>args[0]</code> is the seed URL,
   *             <code>args[1]</code> the crawl directory
   */
  public static void main(String[] args) {
    if (args.length < 2) {
      System.err.println("usage: java com.sun.xmlsearch.indexer.Crawler"
                         + " <start-url> <crawl-dir>");
      System.exit(1);
    }
    Crawler crawler = new Crawler();
    crawler.crawlFrom(args[0], args[1]);
  }
}
