/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/*
 * Pan - A Newsreader for Gtk+
 * Copyright (C) 2002-2006  Charles Kerr <charles@rebelbase.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <config.h>
#include <string>
#include <vector>
#include <string.h>
#include <glib.h>
extern "C" {
#include <glib/gi18n.h>
}
#include <locale.h>
#include <gmime/gmime.h>
#include <pan/general/debug.h>
#include <pan/general/foreach.h>
#include <pan/general/log.h>
#include "utf8-utils.h"

using namespace pan;

namespace
{
  /**
   * Maps a locale name to the legacy (non-UTF-8) charset that text written
   * under that locale is most likely to use.
   */
  struct LocaleStruct
  {
    const char *locale, *charset;
  }
  /*
   * Table searched by get_closest_locale().
   *
   * Order matters: entry 0 is what get_closest_locale() returns for the "C"
   * locale and when nothing matches at all, and when two entries share an
   * equally long prefix with the current locale, the earlier one wins.
   */
  locales[] =
  {
    {"en_US",        "ISO-8859-1"},
    {"pt_BR",        "ISO-8859-1"},
    {"ca_ES",        "ISO-8859-15"},
    {"zh_CN.GB2312", "gb2312"},
    {"zh_TW.Big5",   "big5"},
    {"cs_CZ",        "ISO-8859-2"},
    {"da_DK",        "ISO-8859-1"},
    {"de_DE",        "ISO-8859-15"},
    {"nl_NL",        "ISO-8859-15"},
    {"et_EE",        "ISO-8859-15"},
    {"fi_FI",        "ISO-8859-15"},
    {"fr_FR",        "ISO-8859-15"},
    {"el_GR",        "ISO-8859-7"},
    {"hu_HU",        "ISO-8859-2"},
    {"it_IT",        "ISO-8859-15"},
    {"ja_JP",        "ISO-2022-jp"},
    {"ko_KR",        "euc-kr"},
    {"lv_LV",        "ISO-8859-13"},
    {"lt_LT",        "ISO-8859-13"},
    {"no_NO",        "ISO-8859-1"},
    {"pl_PL",        "ISO-8859-2"},
    {"pt_PT",        "ISO-8859-15"},
    {"ro_RO",        "ISO-8859-2"},
    {"ru_RU",        "KOI8-R"},
    {"ru_SU",        "ISO-8859-5"},
    {"sk_SK",        "ISO-8859-2"},
    {"es_ES",        "ISO-8859-15"},
    {"sv_SE",        "ISO-8859-1"},
    {"tr_TR",        "ISO-8859-9"},
    // Ukrainian is uk_UA; "uk_UK" is not a real locale name.
    {"uk_UA",        "KOI8-U"}
  };

  /* get_closest_locale:
   * Returns the index into locales[] of the entry sharing the longest
   * leading run of characters with the current LC_CTYPE locale, so that
   * e.g. a user running en_GB ends up with the en_US entry.
   *
   * Returns 0 for the "C" locale and when no entry shares even one leading
   * character; returns -1 if the current locale cannot be queried.
   * When several entries tie, the first one in the table is kept.
   *
   * This function is lifted from Balsa.
   */
  gint
  get_closest_locale (void)
  {
    const char * locale = setlocale (LC_CTYPE, NULL);

    g_return_val_if_fail (locale != NULL, -1);

    // The NULL test is repeated on purpose: the guard above is compiled
    // out when GLib is built with G_DISABLE_CHECKS.
    if (!locale || strcmp(locale, "C") == 0)
      return 0;

    guint best_index = 0;
    guint best_length = 0;

    for (guint idx = 0; idx < G_N_ELEMENTS(locales); ++idx) {
      const char * candidate = locales[idx].locale;

      // count how many leading characters the two names have in common
      guint common = 0;
      while (locale[common] && candidate[common] == locale[common])
        ++common;

      // strictly greater: an earlier entry wins a tie
      if (common > best_length) {
        best_length = common;
        best_index = idx;
      }
    }

    return best_index;
  }

  const char * PAN_DEFAULT_CHARSET = "ISO-8859-1";

  const char *
  get_charset_from_locale (void)
  {
    gint loc_idx = get_closest_locale ();
    return loc_idx != -1 ? locales[loc_idx].charset : PAN_DEFAULT_CHARSET;
  }

  /**
   * Converts a block of text from one charset to another by piping it
   * through a GMime charset filter.
   *
   * @param to_charset    name of the charset to convert into
   * @param from_charset  name of the charset `text' is taken to be in
   * @param text          the bytes to convert
   * @param text_len      number of bytes of `text' to convert
   * @return a newly-allocated, zero-terminated string that the caller must
   *         release with g_free(), or NULL if writing through the streams
   *         failed.
   *
   * If GMime cannot create a filter for the requested charset pair, the
   * text is copied through unconverted rather than reported as an error,
   * so callers must validate the result themselves.
   */
  char*
  g_mime_charset_strndup (const char     * to_charset,
                          const char     * from_charset,
                          const char     * text,
                          int              text_len)
  {
    char * retval;
    gssize out_len;
    GMimeStream * in_stream;
    GMimeStream * in_stream_filter;
    GMimeFilter * charset_filter;
    GMimeStream * out_stream;
    GByteArray * byte_array;

    /* set up an input stream with the desired charset filter */
    in_stream = g_mime_stream_mem_new_with_buffer (text, text_len);
    in_stream_filter = g_mime_stream_filter_new_with_stream (in_stream);
    charset_filter = g_mime_filter_charset_new (from_charset, to_charset);

    /* g_mime_filter_charset_new() yields NULL for a charset pair it cannot
       handle.  Don't hand that NULL to GMime/GObject, which would only log
       critical warnings; the text then simply passes through unfiltered. */
    if (charset_filter != NULL)
      g_mime_stream_filter_add (GMIME_STREAM_FILTER(in_stream_filter), charset_filter);

    /* set up an output stream attached to a byte array */
    byte_array = g_byte_array_new ();
    out_stream = g_mime_stream_mem_new ();
    g_mime_stream_mem_set_byte_array (GMIME_STREAM_MEM(out_stream), byte_array);

    /* write the input stream to the output stream */
    out_len = g_mime_stream_write_to_stream (in_stream_filter, out_stream);

    /* if the write was successful, zero-terminate the string and return it. */
    if (out_len < 0) {
      retval = NULL;
      g_byte_array_free (byte_array, TRUE);
    } else {
      g_byte_array_append (byte_array, (guint8*)"", 1);
      retval = (gchar*) byte_array->data;
      /* FALSE: release the array wrapper but keep the data we are returning */
      g_byte_array_free (byte_array, FALSE);
    }

    /* cleanup */
    g_object_unref (G_OBJECT(out_stream));
    g_object_unref (G_OBJECT(in_stream_filter));
    if (charset_filter != NULL)
      g_object_unref (G_OBJECT(charset_filter));
    g_object_unref (G_OBJECT(in_stream));

    return retval;
  }
}

char*
pan :: clean_utf8 (const char * in,
                   int          in_len)
{
  GString * str = g_string_new (NULL);
  const char *end;
  char * retval;

  while (g_utf8_validate (in, in_len, &end) == FALSE) {
    if (end > in)
      g_string_append_len (str, in, end-in);
    g_string_append_c (str, '?');
    if (in_len != -1)
      in_len -= (1+end-in);
    in = end+1;
  }

  g_string_append_len (str, in, in_len);
  retval = str->str;
  g_string_free (str, FALSE);

  return retval;
}

namespace
{
  /**
   * Converts `text_len' bytes of `text' from `from_charset' to UTF-8 and
   * verifies the outcome.
   *
   * @return a newly-allocated string holding valid UTF-8 (release with
   *         g_free()), or NULL if the conversion failed or its output did
   *         not validate as UTF-8.
   */
  char* convert_to_valid_utf8 (const char   * from_charset,
                               const char   * text,
                               int            text_len)
  {
    char * converted = g_mime_charset_strndup ("UTF-8", from_charset, text, text_len);

    // g_mime_charset_strndup() returns NULL on failure, and
    // g_utf8_validate() must not be handed a NULL string.
    if (converted != NULL && !g_utf8_validate (converted, -1, NULL)) {
      g_free (converted);
      converted = NULL;
    }

    return converted;
  }

  /**
   * Converts a raw article header to a UTF-8 string.
   *
   * @param header      the header text; may be NULL
   * @param header_len  number of bytes in `header', or negative if `header'
   *                    is zero-terminated
   * @param charset     charset to try first for unencoded 8bit text;
   *                    may be NULL or empty
   * @return a newly-allocated string to be released with g_free(), or 0 if
   *         `header' is NULL.
   */
  char* h_to_utf8 (const char   * header,
                   int            header_len,
                   const char   * charset)
  {
    char * retval (0);

    if (!header)
      return 0;

    if (header_len < 0)
      header_len = strlen (header);

    // the header contains "=?", so treat it as properly encoded and let
    // GMime decode it
    if (g_strstr_len (header, header_len, "=?") != NULL) {
      guchar * pch = (guchar*) g_strndup (header, header_len);
      retval = g_mime_utils_8bit_header_decode (pch);
      g_free (pch);
    }

    // raw 8bit text that isn't UTF-8: guess at the charset
    else if (g_mime_utils_text_is_8bit ((const unsigned char*)header, header_len)
         && !g_utf8_validate (header, header_len, NULL))
    {
      // first choice: the charset suggested by the caller
      if (charset && *charset)
        retval = convert_to_valid_utf8 (charset, header, header_len);

      // second choice: the charset implied by the user's locale
      if (retval == NULL)
        retval = convert_to_valid_utf8 (get_charset_from_locale (), header, header_len);

      // last resort: keep the valid bytes and mask out the rest.
      // (A successful conversion above has already been validated,
      // so it needs no further cleaning.)
      if (retval == NULL)
        retval = clean_utf8 (header, header_len);
    }

    // fallback
    if (retval == NULL)
      retval = g_strndup (header, header_len);

    return retval;
  }
}

/**
 * Converts the header text referenced by `view' to UTF-8.
 *
 * @param view                      the header text and its length
 * @param fallback_charset_or_null  charset to try for unencoded 8bit text;
 *                                  may be NULL
 * @return whatever h_to_utf8() produces for that text: a newly-allocated
 *         string to be released with g_free(), or 0 if the view holds no
 *         string.
 */
char*
pan :: header_to_utf8 (const StringView  & view,
                       const char        * fallback_charset_or_null)
{
  const char * const text = view.str;
  const int text_len = view.len;

  return h_to_utf8 (text, text_len, fallback_charset_or_null);
}

/**
 * Returns the content of a MIME part converted to UTF-8.
 *
 * The charset named in the part's Content-Type is tried first, with
 * `fallback_charset' as the second choice.  If the part names no charset,
 * `fallback_charset' becomes the first and only explicit choice.
 *
 * @param part              the MIME part to read
 * @param fallback_charset  charset to fall back on; may be NULL
 * @return a newly-allocated string to be released with g_free(), or 0 if
 *         `part' is not a GMimePart or has no content.
 */
char*
pan :: mime_part_to_utf8 (GMimePart     * part,
                          const char    * fallback_charset)
{
  g_return_val_if_fail (GMIME_IS_PART(part), 0);

  // check for an empty part.  The length is tested before the first byte
  // is looked at, so a zero-length buffer is never dereferenced and
  // content_to_utf8() is never handed a length of 0.
  size_t content_len (0);
  const char * content = g_mime_part_get_content (part, &content_len);
  if (!content || !content_len || !*content)
    return 0;

  // prefer the charset declared by the part itself
  const char * cpch = g_mime_object_get_content_type_parameter (GMIME_OBJECT (part), "charset");
  if (!cpch || !*cpch) {
    // nothing declared: promote the fallback so it isn't tried twice
    cpch = fallback_charset;
    fallback_charset = 0;
  }

  return content_to_utf8 (content, content_len, cpch, fallback_charset);
}

char*
pan :: content_to_utf8 (const char * content,
                        int          content_len,
                        const char * fallback_charset1,
                        const char * fallback_charset2)
{
  g_return_val_if_fail (content, 0);
  g_return_val_if_fail (content_len!=0, 0);

  // build a list of charsets to try
  typedef std::vector<std::string> strings_t;
  strings_t encodings;
  if (fallback_charset1 && *fallback_charset1) encodings.push_back (fallback_charset1);
  if (fallback_charset2 && *fallback_charset2) encodings.push_back (fallback_charset2);
  static const char* FALLBACK_ENCODINGS[] = { "CURRENT", "ISO-8859-15" };
  encodings.insert (encodings.end(),
                    FALLBACK_ENCODINGS,
                    FALLBACK_ENCODINGS + G_N_ELEMENTS(FALLBACK_ENCODINGS));


  // detect the content length if necessary.
  if (content_len < 0)
    content_len = strlen (content);

  // is it's already valid utf8?
  char * ret (0);
  if (g_utf8_validate (content, content_len, NULL))
    ret = g_strndup (content, content_len);

  // iterate through the charsets and try to convert to utf8.
  if (!ret) {
    foreach_const (strings_t, encodings, it)
      if ((ret = g_convert (content, content_len, "UTF-8", it->c_str(), 0, 0, 0)))
        break;
  }

  // if we couldn't figure it out, just strip out all the non-utf8 and hope for the best.
  if (!ret) {
    ret = clean_utf8 (content, content_len);
    Log::add_err (
      _("Couldn't determine article encoding.  Non-UTF8 characters were removed."));
  }

  return ret;
}
