# utf8_fixups
#
# $Id: utf8_fixups,v 1.44 2010/04/20 00:53:35 knowledgejunkie Exp $
#
# This file contains fixups for mis-encoded UTF-8 characters that are
# frequently seen in the source data from the Radio Times.
#
# When the grabber is run with --debug, a summary is created of the processed
# listings files that contain unhandled mis-encoded characters. In order to create a
# fixup, download the source data file from the Radio Times and examine the
# raw bytes to determine i) what the regex should search for, and ii) what the
# replacement characters should be.
#
# ** From analysis of the bad and replacement bytes, it is getting more obvious
# that it may be possible to handle the majority of the mis-encoded characters
# in the grabber directly when certain byte sequences are found **
#
# The file is split into four sections:
#
# 1) Mis-encoded characters in range [C3][A0-AF][C2][80-BF][C2][80-BF]
# 2) Mis-encoded characters in range [C3][B0-BF][C2][80-BF][C2][80-BF][C2][80-BF]
# 3) Mis-encoded single characters represented with [EF][BF][BD] bytes
# 4) Mis-encoded single characters represented with [C3][AF][C2][BF][C2][BD] bytes
#
# Where possible, only the affected bytes are included in the fixup to allow
# them to be applied to other listings data in addition to that which required
# the original fixup.
#
# Each entry comprises two pipe-separated fields:
#
#  i) the mis-encoded byte sequence to search for, and
# ii) the replacement byte sequence to substitute
#
# The bytes in both fields are written as \xNN hex escapes.
#
# A useful reference for the correct UTF-8 encodings of all Unicode characters
# is at http://www.eki.ee/letter/
#
################################################################################
#
#
# 1) Characters in range [C3][A0-AF][C2][80-BF][C2][80-BF]
#
# "ále"
\xC3\xA1\xC2\xAC\xC2\xA5|\xC3\xA1\x6C\x65
# "äfi"
\xC3\xA4\xC2\xA6\xC2\xA9|\xC3\xA4\x66\x69
# "ård"
\xC3\xA5\xC2\xB2\xC2\xA4|\xC3\xA5\x72\x64
# "çoi"
\xC3\xA7\xC2\xAF\xC2\xA9|\xC3\xA7\x6F\x69
# "èle"
\xC3\xA8\xC2\xAC\xC2\xA5|\xC3\xA8\x6C\x65
# "ère"
\xC3\xA8\xC2\xB2\xC2\xA5|\xC3\xA8\x72\x65
# "ève"
\xC3\xA8\xC2\xB6\xC2\xA5|\xC3\xA8\x76\x65
# "é D"
\xC3\xA9\xC2\xA0\xC2\x84|\xC3\xA9\x20\x44
# "é ["
\xC3\xA9\xC2\xA0\xC2\x9B|\xC3\xA9\x20\x5B
# "éar"
\xC3\xA9\xC2\xA1\xC2\xB2|\xC3\xA9\x61\x72
# "éba"
\xC3\xA9\xC2\xA2\xC2\xA1|\xC3\xA9\x62\x61
# "ébo"
\xC3\xA9\xC2\xA2\xC2\xAF|\xC3\xA9\x62\x6F
# "éco"
\xC3\xA9\xC2\xA3\xC2\xAF|\xC3\xA9\x63\x6F
# "éd "
\xC3\xA9\xC2\xA4\xC2\xA0|\xC3\xA9\x64\x20
# "ée,"
\xC3\xA9\xC2\xA5\xC2\xAC|\xC3\xA9\x65\x2C
# "éla"
\xC3\xA9\xC2\xAC\xC2\xA1|\xC3\xA9\x6C\x61
# "éle"
\xC3\xA9\xC2\xAC\xC2\xA5|\xC3\xA9\x6C\x65
# "éli"
\xC3\xA9\xC2\xAC\xC2\xA9|\xC3\xA9\x6C\x69
# "éme"
\xC3\xA9\xC2\xAD\xC2\xA5|\xC3\xA9\x6D\x65
# "é. "
\xC3\xA9\xC2\xAE\xC2\xA0|\xC3\xA9\x2E\x20
# "éon"
\xC3\xA9\xC2\xAF\xC2\xAE|\xC3\xA9\x6F\x6E
# "éot"
\xC3\xA9\xC2\xAF\xC2\xB4|\xC3\xA9\x6F\x74
# "ésy"
\xC3\xA9\xC2\xB3\xC2\xB9|\xC3\xA9\x73\x79
# "é|B"
\xC3\xA9\xC2\xBC\xC2\x82|\xC3\xA9\x7C\x42
# "é|G"
\xC3\xA9\xC2\xBC\xC2\x87|\xC3\xA9\x7C\x47
# "é|T"
\xC3\xA9\xC2\xBC\xC2\x94|\xC3\xA9\x7C\x54
# "ë S"
\xC3\xAB\xC2\xA0\xC2\x93|\xC3\xAB\x20\x53
# "ël "
\xC3\xAB\xC2\xAC\xC2\xA0|\xC3\xAB\x6C\x20
# "ía "
\xC3\xAD\xC2\xA1\xC2\xA0|\xC3\xAD\x61\x20
#
#
# 2) Characters in range [C3][B0-BF][C2][80-BF][C2][80-BF][C2][80-BF]
#
# "ña ["
\xC3\xB1\xC2\xA1\xC2\xA0\xC2\x9B|\xC3\xB1\x61\x20\x5B
# "ña) "
\xC3\xB1\xC2\xA1\xC2\xA9\xC2\xA0|\xC3\xB1\x61\x29\x20
# "ñarr"
\xC3\xB1\xC2\xA1\xC2\xB2\xC2\xB2|\xC3\xB1\x61\x72\x72
# "ña|C"
\xC3\xB1\xC2\xA1\xC2\xBC\xC2\x83|\xC3\xB1\x61\x7C\x43
# "ón b"
\xC3\xB3\xC2\xAE\xC2\xA0\xC2\xA2|\xC3\xB3\x6E\x20\x62
#
#
# 3) Characters represented with [EF][BF][BD] bytes - these fixups require
# more context bytes to be given in order to know the intended replacement
# character as the [EF][BF][BD] bytes can represent *any* character.
#
# These fixups are mostly required for "The Community Channel" listings which
# have mis-encoded apostrophe/quotation characters.
#
# "4'11"
\x34\xEF\xBF\xBD\x31\x31|\x34\x27\x31\x31
# "Alzheimer's"
\x41\x6C\x7A\x68\x65\x69\x6D\x65\x72\xEF\xBF\xBD\x73|\x41\x6C\x7A\x68\x65\x69\x6D\x65\x72\x27\x73
# "Luke's"
\x4C\x75\x6B\x65\xEF\xBF\xBD\x73|\x4C\x75\x6B\x65\x27\x73
# "isn't"
\x69\x73\x6E\xEF\xBF\xBD\x74|\x69\x73\x6E\x27\x74
# "country's"
\x63\x6F\x75\x6E\x74\x72\x79\xEF\xBF\xBD\x73|\x63\x6F\x75\x6E\x74\x72\x79\x27\x73
# "surfer's"
\x73\x75\x72\x66\x65\x72\xEF\xBF\xBD\x73|\x73\x75\x72\x66\x65\x72\x27\x73
# " "sustainable development" "
\xEF\xBF\xBD\x73\x75\x73\x74\x61\x69\x6E\x61\x62\x6C\x65\x20\x64\x65\x76\x65\x6C\x6F\x70\x6D\x65\x6E\x74\xEF\xBF\xBD|\x22\x73\x75\x73\x74\x61\x69\x6E\x61\x62\x6C\x65\x20\x64\x65\x76\x65\x6C\x6F\x70\x6D\x65\x6E\x74\x22
# "community's"
\x63\x6F\x6D\x6D\x75\x6E\x69\x74\x79\xEF\xBF\xBD\x73|\x63\x6F\x6D\x6D\x75\x6E\x69\x74\x79\x27\x73
# "Europe's"
\x45\x75\x72\x6F\x70\x65\xEF\xBF\xBD\x73|\x45\x75\x72\x6F\x70\x65\x27\x73
# " "Save the Forests of Jharkhand" "
\xEF\xBF\xBD\x53\x61\x76\x65\x20\x74\x68\x65\x20\x46\x6F\x72\x65\x73\x74\x73\x20\x6F\x66\x20\x4A\x68\x61\x72\x6B\x68\x61\x6E\x64\xEF\xBF\xBD|\x22\x53\x61\x76\x65\x20\x74\x68\x65\x20\x46\x6F\x72\x65\x73\x74\x73\x20\x6F\x66\x20\x4A\x68\x61\x72\x6B\x68\x61\x6E\x64\x22
# " "Torang" "
\xEF\xBF\xBD\x54\x6F\x72\x61\x6E\x67\xEF\xBF\xBD|\x22\x54\x6F\x72\x61\x6E\x67\x22
# " "Snapshot" "
\xEF\xBF\xBD\x53\x6E\x61\x70\x73\x68\x6F\x74\xEF\xBF\xBD|\x22\x53\x6E\x61\x70\x73\x68\x6F\x74\x22
# "Parkinson's"
\x50\x61\x72\x6B\x69\x6E\x73\x6F\x6E\xEF\xBF\xBD\x73|\x50\x61\x72\x6B\x69\x6E\x73\x6F\x6E\x27\x73
# "UK's"
\x55\x4B\xEF\xBF\xBD\x73|\x55\x4B\x27\x73
# " "Your Game" "
\xEF\xBF\xBD\x59\x6F\x75\x72\x20\x47\x61\x6D\x65\xEF\xBF\xBD|\x22\x59\x6F\x75\x72\x20\x47\x61\x6D\x65\x22
# "Naomi's"
\x4E\x61\x6F\x6D\x69\xEF\xBF\xBD\x73|\x4E\x61\x6F\x6D\x69\x27\x73
# "can't"
\x63\x61\x6E\xEF\xBF\xBD\x74|\x63\x61\x6E\x27\x74
# "doesn't"
\x64\x6F\x65\x73\x6E\xEF\xBF\xBD\x74|\x64\x6F\x65\x73\x6E\x27\x74
# " "unwritten" "
\xEF\xBF\xBD\x75\x6E\x77\x72\x69\x74\x74\x65\x6E\xEF\xBF\xBD|\x22\x75\x6E\x77\x72\x69\x74\x74\x65\x6E\x22
# "Women's"
\x57\x6F\x6D\x65\x6E\xEF\xBF\xBD\x73|\x57\x6F\x6D\x65\x6E\x27\x73
# " "Deafhood" "
\xEF\xBF\xBD\x44\x65\x61\x66\x68\x6F\x6F\x64\xEF\xBF\xBD|\x22\x44\x65\x61\x66\x68\x6F\x6F\x64\x22
# "Sam's"
\x53\x61\x6D\xEF\xBF\xBD\x73|\x53\x61\x6D\x27\x73
# "he's"
\x68\x65\xEF\xBF\xBD\x73|\x68\x65\x27\x73
# "countryside's"
\x63\x6F\x75\x6E\x74\x72\x79\x73\x69\x64\x65\xEF\xBF\xBD\x73|\x63\x6F\x75\x6E\x74\x72\x79\x73\x69\x64\x65\x27\x73
#
#
# 4) Characters represented with doubly-encoded [EF][BF][BD] bytes which are
# seen in the raw data as bytes [C3][AF][C2][BF][C2][BD] - these fixups require
# more context bytes to be given in order to know the intended replacement
# character as the [C3][AF][C2][BF][C2][BD] bytes can represent *any* character.
#
#
# "cliché"
\x63\x6C\x69\x63\x68\xC3\xAF\xC2\xBF\xC2\xBD|\x63\x6C\x69\x63\x68\xC3\xA9
# "Díaz"
\x44\xC3\xAF\xC2\xBF\xC2\xBD\x61\x7A|\x44\xC3\xAD\x61\x7A
# "Hallström"
\x48\x61\x6C\x6C\x73\x74\x72\xC3\xAF\xC2\xBF\xC2\xBD\x6D|\x48\x61\x6C\x6C\x73\x74\x72\xC3\xB6\x6D
# "Hofstätter"
\x48\x6F\x66\x73\x74\xC3\xAF\xC2\xBF\xC2\xBD\x74\x74\x65\x72|\x48\x6F\x66\x73\x74\xC3\xA4\x74\x74\x65\x72
# "tête-à-tête"
\x74\xC3\xAF\xC2\xBF\xC2\xBD\x74\x65\x2D\xC3\xAF\xC2\xBF\xC2\xBD\x2D\x74\xC3\xAF\xC2\xBF\xC2\xBD\x74\x65|\x74\xC3\xAA\x74\x65\x2D\xC3\xA0\x2D\x74\xC3\xAA\x74\x65
# "exposé"
\x65\x78\x70\x6F\x73\xC3\xAF\xC2\xBF\xC2\xBD|\x65\x78\x70\x6F\x73\xC3\xA9
# "Koundé"
\x4B\x6F\x75\x6E\x64\xC3\xAF\xC2\xBF\xC2\xBD|\x4B\x6F\x75\x6E\x64\xC3\xA9
# "Carré"
\x43\x61\x72\x72\xC3\xAF\xC2\xBF\xC2\xBD|\x43\x61\x72\x72\xC3\xA9
# "Fouchécourt"
\x46\x6F\x75\x63\x68\xC3\xAF\xC2\xBF\xC2\xBD\x63\x6F\x75\x72\x74|\x46\x6F\x75\x63\x68\xC3\xA9\x63\x6F\x75\x72\x74
# "Mélanie"
\x4D\xC3\xAF\xC2\xBF\xC2\xBD\x6C\x61\x6E\x69\x65|\x4D\xC3\xA9\x6C\x61\x6E\x69\x65
# "Déborah"
\x44\xC3\xAF\xC2\xBF\xC2\xBD\x62\x6F\x72\x61\x68|\x44\xC3\xA9\x62\x6F\x72\x61\x68
# "François"
\x46\x72\x61\x6E\xC3\xAF\xC2\xBF\xC2\xBD\x6F\x69\x73|\x46\x72\x61\x6E\xC3\xA7\x6F\x69\x73
# "Bonnaffé"
\x42\x6F\x6E\x6E\x61\x66\x66\xC3\xAF\xC2\xBF\xC2\xBD|\x42\x6F\x6E\x6E\x61\x66\x66\xC3\xA9
# "López"
\x4C\xC3\xAF\xC2\xBF\xC2\xBD\x70\x65\x7A|\x4C\xC3\xB3\x70\x65\x7A
# "Verdú"
\x56\x65\x72\x64\xC3\xAF\xC2\xBF\xC2\xBD|\x56\x65\x72\x64\xC3\xBA
# "Peña"
\x50\x65\xC3\xAF\xC2\xBF\xC2\xBD\x61|\x50\x65\xC3\xB1\x61
# "Jürgen"
\x4A\xC3\xAF\xC2\xBF\xC2\xBD\x72\x67\x65\x6E|\x4A\xC3\xBC\x72\x67\x65\x6E
# "Süskind"
\x53\xC3\xAF\xC2\xBF\xC2\xBD\x73\x6B\x69\x6E\x64|\x53\xC3\xBC\x73\x6B\x69\x6E\x64
# "mêlée"
\x6D\xC3\xAF\xC2\xBF\xC2\xBD\x6C\xC3\xAF\xC2\xBF\xC2\xBD\x65|\x6D\xC3\xAA\x6C\xC3\xA9\x65
# "Déjà"
\x44\xC3\xAF\xC2\xBF\xC2\xBD\x6A\xC3\xAF\xC2\xBF\xC2\xBD|\x44\xC3\xA9\x6A\xC3\xA0
# "François" (duplicate of the earlier "François" entry in this section)
\x46\x72\x61\x6E\xC3\xAF\xC2\xBF\xC2\xBD\x6F\x69\x73|\x46\x72\x61\x6E\xC3\xA7\x6F\x69\x73
# "José"
\x4A\x6F\x73\xC3\xAF\xC2\xBF\xC2\xBD|\x4A\x6F\x73\xC3\xA9
# "André"
\x41\x6E\x64\x72\xC3\xAF\xC2\xBF\xC2\xBD|\x41\x6E\x64\x72\xC3\xA9
# "Hélène"
\x48\xC3\xAF\xC2\xBF\xC2\xBD\x6C\xC3\xAF\xC2\xBF\xC2\xBD\x6E\x65|\x48\xC3\xA9\x6C\xC3\xA8\x6E\x65
# "Berléand"
\x42\x65\x72\x6C\xC3\xAF\xC2\xBF\xC2\xBD\x61\x6E\x64|\x42\x65\x72\x6C\xC3\xA9\x61\x6E\x64
# "Mühe"
\x4D\xC3\xAF\xC2\xBF\xC2\xBD\x68\x65|\x4D\xC3\xBC\x68\x65
# "Hübner"
\x48\xC3\xAF\xC2\xBF\xC2\xBD\x62\x6E\x65\x72|\x48\xC3\xBC\x62\x6E\x65\x72
# "vérité"
\x76\x65\x72\x69\x74\xC3\xAF\xC2\xBF\xC2\xBD|\x76\xC3\xA9\x72\x69\x74\xC3\xA9
# "vérité"
\x76\xC3\xAF\xC2\xBF\xC2\xBD\x72\x69\x74\xC3\xAF\xC2\xBF\xC2\xBD|\x76\xC3\xA9\x72\x69\x74\xC3\xA9
# "Cuarón"
\x43\x75\x61\x72\xC3\xAF\xC2\xBF\xC2\xBD\x6E|\x43\x75\x61\x72\xC3\xB3\x6E
# "Mamá También"
\x4D\x61\x6D\xC3\xAF\xC2\xBF\xC2\xBD\x20\x54\x61\x6D\x62\x69\xC3\xAF\xC2\xBF\xC2\xBD\x6E|\x4D\x61\x6D\xC3\xA1\x20\x54\x61\x6D\x62\x69\xC3\xA9\x6E
# "Bankolé"
\x42\x61\x6E\x6B\x6F\x6C\xC3\xAF\xC2\xBF\xC2\xBD|\x42\x61\x6E\x6B\x6F\x6C\xC3\xA9
# "Echevarría"
\x45\x63\x68\x65\x76\x61\x72\x72\xC3\xAF\xC2\xBF\xC2\xBD\x61|\x45\x63\x68\x65\x76\x61\x72\x72\xC3\xAD\x61
# "fiancé"
\x66\x69\x61\x6E\x63\xC3\xAF\xC2\xBF\xC2\xBD|\x66\x69\x61\x6E\x63\xC3\xA9
