#!/usr/bin/perl -w

=pod

=head1 NAME

tv_grab_sn - Grab TV listings for Sweden or Norway.

=head1 SYNOPSIS

tv_grab_sn --help

tv_grab_sn [--config-file FILE] --configure [--loc] [--list-channels]

tv_grab_sn [--config-file FILE] [--output FILE] [--days N]
           [--offset N] [--quiet]

=head1 DESCRIPTION

Output TV listings for several channels available in Sweden or Norway.
The data comes from www.dagenstv.com which is owned by the largest
company providing TV listings in Sweden.  The grabber relies on
parsing HTML so it might stop working at any time.

First run B<tv_grab_sn --configure> to choose whether you want Swedish
or Norwegian listings, and which channels you want to download.  Then
running B<tv_grab_sn> with no arguments will output listings in XML
format to standard output.

B<--configure> Prompt for Sweden or Norway, prompt for which channels,
and write the configuration file.

B<--config-file FILE> Set the name of the configuration file, the
default is B<~/.xmltv/tv_grab_sn.conf>.  This is the file written by
B<--configure> and read when grabbing.

B<--output FILE> write to FILE rather than standard output.

B<--days N> grab N days.  The default is ten.

B<--offset N> start N days in the future.  The default is to start
from today.

B<--quiet> suppress the progress messages normally written to standard
error.

=head1 SEE ALSO

L<xmltv(5)>.

=head1 AUTHOR

Originally written by Stefan G:orling, stefan@gorling.se. Maintained
by Staffan Malmgren, staffan@tomtebo.org

=head1 BUGS

The data source does not include full channels information and the
channels are identified by short names rather than the RFC2838 form
recommended by the XMLTV DTD.

=cut

use strict;
use XMLTV::Version '$Id: tv_grab_sn,v 1.45 2003/08/28 20:56:58 epaepa Exp $ ';
use Getopt::Long;
use HTML::TableExtract; # parse html better than regular expressions.
use Date::Manip;
use IO::File;

use XMLTV;
use XMLTV::Memoize;
use XMLTV::Ask;
use XMLTV::TZ qw(parse_local_date);
use XMLTV::Europe_TZ qw(utc_offset);
use XMLTV::Config_file;
use XMLTV::Get_nice;
use XMLTV::Mode;

# Todo: perhaps we should internationalize messages and docs?
use XMLTV::Usage <<END
$0: get Swedish or Norwegian television listings in XMLTV format
To configure: $0 --configure [--config-file FILE]
To grab listings: $0 [--config-file FILE] [--output FILE] [--days N]
        [--offset N] [--quiet]
END
  ;

# Use Term::ProgressBar if installed.
use constant Have_bar => eval { require Term::ProgressBar; 1 };

# Use Log::TraceMessages if installed.
BEGIN {
    eval { require Log::TraceMessages };
    if ($@) {
	*t = sub {};
	*d = sub { '' };
    }
    else {
	*t = \&Log::TraceMessages::t;
	*d = \&Log::TraceMessages::d;
	Log::TraceMessages::check_argv();
    }
}

# Memoize some date parsing routines, if possible.  FIXME move to
# XMLTV::Memoize.
#
eval { require Memoize };
unless ($@) {
    foreach (qw(nextday utc_offset ParseDate UnixDate dc Date_Cmp)) {
	Memoize::memoize($_) or warn "cannot memoize $_";
    }
}

sub xhead( $ );
sub configure();
sub get_channels( $ );
sub tidy( $ );
sub nextday( $ );
sub extract_channel_data( $$ );
sub process_table( $$$$ );
sub decode_char( $ );
sub dc( $$ );

my %COUNTRIES = (Sweden => 'se', Norway => 'no');

# Whether zero-length programmes should be included in the output.
my $WRITE_ZERO_LENGTH = 0;

# The base timezone ("winter time") for Sweden and Norway
my $BASE_TZ = "CET";

# For an unknown reason, the site includes '&#xxxx' character
# escapes even though the page is Latin-1 and so should not
# have chars outside the 8-bit range.  Try to make them into
# something legal.
#
# This is partly based on web searches for examples of Spanish
# words with the missing letter filled in, so it may not be
# accurate.
my %char_map
  = (8217 => 'i',
     8212 => chr(242),		# LATIN SMALL LETTER O WITH GRAVE
     8211 => chr(241),		# LATIN SMALL LETTER N WITH TILDE
     8240 => chr(226),		# LATIN SMALL LETTER A WITH CIRCUMFLEX
     8226 => chr(239),		# LATIN SMALL LETTER I WITH DIAERESIS
     8218 => 'i',
     2014 => '--',
     382  => chr(251),		# LATIN SMALL LETTER U WITH CIRCUMFLEX
     353  => chr(220),          # LATIN CAPITAL LETTER U WITH DIAERESIS
     339  => 'u',
    );

# Get options, including undocumented --cache option.
XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux');
my ($opt_days, $opt_offset, $opt_help, $opt_output,
    $opt_list_channels, $opt_loc,
    $opt_configure, $opt_config_file, $opt_quiet);
$opt_days  = 10; # default
$opt_offset = 0; # default
$opt_quiet  = 0; # default
GetOptions('days=i'        => \$opt_days,
	   'offset=i'      => \$opt_offset,
	   'help'          => \$opt_help,
	   'configure'     => \$opt_configure,
	   'config-file=s' => \$opt_config_file,
	   'output=s'      => \$opt_output,
	   'quiet'         => \$opt_quiet,
	   'loc=s'         => \$opt_loc,
	   'list-channels' => \$opt_list_channels,
	  )
  or usage(0);
die 'number of days must not be negative'
  if (defined $opt_days && $opt_days < 0);
usage(1) if $opt_help;

my $mode = XMLTV::Mode::mode('grab', # default
			     $opt_configure => 'configure',
			     $opt_list_channels => 'list-channels',
			    );

# NB parse_local_date() before Date_Init().
my $now = dc(parse_local_date('now'), "$opt_offset days");
Date_Init('TZ=UTC');

# File that stores which channels to download.
my $config_file
  = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_sn', $opt_quiet);

my @config_lines; # used only in mode 'grab'
if ($mode eq 'configure') {
    configure();
    exit();
}
elsif ($mode eq 'grab') {
    @config_lines = XMLTV::Config_file::read_lines($config_file);
}
elsif ($mode eq 'list-channels') {
    # Config file not used.
}
else { die }

# Not configuring, must be writing output of some kind.
die if $mode ne 'grab' and $mode ne 'list-channels';
my %w_args;
if (defined $opt_output) {
    my $fh = new IO::File(">$opt_output");
    die "cannot write to $opt_output: $!" if not defined $fh;
    $w_args{OUTPUT} = $fh;
}
$w_args{encoding} = 'ISO-8859-1';
my $writer = new XMLTV::Writer(%w_args);

if ($mode eq 'list-channels') {
    if (not defined $opt_loc) {
	my $msg = "--loc option required with --list-channels:\n";
	foreach (sort keys %COUNTRIES) {
	    $msg .= "    --loc $COUNTRIES{$_} for $_\n";
	}
	die $msg;
    }
    $writer->start(xhead($opt_loc));
    foreach my $chan (get_channels($opt_loc)) {
	$writer->write_channel($chan);
    }
    $writer->end();
    exit();
}

# Not configuring or listing channels, must be grabbing.
die if $mode ne 'grab';
my ($country, %channels, $ch_did, $ch_name);
my $line_num = 0;
foreach (@config_lines) {
    ++ $line_num;
    next if not defined;
    my $where = "$config_file:$line_num";
    if (/^country:?\s+(\w\w)$/) {
	warn "$where: already seen country\n"
	  if defined $country;
	$country = $1;
    }
    elsif (/^channel:?\s+(\d+)\s+([^\#]+)/) {
	$ch_did = $1;
	$ch_name = $2; 
	$ch_name =~ s/\s*$//;
	$channels{$ch_did} = $ch_name;
    }
    else {
	warn "$where: bad line\n";
    }
}
die "No channels specified, run me with --configure\n"
  if not %channels;

# TODO: standardize these things between grabbers.
$writer->start(xhead($country));

my @to_get;

# the order in which we fetch the channels do not matter
foreach my $ch_did (keys %channels) {
    my $ch_name = $channels{$ch_did};
    my $ch_xid = "$ch_did.dagenstv.com";
    $writer->write_channel({ id => $ch_xid,
			     'display-name' => [ [ $ch_name ] ] });
    my $day=$now;
    for (my $i = 0; $i < $opt_days; $i++) {
	#for each day
	if ($i > 0) {
	    $day = dc($day, '+ 1 day');
	}
	push @to_get, [ $day, $ch_xid, $ch_did ];
    }
}

# This progress bar is for both downloading and parsing.  Maybe
# they could be separate, as with tv_grab_uk.
#
my $bar = new Term::ProgressBar('getting listings', scalar @to_get)
  if Have_bar && not $opt_quiet;
foreach (@to_get) {
    my ($day, $ch_xid, $ch_did) = @$_;
    process_table($writer, $day, $ch_xid, $ch_did);
    update $bar if Have_bar && not $opt_quiet;
}
$writer->end();
exit();

sub xhead( $ ) {
    my $country = shift;
  return { 'source-info-url'     => 'http://www.dagenstv.com/',
	   'source-data-url'     => "http://www.dagenstv.com/$country/frontpage",
	   'generator-info-name' => 'XMLTV',
	   'generator-info-url'  => 'http://membled.com/work/apps/xmltv/',
	 };
}

sub configure() {
    XMLTV::Config_file::check_no_overwrite($config_file);

    # FIXME commonize this configuration stuff within the XMLTV project.
    open(CONF, ">$config_file") or die "cannot write to $config_file: $!";
    my $default_cn = 'Sweden';
    my $cn = askQuestion('Grab listings for which country?',
			 $default_cn,
			 sort keys %COUNTRIES);
    my $c = $COUNTRIES{$cn}; die if not defined $c;
    print CONF "country $c\t# $cn\n";

    # Ask about each channel.
    my @chs = get_channels($c);
    my @names = map { $_->{'display-name'}->[0]->[0] } @chs;
    my @qs = map { "add channel $_?" } @names;
    my @want = askManyBooleanQuestions(1, @qs);
    foreach (@chs) {
	my $w = shift @want;
	warn("cannot read input, stopping channel questions"), last
	  if not defined $w;
	# No need to print to user - XMLTV::Ask is verbose enough.

	# Print a config line, but comment it out if channel not wanted.
	print CONF '#' if not $w;
	my $name = shift @names;
	print CONF "channel $_->{id} $name\n";
	# TODO don't store display-name in config file.
    }

    close CONF or warn "cannot close $config_file: $!";
    say("Finished configuration.");
}

# get the channels for a country
sub get_channels($) {
    my $c = shift;
    my @r;
    my $bar = new Term::ProgressBar('getting list of channels', 1)
      if Have_bar && not $opt_quiet;
    my $url="http://www.dagenstv.com/$c/frontpage";
    my $data = get_nice($url);
    die "could not get channel listing $url, aborting\n"
      if not defined $data;
    update $bar if Have_bar && not $opt_quiet;
    return extract_channel_data($data, $c);
}

# Given a page of HTML with a 'cha' dropdown, return a list of channel
# hashes.
#
# (the front page has two select dropdowns, one for channels and one
# for categories. If you look at only the <option> tag, there's no way
# to tell them apart. we have to cut out just the relevant <select>
# element.)
#
# Parameters:
#   string of HTML
#   language of the channel names
#
sub extract_channel_data( $$ ) {
    my ($data, $lang) = @_;
    my @r;
    $data =~ /<select name="cha"(.*?)<\/select>/s
	  or die "didn't see <select name=\"cha\"... in HTML";
    $_ = $1;
    while (/<option\s*value=\"(\d+)\"\s*>([^>]+)<\/option>/sg) {
        my $ch_name=tidy($2);
	my $ch_did=$1;
	my $ch = { 'display-name' => [ [ $ch_name, $lang ] ],
		   'id' => $ch_did };
	push @r, $ch;
    }
    return @r;
}

#
#Arguments:
#    XMLTV::Writer object to write to
#    Date::Manip object giving the day to grab
#    xmltv id of channel
#    dagenstv id of channel
sub process_table( $$$$ ) {
    my ($w, $date, $ch_xmltv_id, $ch_dagenstv_id) = @_;
    t 'process_table() ENTRY';
    t '$w=' . d $w;
    t '$date=' . d $date;
    t '$ch_xmltv_id=' . d $ch_xmltv_id;
    t '$ch_dagenstv_id=' . d $ch_dagenstv_id;

    my $today = UnixDate($date, '%Y-%m-%d');
    my $url = "http://www.dagenstv.com/$country/chart/?dat=$today&cha=$ch_dagenstv_id";
    t "getting URL: $url";
    my $data=get_nice($url);
    if (not defined $data) {
	warn "could not fetch $url, skipping this channel\n";
	return;
    }
    t 'got data: ' . d $data;
    local $SIG{__WARN__} = sub {
	warn "$url: $_[0]";
    };

    # Previous versions used HTML::TableExtract to find tables at a
    # fixed position in the page.  But since the listings for Sweden
    # and Norway now differ in layout, and allowing for future layout
    # changes, it's easier to just look at all the tables and find the
    # one which contains listings.
    #
    t 'creating HTML::TableExtract object';
    my $te = new HTML::TableExtract(keep_html => 1);
    $te->parse($data);
    my ($best_ts, $best_num_matches);
    foreach my $ts ($te->table_states()) {
	t 'found table: ' . d $ts;
	my $num_matches = 0;
	t 'set $num_matches=0';
	foreach my $r ($ts->rows()) {
	    t 'testing row: ' . d $r;
	    foreach (@$r) {
		t 'testing field: ' . d $_;
		if (/^\d\d:\d\d$/) {
		    t 'matches';
		    ++ $num_matches;
		}
	    }
	}
	if (not defined $best_ts or $num_matches > $best_num_matches) {
	    $best_ts = $ts;
	    $best_num_matches = $num_matches;
	}
    }
    if (not defined $best_ts) {
	warn "no tables found in page (strange), skipping";
	return;
    }
    if ($best_num_matches == 0) {
	warn "did not find any table containing listings, skipping";
	return;
    }

    $today =~ tr/-//d;
    my %program = (channel => $ch_xmltv_id);
    foreach my $row ($best_ts->rows()) {
	t 'doing row of table: ' . d $row;

	# Get the interesting fields of this row.
	my ($first, $third) = (tidy($row->[0]), tidy($row->[2]));
	t 'interesting fields: ' . d $first;
	t '...and: ' . d $third;

	if ($first=~m/(\d\d:\d\d)/) {
	    if (defined $program{title} && not defined $program{stop}) {
		#We have a record without endtime. use next starttime as endtime
		$program{stop} = utc_offset("$today $1", $BASE_TZ);
		my $cmp = Date_Cmp($program{start}, $program{stop});
		if ($cmp > 0) {
		    $today = nextday($today);
		    $program{stop} = utc_offset("$today $1", $BASE_TZ);
		}

		if ($WRITE_ZERO_LENGTH or $cmp) {
		    $w->write_programme(\%program);
		}
		%program = (channel => $ch_xmltv_id);
	    }

	    $program{start} = utc_offset("$today $1", $BASE_TZ);
	    
	    if (defined $third) {
		my $title_class = 'charteventname';
		my %classes = ($title_class => sub {
				   push @{$program{title}}, [ $_ ];
			       },
			       chartshowview => sub {
				   if (/^\[(\d+)\]$/) {
				       $program{showview} = $1;
				   }
				   else {
				       warn "bad showview number '$_'\n";
				   }
			       },
			       chartdescription => sub {
				   push @{$program{desc}}, [ $_ ];
			       },
			      );
		while ($third=~s/<span id=\"\d+\" class=\"([^>]+?)\">\s*([^<]+)\s*//) {
		    my ($class, $content) = ($1, $2);
		    my $handler = $classes{$class};
		    if (not defined $handler) {
			warn "unknown class $class in HTML\n";
			next;
		    }
		    for ($content) { $handler->() }
		}

		if ($third=~m/<\/span>\s*\((\d\d:\d\d)\)\s*</s) {
		    if (not defined $program{title}) {
			warn "saw no title ('$title_class') for program, discarding\n";
		    }
		    else {
			$program{stop}=utc_offset("$today $1", $BASE_TZ);
			my $cmp = Date_Cmp($program{start}, $program{stop});
			if ($cmp > 0) {
			    $today = nextday($today);
			    $program{stop} = utc_offset("$today $1", $BASE_TZ);
			}
			
			if ($WRITE_ZERO_LENGTH or $cmp) {
			    $w->write_programme(\%program);
			}
		    }
		    %program = (channel => $ch_xmltv_id);
		}
	    }			# if we have a time row.
	}
    }
}


# Trim spaces and newlines, fix rogue characters.
my %warned_char;
# I hope that these &#xxx; entities are decimal not octal.
sub decode_char( $ ) {
    for (shift) {
	if ($_ < 256) {
	    # It might not be a legal Latin-1 value but we deal with
	    # those afterwards.
	    #
	    return chr($_);
	}
	elsif ($_ >= 256 and $_ < 65536) {
	    # 16-bit character, really shouldn't occur.  We can
	    # correct some of these.
	    #
	    my $to = $char_map{$_};
	    if (not defined $to) {
		warn "stripping unknown 16-bit character $_ in HTML\n"
		  unless $warned_char{$_}++;
		return '';
	    }
	    return $to;
	}
	else {
	    warn "stripping very strange character $_ in HTML\n"
	      unless $warned_char{$_}++;
	    return '';
	}
    }
}
# Remove weird characters and entities in the HTML.  Some of these
# come from Per Wigren's dagenstv2xmltv grabber, but most are just
# observed from what the site returns.
#
# Could use HTML::Entites for at least some of these things.
sub tidy($) {
    for (my $tmp=shift) {
	return undef if not defined;
	s/\n$//sg;
	s/\s*$//sg;
	s/^\s*//sg;

	# Deal with &#xxx; entities, including those which specify
	# illegal characters.
	s/&\#(\d{3,});/decode_char($1)/eg;
	s/&\#[xX](\d{3,});/decode_char(oct("0x$1"))/eg;

	# We've reduced the string to 8-bit chars, but some of them
	# are not legal Latin-1.
	s/([\000-\037])//g; # remove control characters
	tr/\220/\352/;      # LATIN SMALL LETTER E WITH CIRCUMFLEX
	tr/\235/\371/;      # LATIN SMALL LETTER U WITH GRAVE
	tr/\x86/?/;         # no idea, FIXME
	tr/\217/e/;
	tr/\200-\237//d && warn "removing illegal high-bit-set chars\n";

	s/&[rl]squo;/chr(39)/eg; # ' character
	s/&[rl]dquo;/chr(22)/eg; # " character
	s/&mdash;/--/g;

	# dagenstv2xmltv does &[aeiou](grave|acute|circ|tilde|uml|ring);
	# entities, but I have never seen them in the site's output.

	# It's not necessary to change the copyright sign 169, because
	# that is a legal character in the Latin-1 output.

	s/&lt;/</g;
	s/&gt;/>/g;
	s/&amp;/&/g;

	return $_;
    }
}

# Bump a YYYYMMDD date by one.
sub nextday( $ ) {
    my $d = shift; $d =~ /^\d{8}$/ or die;
    my $p = ParseDate($d);
    my $n = dc($p, '+ 1 day');
    return UnixDate($n, '%Q');
}

# Wrapper for DateCalc().
sub dc( $$ ) {
    my $err;
    my $r = DateCalc(@_, \$err);
    die "DateCalc() failed with $err" if $err;
    die 'DateCalc() returned undef' if not defined $r;
    return $r;
}
