#!/usr/bin/perl -w

=pod

=head1 NAME

tv_grab_nl - Grab TV listings for Holland.

=head1 SYNOPSIS

tv_grab_nl --help

tv_grab_nl [--config-file FILE] --configure

tv_grab_nl [--config-file FILE] [--output FILE] [--days N]
           [--offset N] [--quiet] [--slow]

=head1 DESCRIPTION

Output TV listings for several channels available in Holland.
The data comes from www.tvgids.nl. The grabber relies on
parsing HTML so it might stop working at any time.

First run B<tv_grab_nl --configure> to choose, which channels you want
to download. Then running B<tv_grab_nl> with no arguments will output
listings in XML format to standard output.

B<--configure> Prompt for which channels,
and write the configuration file.

B<--config-file FILE> Set the name of the configuration file, the
default is B<~/.xmltv/tv_grab_nl.conf>.  This is the file written by
B<--configure> and read when grabbing.

B<--output FILE> write to FILE rather than standard output.

B<--days N> grab N days.  The default is one week.

B<--offset N> start N days in the future.  The default is to start
from today.

B<--slow> Fetch full programme details from the site.  This gives
richer output but involves many more page fetches.

B<--quiet> suppress the progress messages normally written to standard
error.

=head1 SEE ALSO

L<xmltv(5)>.

=head1 AUTHOR

Guido Diepen and Ed Avis (ed@membled.com).  Originally based on
tv_grab_fi by Matti Airas.

=head1 BUGS

The data source does not include full channels information and the
channels are identified by short names rather than the RFC2838 form
recommended by the XMLTV DTD.

The site has many more channels than it lists in the channel selector,
so the configure stage is not able to offer a complete choice of
channels.  You can however get extra ones by hand-editing the config
file.  IE<39>m still hoping to find a web page that lists all the
channels the site has.

=cut

######################################################################
# initializations

use strict;
use XMLTV::Version '$Id: tv_grab_nl,v 1.50 2003/09/14 17:43:02 epaepa Exp $ ';
use Getopt::Long;
use Data::Dumper;
use HTML::TreeBuilder;
use HTML::Entities; # parse entities
use IO::File;
use URI;
use Date::Manip;
use XMLTV;
use XMLTV::Memoize;
use XMLTV::Ask;
use XMLTV::Config_file;
use XMLTV::Europe_TZ;
use XMLTV::Get_nice;
use XMLTV::Mode;
# Todo: perhaps we should internationalize messages and docs?
use XMLTV::Usage <<END
$0: get Dutch television listings in XMLTV format
To configure: $0 --configure [--config-file FILE]
To grab listings: $0 [--config-file FILE] [--output FILE] [--days N]
        [--offset N] [--quiet] [--slow]
END
  ;

# Use Log::TraceMessages if installed.
BEGIN {
    eval { require Log::TraceMessages };
    if ($@) {
	*t = sub {};
	*d = sub { '' };
    }
    else {
	*t = \&Log::TraceMessages::t;
	*d = \&Log::TraceMessages::d;
	Log::TraceMessages::check_argv();
    }
}

# Use Term::ProgressBar if installed.
use constant Have_bar => eval { require Term::ProgressBar; 1 };

# Function prototypes.
sub time_tot_str( $ );
sub time_van_str( $ );
sub get_channels();
sub process_summary_page( $$$ );
sub parse_dutch_date( $ );
sub process_details_page( $$$ );

# Whether zero-length programmes should be included in the output.
my $WRITE_ZERO_LENGTH = 0;

# Base timezone for the Netherlands.  Summer time is one hour ahead of
# this.
#
my $TZ = '+0100';

# default language
my $LANG = 'nl';

######################################################################
# get options

# Get options, including undocumented --cache option.
XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux');
my ($opt_days, $opt_offset, $opt_help, $opt_output,
    $opt_configure, $opt_config_file, $opt_quiet,
    $opt_list_channels, $opt_slow);
$opt_days   = 7; # default
$opt_offset = 0; # default
GetOptions('days=i'        => \$opt_days,
	   'offset=i'      => \$opt_offset,
	   'help'          => \$opt_help,
	   'configure'     => \$opt_configure,
	   'config-file=s' => \$opt_config_file,
	   'output=s'      => \$opt_output,
	   'quiet'         => \$opt_quiet,
	   'slow'	   => \$opt_slow,
	   'list-channels' => \$opt_list_channels,
	  )
  or usage(0);
die 'number of days must not be negative'
  if (defined $opt_days && $opt_days < 0);
usage(1) if $opt_help;

my $mode = XMLTV::Mode::mode('grab', # default
			     $opt_configure => 'configure',
			     $opt_list_channels => 'list-channels',
			    );

# File that stores which channels to download.
my $config_file
  = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_nl', $opt_quiet);

if ($mode eq 'configure') {
    XMLTV::Config_file::check_no_overwrite($config_file);
    open(CONF, ">$config_file") or die "cannot write to $config_file: $!";
    # find list of available channels
    my $bar = new Term::ProgressBar('getting list of channels', 1)
      if Have_bar && not $opt_quiet;
    my %channels = get_channels();
    die 'no channels could be found' if (scalar(keys(%channels)) == 0);
    update $bar if Have_bar && not $opt_quiet;

    # Ask about each channel.
    my @chs = sort keys %channels;
    my @names = map { $channels{$_} } @chs;
    my @qs = map { "add channel $_?" } @names;
    my @want = askManyBooleanQuestions(1, @qs);
    foreach (@chs) {
	my $w = shift @want;
	warn("cannot read input, stopping channel questions"), last
	  if not defined $w;
	# No need to print to user - XMLTV::Ask is verbose enough.

	# Print a config line, but comment it out if channel not wanted.
	print CONF '#' if not $w;
	my $name = shift @names;
        print CONF "channel $_ $name\n";
	# TODO don't store display-name in config file.
    }

    close CONF or warn "cannot close $config_file: $!";
    say("Finished configuration.");

    exit();
}

# Not configuring, we will need to write some output.
die if $mode ne 'grab' and $mode ne 'list-channels';

# But if grabbing, check the config file is sane before we write
# anything.
#
my @config_lines;
if ($mode eq 'grab') {
    @config_lines = XMLTV::Config_file::read_lines($config_file);
}

my %w_args;
if (defined $opt_output) {
    my $fh = new IO::File(">$opt_output");
    die "cannot write to $opt_output: $!" if not defined $fh;
    $w_args{OUTPUT} = $fh;
}
$w_args{encoding} = 'ISO-8859-1';
my $writer = new XMLTV::Writer(%w_args);
# TODO: standardize these things between grabbers.
$writer->start
  ({ 'source-info-url'     => 'http://www.tvgids.nl/',
     'source-data-url'     => 'http://www.tvgids.nl/',
     'generator-info-name' => 'XMLTV',
     'generator-info-url'  => 'http://membled.com/work/apps/xmltv/',
   });

if ($mode eq 'list-channels') {
    my $bar = new Term::ProgressBar('getting list of channels', 1)
      if Have_bar && not $opt_quiet;
    my %channels = get_channels();
    die 'no channels could be found' if (scalar(keys(%channels)) == 0);
    update $bar if Have_bar && not $opt_quiet;

    foreach my $ch_did (sort(keys %channels)) {
	my $ch_name = $channels{$ch_did};
	my $ch_xid = "$ch_did.tvgids.nl";
	$writer->write_channel({ id => $ch_xid,
				 'display-name' => [ [ $ch_name ] ] });
    }
    $writer->end();
    exit();
}

# Not configuring or writing channels, must be grabbing listings.
die if $mode ne 'grab';
my (%channels, @channels, $ch_did, $ch_name);
my $line_num = 0;
foreach (@config_lines) {
    ++ $line_num;
    next if not defined;

    # FIXME channel data should be read from the site, and then the
    # config file only gives the XMLTV ids that are interesting.
    #
    if (/^channel:?\s+(\S+)\s+(.+)/) {
	$ch_did = $1;
	$ch_name = $2;
	$ch_name =~ s/\s*$//;
	push @channels, $ch_did;
	$channels{$ch_did} = $ch_name;
    }
    else {
	warn "$config_file:$line_num: bad line\n";
    }
}

######################################################################
# begin main program

# $opt_offset is taken into account later, we don't need to lie about
# $now.  This does make it impossible to use --offset together with
# --cache to reuse an old cache file from a few days ago - to do that
# you need to change $now below.  But --cache is undocumented so I
# don't consider this a problem.
#
my $now = ParseDate('now');
die if not defined $now;
# Any Date_Init('TZ=UTC') would go here.  But it may not be needed
# with parse_eur_date().
#

my @to_get;

# the order in which we fetch the channels matters
foreach $ch_did (@channels) {
    $ch_name = $channels{$ch_did};
    my $ch_xid = "$ch_did.tvgids.nl";
    $writer->write_channel({ id => $ch_xid,
			     'display-name' => [ [ $ch_name ] ] });
    for (my $i = $opt_offset;$i<($opt_offset + $opt_days);$i++) {
	my $url = 'http://www.tvgids.nl/zoekprogramma.php'
	  . "?station=$ch_did&interval=$i";
	my $day = UnixDate(DateCalc($now, "+ $i days"), '%Y-%m-%d');
	die if not defined $day;
	die if ref $url;
	push @to_get, [ $url, $day, $ch_xid, $ch_did ];
    }
}

my %warned_ch_name; # suppress duplicate warnings

my @summary_page_data;
my $bar = new Term::ProgressBar('downloading summary', scalar @to_get)
  if Have_bar && not $opt_quiet;
foreach (@to_get) {
    my ($url, $day, $ch_xmltv_id, $ch_tvgids_id) = @$_;
    die if ref $url;
    push @summary_page_data,
      [ $url, $ch_xmltv_id, [ process_summary_page($url, $day, $day) ] ];
    update $bar if Have_bar and not $opt_quiet;
}

my @summary_programmes;
my %detail_url_to_summary_url;
foreach (@summary_page_data) {
    my ($summary_url, $ch_xmltv_id, $data) = @$_;
    
    # Try to do something with the programmes that have no start
    # time.  Put them into a clump with the preceding valid
    # programme.
    #
    my $clump;			# [ start, stop, programmes ].  stop may be undef.
    my @clumps;
    foreach (@$data) {
	my ($start, $stop, $title, $url) = @$_;
	die if ref $url;
	for ($detail_url_to_summary_url{$url}) {
	    warn "more than one programme with same details page $url"
	      if defined;
	    $_ = $summary_url;
	}
	
	# Start and stop are common to a whole clump, but these
	# two are given individually for each programme.
	#
	my $details = [ $title, $url ];
	
	if (not defined $start) {
	    if (not $clump) {
		warn "programme '$title' at beginning of page has no start time, dropping\n";
		next;
	    }
	    t 'found programme with no start time';
	    
	    if (defined $clump->[1]) {
		t 'make it start at stop of last clump';
		die if not defined $clump->[0];
		push @clumps, $clump;
		$clump = [ $clump->[1], $stop, [ $details ] ];
		die if not defined $clump->[0];
	    }
	    else {
		t 'current clump has no stop, add to clump';
		push @{$clump->[2]}, $details;
		t 'maybe set stop of current clump';
		$clump->[1] = $stop;
	    }
	}
	else {
	    t 'programme has start time, make new clump';
	    if ($clump) {
		die if not defined $clump->[0];
		push @clumps, $clump;
	    }
	    $clump = [ $start, $stop, [ $details ] ];
	    t 'started a new clump, now: ' . d $clump;
	    die if not defined $clump->[0];
	}
    }
    if ($clump) {
	die if not defined $clump->[0];
	push @clumps, $clump;
    }
    t '\@clumps=' . d \@clumps;
    
    # Now add the clumpidx attributes.
    foreach (@clumps) {
	my ($start, $stop, $l) = @$_;
	die if not defined $start;
	my $num_in_clump = @$l;
	if ($num_in_clump == 1) {
	    # Common case, no clumpidx needed.
	}
	elsif ($num_in_clump > 1) {
	    foreach my $i (0 .. $num_in_clump - 1) {
		# Add clumpidx as last thing in list.
		push @{$l->[$i]}, "$i/$num_in_clump";
	    }
	}
	else { die }
    }
    t 'after adding clumpidxes, \@clumps=' . d \@clumps;
    
    # Finally turn the data into programmes.
    foreach (@clumps) {
	my ($start, $stop, $l) = @$_;
	die if not defined $start;
	foreach (@$l) {
	    my ($title, $url, $clumpidx) = @$_;
	    my %h = (channel => $ch_xmltv_id,
		     title => [ [ $title, $LANG ] ],
		    );
	    for (date_to_eur($start, $TZ)) {
		$h{start} = UnixDate($_->[0], '%q') . " $_->[1]";
	    }
	    if (defined $stop) {
		for (date_to_eur($stop, $TZ)) {
		    $h{stop} = UnixDate($_->[0], '%q') . " $_->[1]";
		}
	    }
	    
	    if (defined $url) {
		die if ref $url;
		$h{url} = [ $url ];
	    }
	    $h{clumpidx} = $clumpidx if defined $clumpidx;
	    push @summary_programmes, \%h;
	}
    }
}

my @to_write;
if ($opt_slow) {
    $bar = new Term::ProgressBar('getting details', scalar @summary_programmes)
      if Have_bar && not $opt_quiet;
    foreach my $s (@summary_programmes) {
	my $urls = delete $s->{url};
	if (not defined $urls) {
	    push @to_write, $s;
	    update $bar if $bar;
	    next;
	}
	die if not @$urls;
	warn "strange, more than one URL for programme, picking first"
	  if @$urls > 1;
	my $url = $urls->[0];
	die if not defined $url;
	$url = "$url"; die if ref $url;
	my $summary_url = $detail_url_to_summary_url{$url};
	die if not defined $summary_url;
	die if ref $summary_url;
	my $ch_xmltv_id = $s->{channel};
	die if not defined $ch_xmltv_id;
	die if ref $ch_xmltv_id;
	my $detailed
	  = process_details_page($ch_xmltv_id, $url, $summary_url);
	if (not $detailed) {
	    warn "skipped details page $url\n";

	    # We still have the summary though, we can use that at least.
	    push @to_write, $s;
	    update $bar;
	    next;
	}
	if ($detailed eq 'END') {
	    # Apparently this means nothing more for this day.  But
	    # we're not processing according to days so there is no
	    # easy way to skip the right number of following
	    # programmes.  In any case, we want the programme output
	    # to be the same as in fast mode, so we more or less
	    # ignore this indication from process_details_page().
	    #
	    push @to_write, $s;
	    update $bar;
	    next;
	}

	# Could check 'van' and 'tot' times against those in the
	# original page.
	#
	delete $detailed->{van}; delete $detailed->{tot};

	# Pluck any values from $s that are also in $detailed, and
	# check they match.
	#
	foreach (sort keys %$detailed) {
	    my $old = delete $s->{$_};
	    next if not defined $old;
	    my $new = $detailed->{$_};

	    if ($_ eq 'title') {
		# We know how to merge this.  TODO write general
		# XMLTV::Merge. 
		#
		my %already;
		foreach my $a (@$new) {
		    my $d = Dumper($a);
		    $already{$d}++ && warn "duplicate $_: $d";
		}
		foreach my $o (@$old) {
		    my $d = Dumper($o);
		    push @$new, $o unless $already{$d};
		}
	    }
	    else {
		# Compare the two data structures.  For this to work
		# correctly it requires Data::Dumper 2.12 or later, as
		# shipped with perl 5.8.0.  Older versions don't
		# support $Sortkeys.  But we don't have any version
		# check here - in the worst case all that results from
		# using an older Data::Dumper is a few spurious
		# warning messages.
		#
		my $old_dump = Dumper($old);
		my $new_dump = Dumper($new);
		if ($old_dump ne $new_dump) {
		    warn "mismatch between summary page and details page $url for $_: $old_dump vs $new_dump\n";
		}
	    }
	}

	# Deal with any remaining keys in summary but not in
	# detailed.  This should include start and stop.
	#
	%$detailed = (%$detailed, %$s);

	push @to_write, $detailed;
	update $bar if $bar;
    }
}
else {
    @to_write = @summary_programmes;
}

$writer->write_programme($_) foreach @to_write;
$writer->end();

######################################################################
# subroutine definitions

# Suppress duplicate warnings.
my (%warned_regel, $warned_discarding_parts, $warned_slot);

# Returns a programme hashref, or undef, or the magic 'END'.
sub process_details_page( $$$ ) {
    foreach (@_) { die if ref }
    my ($ch_xmltv_id, $url, $master_url) = @_;
    local $SIG{__WARN__} = sub {
	warn "$url (from $master_url): $_[0]";
    };

    # We make an HTML::TreeBuilder object, get the information
    # from it and them delete it.
    #
    my $t = new HTML::TreeBuilder();
    $t->parse(get_nice($url));
    my @elems = $t->look_down(class => 'detailDeel');
    if (not @elems) {
	warn "did not see any 'detailDeel' elements, skipping page";
	return;
    }
    my @info;
    foreach (@elems) {
	my @cont = grep { ref } $_->content_list();
	my $n = scalar @cont;
	if ($n != 2) {
	    warn "'detailDeel' has $n elements instead of 2";
	    next;
	}
	my ($k, $v) = @cont;
	for ($k->attr('class')) {
	    if (not defined or $_ ne 'detailLabel2') {
		warn "didn't see 'detailLabel2' in 'detailDeel'";
		next;
	    }
	}
	for ($v->attr('class')) {
	    if (not defined or $_ ne 'detailContent2') {
		warn "didn't see 'detailContent2' in 'detailDeel'";
		next;
	    }
	}
	push @info, [ $k->as_text(), $v->as_text() ];
    }
    $t->delete(); undef $t;

    # Process the list of [ heading, data ] pairs.
    my (
	# Exactly one:
	$van, $tot, $naam,

	# At most one:
        $director, $previously_shown, $orig_title, $sub_title, $genre,
        $date, $episode_num, $actors, $writers, $commentators,

	# Zero or more:
        @presenter, @url,
       );
    # NB 'at most one' $actors but that one entry can give several.

    my ($teletext_sub, $widescreen) = 0; # boolean
    my @desc;                            # accumulate bits
    my $seen_tijdstip = 0;
    my $last;
  ELEM: foreach (@info) {
	my ($regel, $text) = @$_;
	foreach ($regel, $text) {
	    s/^\s+//; s/\s+$//;
	    
	    # Remove dodgy characters.
	    tr/\221\222/''/;
	}

	if ($regel eq '') {
	    # Continuation of the previous one, hopefully.
	    $regel = $last;
	}
	else {
	    # They usually end with a colon but not always.
	    $regel =~ s/:$//;
	    $last = $regel;
	}

	if ($regel eq 'Tijdstip') {
	    warn "seen 'Tijdstip' twice\n" if $seen_tijdstip++;
	    if (length($text)<=16) {
		t "'onvolledig' is true, nothing more to write";
		t 'process_details_page() RETURNING';
		return 'END';
	    }
	    else {
		# Extract time strings from the text, but not full
		# Date::Manip objects.
		#
		$van = time_van_str($text);
		$tot = time_tot_str($text);
	    }
	}
	elsif ($regel eq 'Inhoud') {
	    # Empty text for this happens often, just skip it.
	    push @desc, $text if $text ne '';
	}
	elsif ($regel eq 'Programma') {
	    warn "seen 'Programma' twice\n" if defined $naam;
	    # FIXME should really look for 'herhaling' in italics.
	    if ($text =~ s/\bherhaling\s+van\s+(\d\d?)-(\d\d?)-(\d{4})//) {
		warn "seen previously-shown information twice\n"
		  if $previously_shown;
		my ($dd, $mm, $yyyy) = ($1, $2, $3);
		$previously_shown = { start => "$yyyy$mm$dd" };
		$text =~ s/^\s+//; $text =~ s/\s+$//;
	    }
	    elsif ($text =~ s/herhaling\b//) {
		# Repeat, but no previous date given.  NB
		# sometimes we see 'herhaling' without a space
		# before it, as in the redundant
		#
		# 'Netwerk herhalingenherhaling'
		#
		# Hence no \b at the start of the regexp.  We just
		# have to hope there aren't too many compound
		# words ending in 'herhaling'.
		#
		$previously_shown = {};
		$text =~ s/^\s+//; $text =~ s/\s+$//;
	    }
	    $naam = $text;
	}
	elsif ($regel eq 'Genre') {
	    warn "seen 'Genre' twice\n" if defined $genre;
	    # Empty text for this happens often, just skip it.
	    $genre = $text if $text ne '';
	}
	elsif ($regel eq 'Zender') {
	    # I think this means 'broadcaster' but the information
	    # is redundant because we already know the channel.
	    #
	    # Then we should check it and warn if it differs!  But
	    # it does differ - every programme on the channel
	    # Nederland 1, it seems, has Zender of 'Nederland
	    # 2'. So we just ignore this information.
	    #
	}
	elsif ($regel eq 'Omroep') {
	    # FIXME I don't know what this means (the dictionary
	    # says 'wireless telegraph' but that's no help) so
	    # just ignore it.
	    #
	}
	elsif ($regel eq 'Kenmerken') {
	    foreach (split /,\s*/, $text) {
		if ($_ eq 'Teletekst ondertiteld') {
		    # I'm guessing this means teletext subtitles :-).
		    $teletext_sub++
		      && warn 'seen teletext subtitles twice';
		}
		elsif ($_ eq 'Breedbeeld uitzending') {
		    $widescreen++ && warn 'seen widescreen twice';
		}
		else {
		    warn "unknown 'Kenmerken' bit $_"
		      unless $warned_regel{"Kenmerken: $_"}++;	
		    push @desc, $_;
		}
	    }
	}
	elsif ($regel eq 'Presentatie') {
	    push @presenter, $text;
	}
	elsif ($regel eq 'Afleverings nummer') {
	    warn "seen 'Afleverings nummer' twice"
	      if defined $episode_num;
	    if ($text eq 'Slot') {
		# The last episode of a series.  There isn't a way to
		# store this in the current XMLTV format.
		#
		warn "discarding 'Slot'" unless $warned_slot++;
	    }
	    elsif ($text =~ /^\d+$/) {
		if ($text == 0) {
		    warn "I thought episode nums on the site were from 1";
		}
		else {
		    $episode_num = $text - 1;
		}
	    }
	    elsif ($text =~ /^(?:\d+-)+\d$/) {
		# This means multiple episodes.  This ought to be
		# handled by turning the programme into a clump.
		#
		warn "programme covers multiple episodes ($text), not handled";
	    }
	    else {
		warn "bad episode number $text";
	    }
	}
	elsif ($regel eq 'Titel aflevering') {
	    warn "seen 'Titel aflevering' twice"
	      if defined $sub_title;
	    $sub_title = $text;
	}
	elsif ($regel eq 'Webpagina') {
	    # We have to turn the string given, which is normally
	    # just a hostname, into a URL.  I don't see why they
	    # don't just link to it directly, this is a web site
	    # after all.
	    #
	    # Anyway, the URI library doesn't seem to have any way
	    # to take a string and turn it into a URL adding
	    # 'http:' if necessary, so we do this by hand.
	    #
	    if ($text !~ tr/://) {
		$text = "http://$text";
	    }
	    push @url, $text;
	}
	elsif ($regel eq 'Rolverdeling') {
	    warn "seen 'Rolverdeling' twice" if $actors;

	    # 'e.a' appearing in the description means 'and others';
	    # it's implicit in XMLTV that there might be other actors,
	    # so we quietly remove it.
	    #
	    $text =~ s/\s*e\.a\s*$//;
	
	    while (length $text) {
		if ($text =~ s/\s*([^:]+):\s*([^.]+)(?:$|\.)//) {
		    warn "discarding information about the parts played by each actor\n"
		      unless $warned_discarding_parts++;
		    push @$actors, $2;
		}
		elsif ($text =~ s/\s*([^,]+)(?:$|,)//) {
		    push @$actors, $1;
		}
		else {
		    warn "unknown remnant 'Rolverdeling' text '$text'";
		    last;
		}
	    }
	}
	elsif ($regel eq 'Scenario schrijver') {
	    warn "seen 'Scenario schrijver' twice" if $writers;
	    push @$writers, $text;
	}
	elsif ($regel eq 'E-mail') {
	    push @url, "mailto:$text";
	}
	elsif ($regel eq 'Bron') {
	    # FIXME cannot do anything special with this.  It
	    # means 'source' and perhaps by parsing the text we
	    # could find the names of writers or whatever.
	    #
	    push @desc, "$regel: $text";
	}
	elsif ($regel eq 'Commentaar') {
	    push @$commentators, $text;
	}
	elsif ($regel eq 'Jaar van premiere') {
	    # Year of release, I think.
	    warn "seen 'Jaar van premiere' twice"
	      if defined $date;
	    $date = $text;
	}
	elsif ($regel eq 'Regisseur') {
	    warn "seen 'Regisseur' twice" if defined $director;
	    $director = $text;
	}
	elsif ($regel eq 'Orginele titel') {
	    warn "seen 'Orginele titel' twice" if defined $orig_title;
	    $orig_title = $text;
	}
	elsif ($regel eq 'Behaalde prijzen') {
	    # Awards won.  It doesn't seem worth adding a separate
	    # field for this to the XMLTV format, just append to
	    # the description.
	    #
	    push @desc, "$regel: $text";
	}
	else {
	    # Unknown key, but let's add it to the desc so we
	    # don't lose information.  These newlines are just for
	    # the benefit of someone reading the XML by hand.
	    #
	    push @desc, "$regel: $text";
	    warn "unknown programme info key $regel\n"
	      unless $warned_regel{$regel}++;
	}
    }

    if (not defined $naam) {
	warn "did not see programme title, skipping programme\n";
	return;
    }
    if (not defined $van) {
	warn "did not see programme times, skipping programme\n";
	return;
    }
    die if not defined $tot;

    my @title = ([ $naam, $LANG]);
    push @title, [ $orig_title ] if defined $orig_title; # not Dutch!

    # We return a programme hash with 'van' and 'tot' rather than
    # fully parsed times.
    #
    my %prog
      = (channel => $ch_xmltv_id,
	 title   => \@title,
	 van     => $van,
	 tot     => $tot,
	);
	
    # We have lots of bits of description.  But we make them
    # into a single <desc> element because they probably give
    # different information, rather than stating the same
    # information in different ways.
    #
    $prog{desc} = [ [ join("\n\n", @desc), $LANG ] ] if @desc;
	
    $prog{'sub-title'} = [ [ $sub_title, $LANG ] ] if defined $sub_title;
    $prog{subtitles} = [ { type => 'teletext' } ] if $teletext_sub;
    $prog{video} = { aspect => '16:9' } if $widescreen;
    $prog{credits} = { presenter => \@presenter } if @presenter;
    $prog{'episode-num'} = [ $episode_num ] if defined $episode_num;
    $prog{url} = \@url if @url;
    $prog{date} = $date if defined $date;
    $prog{category} = [ [ $genre, $LANG ] ] if defined $genre;
    $prog{'previously-shown'} = $previously_shown if $previously_shown;
	
    my %c;
    $c{actor} = $actors if $actors;
    $c{writer} = $writers if $writers;
    $c{commentator} = $commentators if $commentators;
    $c{director} = [ $director ] if $director;
    $prog{credits} = \%c if %c;
	
    return \%prog;
}

sub time_tot_str( $ ) {
    my $input = shift;
    if (length($input) == 15) {
	$input .= '05:00';
    }
    my $datum = substr($input,0,length($input)-11);
    my $tot   = substr($input,-5);

    $datum =~ /(\d\d?)-(\d\d?)-(\d+)/
      or die "cannot find year in '$datum'";
    my ($dd, $mm, $yyyy) = ($1, $2, $3);
    foreach ($dd, $mm) { $_ = "0$_" if length == 1 }
    $tot =~ /(\d\d):(\d\d)/
      or die "cannot find time in '$tot'";
    my ($HH, $MM) = ($1, $2);

    return "$yyyy-$mm-$dd $HH:$MM:00";
}
sub time_van_str( $ ) {
    my $input = shift;
    if (length($input) == 15) {
	$input .= '06:00';
    }
    my $datum = substr($input,0,length($input)-11);
    my $van = substr($input,-11);
    $van = substr($van,0,5);

    $datum =~ /(\d\d?)-(\d\d?)-(\d{4})/
      or die "cannot find year in '$datum'";
    my ($dd, $mm, $yyyy) = ($1, $2, $3);
    foreach ($dd, $mm) { $_ = "0$_" if length == 1 }
    $van =~ /(\d\d):(\d\d)/
      or die "cannot find time in '$van'";
    my ($HH, $MM) = ($1, $2);
    return "$yyyy-$mm-$dd $HH:$MM:00";
}


# get channel listing
sub get_channels() {
    my %channels;
    my $url = 'http://www.tvgids.nl';

    #All stations are in the select box.
    #The station ID is the option value
    my $t = new HTML::TreeBuilder();
    $t->parse(get_nice($url));
    my @conts = map { [ $_->content_list() ] }
      $t->look_down('_tag' => 'select', 'name' => 'station');
    foreach my $cont (@conts) {
	my @children =@$cont;
	if (scalar(@children) == 0) {
	    warn 'No stations are defined';
	    next;
	}
	foreach my $station_line (@children) {
	    if ($station_line ne ' ') {
		#This if statement is to prevent parsing the last
		#empty element from the list.
			
		my $channel_id = $station_line->attr('value');
		
		#I am only interested in the normal channels.
		#tvgids.nl has some pages for the regional stations also
		#All normal channels have id <0,100>
		#That is at the moment... Could change in future...
		if ($channel_id > 0 && $channel_id < 100) {
		    my $channel_name = $station_line->as_text();
		    $channels{$channel_id} = $channel_name
		}
	    }

	}
    }	
    $t->delete(); undef $t;
    return %channels;
}


# Process a page containing the summary information.
#
# Parameters:
#   URL to fetch
#   Date::Manip object giving day for programmes in page (at least
#     until they cross midnight)
#   Date::Manip object giving official 'date' of page (normally the
#     same as the previous parameter)
#   XMLTV id of channel
#
# Returns a list of tuples of the form [ start, stop, title, url ]
# where start and stop are Date::Manip objects, title is a string, and
# url is the page to download if you want full details.
#
# Note that stop may be unset, since the format doesn't require that
# stop times be known.  More surprisingly, start may be unset as well!
# The meaning of this is that the times for that programme were
# completely nonsensical, and you should do something like putting it
# into a clump with the last valid programme that appeared before it,
# or else drop it with a warning.
#
sub process_summary_page( $$$ ) {
    my ($url, $day, $official_day) = @_;
    die if not defined $url; die if ref $url;
    die if not defined $day; die if ref $day;
    die if not defined $official_day; die if ref $official_day;
    local $SIG{__WARN__} = sub {
	warn "$url: $_[0]";
    };
    local $SIG{__DIE__} = sub {
	die "$url: $_[0]";
    };
    my $t = new HTML::TreeBuilder;
    $t->parse(get_nice($url));
    my %interesting; ++ $interesting{$_} foreach
      qw(lijst_zender lijst_tijd details_programma);
    my @elems = $t->look_down(sub { die if not defined $_[0];
				    my $c = $_[0]->attr('class');
				    return 0 if not defined $c;
				    return $interesting{$c} });
    if (not @elems) {
	warn 'did not find any programmes in page';
	return ();
    }
    my @bits;
    foreach my $e (@elems) {
	my $class = $e->attr('class');
	t "looking at elem of class $class";
	my $href;
      check_content:
	my @cont = $e->content_list();
	my $got_href = $e->attr('href');
	if (defined $got_href) {
	    t "got href: $got_href";
	    warn "seen 'href's contained inside each other"
	      if defined $href;
	    $href = $got_href;
	}
	if (not @cont) {
	    warn "found $class elem without content, ignoring"
	      unless $class eq 'lijst_zender';
	    next;
	}
	elsif (@cont == 1) {
	    for ($cont[0]) {
		if (ref) {
		    # Unpack this extra layer of element.
		    $e = $_;
		    goto check_content;
		}
		s/^\s+//; s/\s+$//;
		push @bits, [ $class, $href, $_ ];
	    }
	}
	elsif (@cont > 1) {
	    warn "found $class elem with more than one elem inside, ignoring"
	      unless $class eq 'lijst_zender';
	    next;
	}
    }
    $t->delete(); undef $t;
    if (not @bits) {
	warn "did not see any content, skipping page";
	return ();
    }
    t 'got bits: ' . d @bits;
    if ($bits[0]->[0] eq 'lijst_zender') {
	my $date_str = $bits[0]->[2];
	my $d = parse_dutch_date($date_str);
	if (defined $d) {
	    my ($d_base, $d_tz) = @{date_to_eur($d, $TZ)};
	    die "date in page $d_base ($date_str) doesn't match expected $official_day"
	      if UnixDate($d_base, '%Q') ne UnixDate($official_day, '%Q');
	    shift @bits;
	}
	# otherwise, leave it for later processing
    }
    my @todo;
    while (@bits >= 3) {
	my $ch_bit = shift @bits;
	my $ch_class = $ch_bit->[0];
	if ($ch_class ne 'lijst_zender') {
	    warn "bit expected to be channel name has class $ch_class not lijst_zender, skipping";
	    next;
	}
	my $ch = $ch_bit->[2];
	t 'shifted bit to get $ch=' . d $ch;
	
	my $times_bit = shift @bits;
	my $times_class = $times_bit->[0];
	if ($times_class ne 'lijst_tijd') {
	    warn "bit expected to be times has class $times_class not lijst_tijd, skipping";
	    next;
	}
	my $times = $times_bit->[2];
	t 'shifted bit to get $times=' . d $times;
	
	my $title_bit = shift @bits;
	my $title_class = $title_bit->[0];
	if ($title_class ne 'details_programma') {
	    warn "bit expected to be title has class $title_class not details_programma, skipping";
	    next;
	}
	my $title_href = $title_bit->[1];
	my $title = $title_bit->[2];
	t 'shifted bit to get $title_href=' . d $title_href . ', '
	  . '$title=' . d $title;

	if ($title =~ /^Ieder heel uur .+, tenzij anders vermeld$/) {
	    # A certain programme on the hour.  But it isn't worth
	    # adding this to the output, ignore it.
	    #
	    next;
	}
	if ($title eq 'NB: Programmering onder voorbehoud') {
	    # Programming subject to change.  There isn't a way to
	    # represent this in the current XMLTV format.
	    #
	    next;
	}
	
	my ($start_hhmm, $stop_hhmm);
	if ($times =~ /^(\d\d):(\d\d)-/
	    and 0 <= $1 and $1 < 24 and 0 <= $2 and $2 < 60) {
	    $start_hhmm = "$1:$2";
	}
	if ($times =~ /-(\d\d):(\d\d)$/
	    and 0 <= $1 and $1 < 24 and 0 <= $2 and $2 < 60) {
	    $stop_hhmm = "$1:$2";
	}

	# Right, got channel name, times, and title.
	# FIXME should check channel name (among other things).
	#
	my $title_url = URI->new_abs($title_href, $url);
	# We prefer to handle URLs as strings.
	push @todo, [ $start_hhmm, $stop_hhmm, $title, "$title_url" ];
    }

    # Now we need to make some sense of the times.  When stop time
    # appears before start time this could mean we have crossed
    # midnight, or it could just be a mistake.  If there is more than
    # one such potential crossing we pick the one with the shortest
    # resulting programme length and assume the others are mistakes.
    #
    my @crossings;
    my $shortest;
    my $next_day = UnixDate(DateCalc($day, '+ 1 day'), '%Y-%m-%d');
    my $crossing_at;
#    local $Log::TraceMessages::On = 1;
    t 'looking at raw times and searching for midnight crossing: ' . d \@todo;
    for (my $i = 0; $i < @todo; ++$i) {
	my ($start_hhmm, $stop_hhmm) = @{$todo[$i]};
	t '$start_hhmm=' . d $start_hhmm;
	t '$stop_hhmm=' . d $stop_hhmm;
	next if not defined $start_hhmm;
	next if not defined $stop_hhmm;

	my $start = parse_eur_date("$day $start_hhmm", $TZ);
	die if not defined $start;
	my $stop = parse_eur_date("$day $stop_hhmm", $TZ);
	die if not defined $stop;
	t "checking if $start -> $stop goes backwards";
	next if Date_Cmp($start, $stop) <= 0;
	t "yup, it's a candidate";

	my $stop_next_day = parse_eur_date("$next_day $stop_hhmm", $TZ);
	die if not defined $stop_next_day;
	die if Date_Cmp($stop_next_day, $start) <= 0;
	t 'if it were, stop time on next day would be: ' . d $stop_next_day;

	my $distance = Delta_Format(DateCalc($start, $stop_next_day), 0, '%st');
	t '...and length of programme: ' . d $distance;
	t 'shortest length so far: ' . d $shortest;
	if (not defined $shortest or $distance < $shortest) {
	    t 'this is the best so far';
	    $shortest = $distance;
	    $crossing_at = $i;
	}
    }
    t '@todo=' . d \@todo;

    # Now given the place at which we cross from $day to $next_day we
    # can add the appropriate days to the hh:mm times.
    #
    if (not defined $crossing_at) {
	push @$_, $day, $day foreach @todo;
    }
    else {
	for (my $i = 0; $i < $crossing_at; ++$i) {
	    push @{$todo[$i]}, $day, $day;
	}
	for (my $i = $crossing_at) {
	    push @{$todo[$i]}, $day, $next_day;
	}
	for (my $i = $crossing_at + 1; $i < @todo; ++$i) {
	    push @{$todo[$i]}, $next_day, $next_day;
	}
    }

    # Now we can parse the dates into Date::Manip objects.
    my @r;
    foreach (@todo) {
	my ($start_hhmm, $stop_hhmm, $title, $title_url, $start_day, $stop_day) = @$_;
	my ($start, $stop);
	if (defined $start_hhmm) {
	    $start = parse_eur_date("$start_day $start_hhmm", $TZ);
	    die if not defined $start;
	}
	if (defined $stop_hhmm) {
	    $stop = parse_eur_date("$stop_day $stop_hhmm", $TZ);
	    die if not defined $stop;
	}
	push @r, [ $start, $stop, $title, $title_url ];
    }
    t 'after parsing dates: ' . d \@r;

    # Check the dates and weed out those which are obviously wrong.
    my $last_start;
    foreach (@r) {
	our ($start, $stop);
	local (*start, *stop) = \ ($_->[0], $_->[1]);
	t 'checking dates, $last_start=' . d $last_start;
	t '$start=' . d $start;
	t '$stop=' . d $stop;
	if (defined $start and defined $stop
	    and Date_Cmp($start, $stop) > 0) {
	    # Appears to stop before it starts.  Assume the stop time
	    # is bogus but the start time might be okay.
	    #
	    undef $stop;
	}
	if (defined $last_start) {
	    if (defined $start and Date_Cmp($start, $last_start) < 0) {
		# Appears to start before previous start.
		undef $start;
	    }
	    if (defined $stop and Date_Cmp($stop, $last_start) < 0) {
		# Stops before previous start - that's just as bad.
		undef $stop;
	    }
	}
	$last_start = $start if defined $start;
    }
    t 'removed bad dates, now: ' . d \@r;

    # If there is a 'next page' link do a recursive call to handle it.
    foreach (@bits) {
	my ($type, $href, $text) = @$_;
	if ($type eq 'lijst_zender' and $text eq 'Volgende') {
	    # Next page for this day.
	    my $next_url = URI->new_abs($href, $url);
	    # Turn URL back into a string.
	    push @r, process_summary_page("$next_url", $day, $official_day)
	}
	elsif ($type eq 'lijst_zender' and $text eq 'Vorige') {
	    # Previous page for this day, assume already fetched.
	}
	else {
	    warn "discarding leftover $type: $text";
	}
    }

    t 'returning tuples: ' . d \@r;
    return @r;
}

# Parse date strings that are in Dutch.  'Why not just call
# Date_Init("Language=Dutch")?' I hear you ask.  The trouble is that
# Date::Manip's language is a global setting and having set it to
# Dutch we cannot use code that expects English - either in this file
# or in any libraries.  The least insane way to proceed is to turn
# Dutch to English strings here.
#
# The conversions to make, however, are swiped from the Date::Manip
# code.
#
sub parse_dutch_date( $ ) {
    for (my $tmp = $_[0]) {
	s/\bjanuari\b/January/g;
	s/\bjan\b/January/g;
	s/\bfebruari\b/February/g;
	s/\bfeb\b/February/g;
	s/\bmaart\b/March/g;
	s/\bmaa\b/March/g;
	s/\bmrt\b/March/g;
	s/\bapril\b/April/g;
	s/\bapr\b/April/g;
	s/\bmei\b/May/g;
	s/\bmei\b/May/g;
	s/\bjuni\b/June/g;
	s/\bjun\b/June/g;
	s/\bjuli\b/July/g;
	s/\bjul\b/July/g;
	s/\baugustus\b/August/g;
	s/\baug\b/August/g;
	s/\bseptember\b/September/g;
	s/\bsep\b/September/g;
	s/\boctober\b/October/g;
	s/\boktober\b/October/g;
	s/\boct\b/October/g;
	s/\bokt\b/October/g;
	s/\bnovember\b/November/g;
	s/\bnov\b/November/g;
	s/\bdecember\b/December/g;
	s/\bdec\b/December/g;

	s/\bzondag\b/Sunday/g;
	s/\bmaandag\b/Monday/g;
	s/\bdinsdag\b/Tuesday/g;
	s/\bwoensdag\b/Wednesday/g;
	s/\bdonderdag\b/Thursday/g;
	s/\bvrijdag\b/Friday/g;
	s/\bzaterdag\b/Saturday/g;

	my $r = parse_eur_date($_, $TZ);
	die "could not parse date $_ (from Dutch $_[0])"
	  if not defined $r;
	return $r;
    }
}
