#!/usr/bin/perl -w

=pod

=head1 NAME

tv_grab_nl_wolf - Grab TV listings for the Netherlands.

=head1 SYNOPSIS

tv_grab_nl_wolf [--help] [--output FILE] [--days N] [--offset N] [--quiet]

=head1 DESCRIPTION

Output TV listings for several channels available in the Netherlands.
The data comes from Carlo de WolfE<39>s site exporting listings in
XMLTV format.  The default is to grab as many days as possible from
the current day onwards.

B<--output FILE> write to FILE rather than standard output.

B<--days N> grab N days starting from today, rather than as many as
possible.

B<--offset N> start grabbing N days from today, rather than starting
today.  N may be negative.

B<--quiet> suppress the progress messages normally written to standard
error.

=head1 SEE ALSO

L<xmltv(5)>, L<http://wolf.xs4all.nl/xmltv/>

=head1 AUTHOR

Ed Avis, ed@membled.com

=head1 BUGS

The grabber fetches data from the site with very little processing.
This means that if the upstream data is incorrect, the output of the
grabber will be wrong.  In particular, the upstream source currently
has a problem where some programmes are listed with a stop time before
their start time.

=cut

use strict;
use XMLTV::Version '$Id: tv_grab_nl_wolf,v 1.6 2003/07/06 17:12:00 epaepa Exp $ ';

# We work by inheriting from XMLTV::Grab_XML and overriding certain
# methods.
#
use XMLTV::Grab_XML;
package Grab_XML_nl_wolf;
use base 'XMLTV::Grab_XML';

use Date::Manip;
use HTML::TreeBuilder;

# Todo: perhaps we should internationalize messages and docs?
# Class method: human-readable name of the country this grabber covers.
# The invocant (package name) is accepted but not used.
sub country( $ ) {
    my ($class) = @_;
    my $name = 'the Netherlands';
    return $name;
}

# Where the upstream listings live.  $URL_DIR both starts and ends with
# a slash, so $url_base (host followed by directory) ends with a slash
# too.
my ($URL_HOST, $URL_DIR) = ('http://wolf.xs4all.nl', '/xmltv/');
my $url_base = join '', $URL_HOST, $URL_DIR;

# Returns a hash mapping YYYYMMDD to URL.
#
# Class method.  Fetches the index page at $url_base via $pkg->get(),
# collects the href attribute of every <a> element in it, and keeps
# those links whose filename (after removing a leading $URL_DIR) looks
# like 'tv-<eight digits>.<something>'.  The eight digits are used as
# the hash key and the value is $url_base followed by the filename.
#
# Dies if the index page cannot be fetched or contains no <a> elements
# at all.  An index page with links but no matching filenames gives an
# empty hash.
#
sub urls_by_date( $ ) {
    my $pkg = shift;
    my $index = $pkg->get($url_base);
    die "could not get index page $url_base, aborting\n"
      if not defined $index;

    my $t = HTML::TreeBuilder->new();
    $t->parse($index);
    # parse() may be fed a document in pieces; eof() tells the parser
    # the document is complete so that the tree is finished off.
    $t->eof();
    my @urls = map { $_->attr('href') } $t->find('a');
    $t->delete(); undef $t;

    if (not @urls) {
	die "did not see any links in index page $url_base, aborting\n";
    }

    my %by_date;
    foreach my $href (@urls) {
	# An <a> without an href attribute (a named anchor, say) gives
	# undef here; skip it rather than warn about it under -w.
	next if not defined $href;

	# Work on a copy so that @urls itself is left untouched, and
	# quote $URL_DIR so it is matched literally.
	(my $file = $href) =~ s/^\Q$URL_DIR\E//;

	if ($file =~ /^tv-(\d{8})\./) {
	    # We know that the same date can occur twice, we just take
	    # the first occurrence.
	    #
	    # $url_base already ends with a slash, so do not add
	    # another one between it and the filename.
	    #
	    $by_date{$1} = "$url_base$file"
	      unless defined $by_date{$1};
	}
    }
    return %by_date;
}

# Run the grabber by calling go() as a class method on our package.
# go() is not defined in this file, so it is presumably inherited from
# the XMLTV::Grab_XML base class -- see that module for what it does.
Grab_XML_nl_wolf->go();
