#!/usr/bin/perl -w =pod =head1 NAME tv_grab_se - Grab TV listings for Sweden. =head1 SYNOPSIS tv_grab_se --help tv_grab_se [--config-file FILE] --configure tv_grab_se [--config-file FILE] [--output FILE] [--days N] [--offset N] [--quiet] [--debug] =head1 DESCRIPTION Output TV and listings in XMLTV format for many stations available in Sweden. The data comes from the website of each respective TV-station. First you must run B to choose which stations you want to receive. Then running B with no arguments will get a listings for the stations you chose for five days including today. B<--configure> Prompt for which stations to download and write the configuration file. B<--config-file FILE> Set the name of the configuration file, the default is B<~/.xmltv/tv_grab_se.conf>. This is the file written by B<--configure> and read when grabbing. B<--output FILE> When grabbing, write output to FILE rather than standard output. B<--days N> When grabbing, grab N days rather than 5. B<--offset N> Start grabbing at today + N days. N may be negative. B<--quiet> suppress the progress-bar normally shown on standard error. B<--debug> provide more information on progress to stderr to help in debugging. B<--help> print a help message and exit. =head1 ERROR HANDLING If the grabber fails to download data for some channel on a specific day, it will print an errormessage to STDERR and then continue with the other channels and days. The grabber will exit with a status code of 1 to indicate that the data is incomplete. =head1 SUPPORTED CHANNELS tv_grab_se can currently fetch data for the following channels: SVT1, SVT2, Barnkanalen, SVT24, TV4, TV4+. TV3, TV8, ZTV Kanal 5 VIASAT SPORT 1/2/3 EXPLORER, ACTION/NATURE TV1000, CINEMA EuroSport MTV Nordic YLE TV Finland =head1 SEE ALSO L =head1 AUTHOR Mattias Holmlund, mattias -at- holmlund -dot- se. This documentation and parts of the code copied from tv_grab_uk by Ed Avis, ed -at- membled -dot- com. =head1 BUGS The grabber for Viasat (TV3, TV8 and ZTV) does not fetch any desriptions for the programmes, since it would then have to fetch one html-page per programme. It does however store a url for each programme where the description can be found. =cut use strict; use XMLTV; use XMLTV::Ask; use XMLTV::Config_file; use XMLTV::Get_nice qw(get_nice); use XMLTV::Memoize; XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux'); use XMLTV::DST; use XML::LibXML; use Date::Manip; use Getopt::Long; use URI; use Encode qw(encode_utf8 decode_utf8 encode decode); # Use Term::ProgressBar if installed. use constant Have_bar => eval { require Term::ProgressBar; 1 }; # The timezone for Sweden during wintertime. use constant LOCAL_TZ => "+0100"; # We assume most site data is in this language. use constant LANG => 'sv'; # List all available channels along with the grabber for them # and the parameters to send to the grabber. my %channels = ( # # Swedish channels # 'svt1.svt.se' => [ "SVT1", \&get_data_svt, 8759, "SgSVT1BG", "SgSVT1Title" ], 'svt2.svt.se' => [ "SVT2", \&get_data_svt, 8760, "SgSVT2BG", "SgSVT2Title" ], 'svt24.svt.se' => [ "SVT24", \&get_data_svt, 8761, "Sg24BG", "Sg24Title" ], 'barnkanalen.svt.se' => [ "Barnkanalen", \&get_data_svt, 8762, "SgBarnkanalenBG", "SgBarnkanalenTitle" ], 'tv4.se' => [ "TV4", \&get_data_tv4, 1 ], 'plus.tv4.se' => [ "TV4+", \&get_data_tv4, 3 ], 'tv3.se' => [ "TV 3 Sverige", \&get_data_viasat, "www.tv3.se", "sv" ], 'kanal5.se' => [ "Kanal 5", \&get_data_kanal5 ], 'eurosport.com' => [ "EuroSport", \&get_data_eurosport ], 'tv8.se' => [ "TV8", \&get_data_viasat, "www.tv8.se", "sv" ], 'ztv.se' => [ "ZTV Sverige", \&get_data_viasat, "www.ztv.se", "sv" ], 'tv1000.viasat.se' => [ "TV 1000 Sverige", \&get_data_viasat_tv1000, "www.tv1000.se", "sv", 1, 0], 'plus-1.tv1000.viasat.se' => [ "TV 1000 Sverige (delayed one hour)", \&get_data_viasat_tv1000, "www.tv1000.se", "sv", 1, 1], 'plus-2.tv1000.viasat.se' => [ "TV 1000 Sverige (delayed two hours)", \&get_data_viasat_tv1000, "www.tv1000.se", "sv", 1, 2], 'cinema.viasat.se' => [ "Viasat Cinema Sverige", \&get_data_viasat_tv1000, "www.tv1000.se", "sv", 3, 0], 'plus-1.cinema.viasat.se' => [ "Viasat Cinema Sverige (delayed one hour)", \&get_data_viasat_tv1000, "www.tv1000.se", "sv", 3, 1], 'plus-2.cinema.viasat.se' => [ "Viasat Cinema Sverige (delayed two hours)", \&get_data_viasat_tv1000, "www.tv1000.se", "sv", 3, 2], 'sport1.viasat.se' => [ "Viasat Sport 1 Sverige", \&get_data_viasat, "www.sport.viasat.se", "sv" ], 'sport2.viasat.se' => [ "Viasat Sport 2 Sverige", \&get_data_viasat, "www.sport.viasat.se", "sv", "&override_section=76" ], 'sport3.viasat.se' => [ "Viasat Sport 3 Sverige", \&get_data_viasat, "www.sport.viasat.se", "sv", "&override_section=77" ], 'action.viasat.se' => [ "Viasat Nature Action Sverige", \&get_data_viasat, "www.action.viasat.se", "sv" ], 'explorer.viasat.se' => [ "Viasat Explorer Sverige", \&get_data_viasat, "explorer.viasat.se", "sv" ], 'nordic.mtve.com' => [ "MTV Nordic", \&get_data_mtve ], 'tvfinland.yle.fi' => [ "YLE TV Finland", \&get_data_yle ], # # Danish channels # '3plus.dk' => [ "3+ Denmark", \&get_data_viasat, "www.3plus.dk", "dk" ], 'sport1.viasat.dk' => [ "Viasat Sport 1 Denmark", \&get_data_viasat, "www.sport.viasat.dk", "dk" ], 'sport2.viasat.dk' => [ "Viasat Sport 2 Denmark", \&get_data_viasat, "www.sport.viasat.dk", "dk", "&override_section=76" ], 'sport3.viasat.dk' => [ "Viasat Sport 3 Denmark", \&get_data_viasat, "www.sport.viasat.dk", "dk", "&override_section=77" ], 'action.viasat.dk' => [ "Viasat Nature Action Denmark", \&get_data_viasat, "www.action.viasat.dk", "dk" ], 'explorer.viasat.dk' => [ "Viasat Explorer Denmark", \&get_data_viasat, "explorer.viasat.dk", "dk" ], 'tv1000.viasat.dk' => [ "TV 1000 Denmark", \&get_data_viasat_tv1000, "www.tv1000.dk", "dk", 1, 0], 'plus-1.tv1000.viasat.dk' => [ "TV 1000 Denmark (delayed one hour)", \ &get_data_viasat_tv1000, "www.tv1000.dk", "dk", 1, 1], 'plus-2.tv1000.viasat.dk' => [ "TV 1000 Denmark (delayed two hours)", \&get_data_viasat_tv1000, "www.tv1000.dk", "dk", 1, 2], 'cinema.viasat.dk' => [ "Viasat Cinema Denmark", \&get_data_viasat_tv1000, "www.tv1000.dk", "dk", 3, 0], 'plus-1.cinema.viasat.dk' => [ "Viasat Cinema Denmark (delayed one hour)", \&get_data_viasat_tv1000, "www.tv1000.dk", "dk", 3, 1], 'plus-2.cinema.viasat.dk' => [ "Viasat Cinema Denmark (delayed two hours)", \&get_data_viasat_tv1000, "www.tv1000.dk", "dk", 3, 2], 'tv3.dk' => [ "TV 3 Denmark", \&get_data_viasat, "www.tv3.dk", "dk" ], # # Norwegian channels # 'tv3.no' => [ "TV 3 Norge", \&get_data_viasat, "www.tv3.no", "no" ], 'ztv.no' => [ "ZTV Norge", \&get_data_viasat, "www.ztv.no", "no" ], 'sport1.viasat.no' => [ "Viasat Sport 1 Norway", \&get_data_viasat, "www.sport.viasat.no", "no" ], 'sport2.viasat.no' => [ "Viasat Sport 2 Norway", \&get_data_viasat, "www.sport.viasat.no", "no", "&override_section=76" ], 'sport3.viasat.no' => [ "Viasat Sport 3 Norway", \&get_data_viasat, "www.sport.viasat.no", "no", "&override_section=77" ], 'action.viasat.no' => [ "Viasat Nature Action Norway", \&get_data_viasat, "www.action.viasat.no", "no" ], 'explorer.viasat.no' => [ "Viasat Explorer Norway", \&get_data_viasat, "explorer.viasat.no", "no" ], 'tv1000.viasat.no' => [ "TV 1000 Norway", \&get_data_viasat_tv1000, "www.tv1000.no", "no", 1, 0], 'plus-1.tv1000.viasat.no' => [ "TV 1000 Norway (delayed one hour)", \&get_data_viasat_tv1000, "www.tv1000.no", "no", 1, 1], 'plus-2.tv1000.viasat.no' => [ "TV 1000 Norway (delayed two hours)", \&get_data_viasat_tv1000, "www.tv1000.no", "no", 1, 2], 'cinema.viasat.no' => [ "Viasat Cinema Norway", \&get_data_viasat_tv1000, "www.tv1000.no", "no", 3, 0], 'plus-1.cinema.viasat.no' => [ "Viasat Cinema Norway (delayed one hour)", \&get_data_viasat_tv1000, "www.tv1000.no", "no", 3, 1], 'plus-2.cinema.viasat.no' => [ "Viasat Cinema Norway (delayed two hours)", \&get_data_viasat_tv1000, "www.tv1000.no", "no", 3, 2], # # Finnish channels # 'tv1000.viasat.fi' => [ "TV 1000 Finland", \&get_data_viasat_tv1000, "www.tv1000.fi", "fi", 1, 0], 'plus-1.tv1000.viasat.fi' => [ "TV 1000 Finland (delayed one hour)", \&get_data_viasat_tv1000, "www.tv1000.fi", "fi", 1, 1], 'plus-2.tv1000.viasat.fi' => [ "TV 1000 Finland (delayed two hours)", \&get_data_viasat_tv1000, "www.tv1000.fi", "fi", 1, 2], 'cinema.viasat.fi' => [ "Viasat Cinema Finland", \&get_data_viasat_tv1000, "www.tv1000.fi", "fi", 3, 0], 'plus-1.cinema.viasat.fi' => [ "Viasat Cinema Finland (delayed one hour)", \&get_data_viasat_tv1000, "www.tv1000.fi", "fi", 3, 1], 'plus-2.cinema.viasat.fi' => [ "Viasat Cinema Finland (delayd two hours)", \&get_data_viasat_tv1000, "www.tv1000.fi", "fi", 3, 2], ); my $opt = { days => 5, offset => 0, "config-file" => undef, configure => 0, help => 0, quiet => 0, output => undef, debug => 0, }; my $res = GetOptions( $opt, qw/ days=i offset=i config-file=s configure help|h quiet output=s debug / ); sub t; if( (not $res) or scalar(@ARGV) or $opt->{help} ) { print << 'EOH'; tv_grab_se --help tv_grab_se [--config-file FILE] --configure tv_grab_se [--config-file FILE] [--output FILE] [--days N] [--offset N] [--quiet] [--debug] EOH exit(1); } # XMLTV::DST says that we should do this... Date_Init('TZ=UTC'); # File that stores which channels to download. my $config_file = XMLTV::Config_file::filename($opt->{'config-file' }, 'tv_grab_se', not $opt->{debug} ); if( $opt->{configure} ) { configure( $config_file ); exit; } # List of the ids of all channels that should be loaded. # This is loaded from the configuration file. my @channel_list = (); load_config( $config_file ); my( $odoc, $root ); my $warnings = 0; my %w_args = ( encoding => 'UTF-8' ); if (defined $opt->{output}) { t "Sending output to $opt->{output}."; my $fh = new IO::File "> $opt->{output}"; die "cannot write to $opt->{output}" if not $fh; $w_args{OUTPUT} = $fh; } my $w = new XMLTV::Writer( %w_args ); # $w->comment("Hello from XML::Writer's comment() method"); $w->start({ 'generator-info-name' => 'tv_grab_se' }); # Write list of channels. t 'Writing list of channels.'; foreach my $channel_id (@channel_list) { my( $channel_name, $get_sub, @param ) = @{$channels{$channel_id}}; $w->write_channel( { id => $channel_id, 'display-name' => [[ $channel_name, LANG ]], } ); } my $now = ParseDate( 'now' ); my $today = UnixDate( $now, "%Y%m%d" ); my $date = increase_date( $today, $opt->{offset} ); my $bar = undef; $bar = new Term::ProgressBar( { name => 'downloading listings', count => $opt->{days} * @channel_list }) if Have_bar && (not $opt->{quiet}) && (not $opt->{debug}); for( my $i=0; $i < $opt->{days}; $i++ ) { t "Date: $date"; foreach my $channel_id (@channel_list) { t " $channel_id"; my( $channel_name, $get_sub, @param ) = @{$channels{$channel_id}}; &{$get_sub}( $w, $channel_id, $date, @param ); update $bar if defined( $bar ); } $date = increase_date( $date, 1 ); } $w->end(); # Signal that something went wrong if there were warnings. exit(1) if $warnings; # All data fetched ok. t "Exiting without warnings."; exit(0); ########################################## # # Routines common for all channels. # ########################################## sub parse_xml { my( $html ) = @_; my $doc; # Stupid XML::LibXML writes to STDERR. Redirect it temporarily. open(SAVERR, ">&STDERR"); # save the stderr fhandle print SAVERR "Nothing\n" if 0; open(STDERR,"> /dev/null"); eval { my $xml = XML::LibXML->new; $xml->recover(1); $doc = $xml->parse_html_string($html); }; warning( "Error from eval: $@" ) if $@; # Restore STDERR open( STDERR, ">&SAVERR" ); warning( "Failed to parse html" ) unless defined( $doc ); return $doc; } sub increase_date { my( $datestr, $delta ) = @_; my( $year, $month, $day ) = ( $datestr =~ /(\d\d\d\d)(\d\d)(\d\d)/ ); my $date = ParseDate( "$year-$month-$day" ); my $newdate = DateCalc( $date, "+ $delta days" ); return UnixDate( $newdate, "%Y%m%d" ); } # Delete leading and trailing space from a string. # Convert all whitespace to spaces. Convert multiple # spaces to a single space. sub norm { my( $str ) = @_; $str =~ s/^\s+//; $str =~ s/\s+$//; $str =~ tr/\n\r\t / /s; return $str; } # # Error handling # sub t { my( $message ) = @_; print STDERR $message . "\n" if $opt->{debug}; } sub warning { my( $message ) = @_; print STDERR $message . "\n"; $warnings++; } # # Configuration # sub load_config { my( $config_file ) = @_; my @lines = XMLTV::Config_file::read_lines( $config_file ); foreach my $line (@lines) { next unless defined $line; my( $command, $param ) = split( /\s+/, $line ); die "Unknown command $command in config-file $config_file" unless $command =~ /^\s*channel\s*$/; $param =~ tr/\n\r //d; push @channel_list, $param; } } sub configure { my( $config_file ) = @_; XMLTV::Config_file::check_no_overwrite( $config_file ); # FIXME need to make directory open(CONF, ">$config_file") or die "cannot write to $config_file: $!"; my @chan = sort { join( ".", reverse( split /\./, $a ) ) cmp join( ".", reverse( split /\./, $b ) ) } keys %channels; # Ask about Swedish channels first. my @all = (grep( /\.se$/, @chan ), grep( !/\.se$/, @chan )); my @wanted = askManyBooleanQuestions(1, map { "get channel $channels{$_}->[0] ($_)?" } @all ); foreach (@all) { print CONF '# ' if not shift @wanted; print CONF "channel $_\n"; } close CONF or warn "cannot close $config_file: $!"; say("Finished configuration."); } ########################################## # # svt.se # ########################################## sub get_data_svt { my( $w, $channel, $date, $id, $bgclass, $titleclass ) = @_; my( $base, $html ) = get_html_svt( $id, $date ); if( not defined( $html ) ) { warning( "Failed to fetch html for $channel on $date." ); return; } my $doc = parse_xml( $html ); my $lasttime = "0000"; my $ns = $doc->find( "//tr[td/\@class='$bgclass']" ); if( $ns->size() == 0 ) { warning "No data available for $channel on $date."; return; } foreach my $node ($ns->get_nodelist) { my $time = $node->findvalue( "td[\@class='$bgclass']" ); my( $starttime, $stoptime ) = ($time =~ /(\d+\.\d+)\s*-\s*(\d+\.\d+)/ ); $starttime = $time unless defined $starttime; $starttime =~ tr/\.//d; $stoptime =~ tr/\.//d if defined $stoptime; my $title = $node->findvalue( ".//*[\@class='$titleclass']" ); my $url = $node->findvalue( "(.//a)[1]/\@href" ); # Delete character that SVT uses to signify a link. $title =~ tr/»//d; my $description = ""; my $ns2 = $node->find( ".//div[\@class='SgZText'][preceding-sibling::span[\@class='$titleclass']]/text()" ); foreach my $desc ($ns2->get_nodelist) { $description .= $desc->findvalue('.'); } # Delete character that SVT uses to signify a link. $description =~ tr/»//d; if( $starttime < $lasttime ) { $date = increase_date( $date, 1 ); } my %d = ( channel => $channel, start => utc_offset( "$date$starttime", LOCAL_TZ ), title => [ [ norm($title), LANG ] ], ); if( defined $stoptime ) { my $stopdate = $date; if( $stoptime < $starttime ) { $stopdate = increase_date( $stopdate, 1 ); } $d{stop} = utc_offset( "$stopdate$stoptime", LOCAL_TZ ); } $d{desc} = [ [ norm($description), LANG ] ] if $description =~ /\S/; $d{url} = [URI->new($url)->abs($base)] if $url =~ /\S/; $w->write_programme( \%d ); $lasttime = $starttime; } } sub get_html_svt { my( $id, $date ) = @_; my $url = "http://svt.se/svt/jsp/polopoly.jsp?" . "d=$id\&selectedDate=$date\&shortVersion=false\&x=31\&y=8"; return ( URI->new( $url ), decode( "ISO-8859-1", get_nice( $url ) ) ); } ########################################## # # tv4.se # ########################################## sub get_html_tv4 { my( $id, $date ) = @_; $date =~ s/(\d{4})(\d{2})(\d{2})/$1-$2-$3/; my $url = "http://www.tv4.se/program/tabla.aspx?date=$date"; $url .= "\&ch=$id"; # unless $id==1; return ( URI->new( $url ), decode( "ISO-8859-1", get_nice( $url ) ) ); } sub get_data_tv4 { my( $w, $channel, $date, $id ) = @_; my( $base, $html ) = get_html_tv4( $id, $date ); if( not defined( $html ) ) { warning( "Failed to fetch html for $channel on $date." ); return; } my $doc = parse_xml( $html ); if( not defined( $doc ) ) { warning( "Failed to parse html for $channel on $date." ); return; } # Every odd tr contains the starttime and title. Every even tr contains # the description if it exists. The description can be found since it has # an empty first td. my $ns = $doc->find( "//span[\@id='LabelResults']/tr" ); if( $ns->size() <= 2 ) { warning "No data available for $channel on $date."; return; } my @programmes = (); my $lasttime = "0000"; foreach my $node ($ns->get_nodelist) { my $time = $node->findvalue( "td[1]" ); $time =~ tr/\n\r //d; my $text = $node->findvalue( "td[2]" ); $text =~ tr/\n\r / /s; $text =~ s/^\s*//; $text =~ s/\s*$//; # Delete character that TV4 uses to signify a link. $text =~ tr/»//d; # But Perl might see something else. UTF-8 stuff, I guess. my @text = split(//, $text); $text =~ s/\s*.$// if (@text and ord($text[-1]) == 194); if( $time =~ /^\s*\d\d:\d\d\s*$/ ) { # This tr contains a time. $time =~ s/^\s*(\d\d):(\d\d)\s*/$1$2/; if( $time < $lasttime ) { $date = increase_date( $date, 1 ); } $lasttime = $time; if( ($text =~ /^\s*-S.*ndningsuppeh.*ll-\s*$/) and scalar(@programmes > 0) ) { # This entry signals that there is nothing on TV. # Use the starttime of this entry as the end-time of the # previous entry. $programmes[-1]->{stop} = utc_offset( "$date$time", LOCAL_TZ ); } else { my %prog = ( channel => $channel, start => utc_offset( "$date$time", LOCAL_TZ ), title => [ [norm($text), LANG ] ], ); my $url = $node->findvalue('(td[2]//a)[1]/@href'); $prog{url} = [URI->new($url)->abs($base)] if $url =~ /\S/; push @programmes, \%prog; } } else { # This tr does not contain a time. It must be a description # for the previous entry. if( $text =~ /\S/ ) { $programmes[-1]->{desc} = [ [norm($text), LANG] ]; } } } foreach my $prog (@programmes) { $w->write_programme( $prog ); } } ########################################## # # kanal5.se # ########################################## my $kanal5_html = {}; my $kanal5_dates = {}; my @kanal5_table_queue; my $kanal5_table_loaded = {}; # Find the date of a page to see if it's the one we want. sub find_date_kanal5 { my ( $url ) = @_; my $html = $kanal5_html->{$url}; my $doc = parse_xml( $html ); if( not defined( $doc ) ) { return undef; } my $ns = $doc->find( "//span[\@class='Heading1']" ); if ($ns->size() != 1) { return undef; } my @nodes = $ns->get_nodelist; my $date = $nodes[0]->textContent(); # example: "tisdag, 13 juli 2004" $date =~ s/ januari / 01 /; $date =~ s/ februari / 02 /; $date =~ s/ mars / 03 /; $date =~ s/ april / 04 /; $date =~ s/ maj / 05 /; $date =~ s/ juni / 06 /; $date =~ s/ juli / 07 /; $date =~ s/ augusti / 08 /; $date =~ s/ september / 09 /; $date =~ s/ oktober / 10 /; $date =~ s/ november / 11 /; $date =~ s/ december / 12 /; $date =~ s/^.*, ([^ ]) ([^ ]*) ([^ ]*)/$3$20$1/; $date =~ s/^.*, ([^ ][^ ]) ([^ ]*) ([^ ]*)/$3$2$1/; if (defined $date) { $kanal5_dates->{$date} = $url; } } # Find links to other pages that we might want to look at. sub find_table_urls_kanal5 { my ( $url ) = @_; my $html = $kanal5_html->{$url}; my $doc = parse_xml( $html ); if( not defined( $doc ) ) { # Hopefully not fatal. return; } my $ns = $doc->find( "//a" ); # Skip using found URLs until this is reset; my $old = 1; # Find URL:s to pages with more programmes. for my $node ($ns->get_nodelist) { my $nstr = $node->textContent(); if ($nstr =~ /M.*ndag$/ || $nstr =~ /Tisdag$/ || $nstr =~ /Onsdag$/ || $nstr =~ /Torsdag$/ || $nstr =~ /Fredag$/ || $nstr =~ /L.*rdag$/ || $nstr =~ /S.*ndag$/ || $nstr =~ /N.*sta veckas tabl/) { if ($nstr =~ /^\S/) { # If the string begins with ·  # then it's a link to the page itself. # The rest of the links will be to later # days. $old = 0; } unless ($old) { my $attributelist = $node->attributes; my $href = $attributelist->getNamedItem("href"); if (defined $href) { my $nurl = $href->textContent; my $absurl = URI->new($nurl)->abs($url); push @kanal5_table_queue, $absurl; } } } } } # Download and index a page. sub load_url_kanal5 { my( $url ) = @_; $kanal5_html->{$url} = ( URI->new( $url ), decode( "ISO-8859-1", get_nice( $url ) ) ); find_date_kanal5($url); find_table_urls_kanal5($url); } # Work through queue of pages until we find the one we want. sub get_url_by_date_kanal5 { my( $date ) = @_; while (@kanal5_table_queue and !defined($kanal5_dates->{$date})) { my $tryurl = shift @kanal5_table_queue; unless (defined ($kanal5_table_loaded->{$tryurl})) { get_html_kanal5($tryurl); $kanal5_table_loaded->{$tryurl} = 1; } } if (defined ($kanal5_dates->{$date})) { return $kanal5_dates->{$date}; } else { return undef; } } # Download, index and return HTML. sub get_html_kanal5 { my( $url ) = @_; unless (defined ($kanal5_html->{$url})) { load_url_kanal5($url); } return $kanal5_html->{$url}; } # The programme table parser. sub get_programmes_kanal5 { my( $channel, $date, $base, $html ) = @_; my $doc = parse_xml( $html ); if( not defined( $doc ) ) { return undef; } my $ns = $doc->find( "//span[\@class='Heading1']/parent::td/parent::tr/parent::table/tr/td[\@class='Content']/node()" ); if( $ns->size() == 0 ) { return undef; } my @programmes = (); my $starttime = ""; my $title = ""; my $progdesc = ""; my $progurl = ""; my $progdate = $date; my $nextdate = $progdate; foreach my $node ($ns->get_nodelist) { my $nstr = $node->textContent; if ($nstr =~ /^\s*([0-9][0-9]):([0-9][0-9])$/) { my $nextstart = "$1$2"; if (($nextstart cmp $starttime) == -1) { $nextdate = increase_date($nextdate, 1); } if ($title =~ /\S/) { # Ok, we have a programme. my %prog = ( channel => $channel, start => utc_offset( "$progdate$starttime", LOCAL_TZ ), stop => utc_offset( "$nextdate$nextstart", LOCAL_TZ ), title => [ [$title, LANG ] ], ); $prog{url} = [URI->new($progurl)->abs($base)] if $progurl =~ /\S/; if ($progdesc =~ /\S/) { $progdesc =~ s/^\s*//; $progdesc =~ s/\s*$//; $prog{desc} = [ [ $progdesc, LANG ] ]; } push @programmes, \%prog; } $starttime = $nextstart; $progdate = $nextdate; $title = ""; $progdesc = ""; $progurl = ""; } elsif ($node->nodeName eq "b") { $title = $nstr; my $firstchild = $node->firstChild; if (defined $firstchild) { if ($firstchild->nodeName eq "a") { my $attributelist = $firstchild->attributes; my $href = $attributelist->getNamedItem("href"); if (defined $href) { $progurl = $href->textContent; } } } } else { $progdesc .= $nstr . "\n"; } } # There's never an end time, so we'll just skip # the last programme. return \@programmes; } # Find the right page, parse the table on the page and output the # result. sub get_data_kanal5 { my( $w, $channel, $date ) = @_; # Initialize the URL queue if it hasn't been done yet. get_html_kanal5("http://www.kanal5.se/Templates/TVListToday.aspx"); my $html = get_html_kanal5(get_url_by_date_kanal5($date)); unless (defined $html) { warning( "Failed to fetch html for $channel on $date." ); return; } my $programmes = get_programmes_kanal5($channel, $date, "http://www.kanal5.se/", $html); unless (defined $programmes) { warning( "Failed to parse html for $channel on $date." ); return; } foreach my $prog (@$programmes) { $w->write_programme( $prog ); } } ########################################## # # eurosport.com # ########################################## my $eurosport_html = {}; my $eurosport_dates = {}; my @eurosport_table_queue; my $eurosport_table_loaded = {}; # Find the date of a page to see if it's the one we want. sub find_date_eurosport { my ( $url ) = @_; my $html = $eurosport_html->{$url}; my $doc = parse_xml( $html ); if( not defined( $doc ) ) { return undef; } my $ns = $doc->find( "//td[\@class='tabletitre']/child::text()" ); if ($ns->size() != 1) { return undef; } my @nodes = $ns->get_nodelist; my $date = $nodes[0]->textContent(); if (defined $date) { $date =~ s/.*ON\s//; $date =~ s/\///g; $eurosport_dates->{$date} = $url; } } # Find links to other pages that we might want to look at. sub find_table_urls_eurosport { my ( $url ) = @_; my $html = $eurosport_html->{$url}; my $doc = parse_xml( $html ); if( not defined( $doc ) ) { # Hopefully not fatal. return; } my $ns = $doc->find( "//a[text()='TV Schedule']" ); if (defined $ns) { my @nodes = $ns->get_nodelist; my $scheduleurl = $nodes[0]->getAttribute("href"); if ($scheduleurl =~ /\S/) { $scheduleurl = URI->new($scheduleurl)->abs($url); push @eurosport_table_queue, $scheduleurl; } } my $ns2 = $doc->find( "//a" ); if (defined $ns2) { my @nodes = $ns2->get_nodelist; for my $node (@nodes) { my $nstr = $node->textContent; if ($nstr =~ /^\s*..\s*([0-9][0-9])\/([0-9][0-9])$/) { my $month = $1; my $day = $2; my $scheduleurl = $node->getAttribute("href"); $scheduleurl = URI->new($scheduleurl)->abs($url); if ($scheduleurl =~ /\S/) { for my $tryday (0, 1, 2, 3, 4, 5, 6, 7, 8) { my $trydate = increase_date( $today, $tryday ); if ($trydate =~ /$month$day$/) { $eurosport_dates->{$trydate} = $scheduleurl; } } } } } } } # Download and index a page. sub load_url_eurosport { my( $url ) = @_; $eurosport_html->{$url} = ( URI->new( $url ), decode_utf8( get_nice( $url ) ) ); find_date_eurosport($url); find_table_urls_eurosport($url); } # Work through queue of pages until we find the one we want. sub get_url_by_date_eurosport { my( $date ) = @_; while (@eurosport_table_queue and !defined($eurosport_dates->{$date})) { my $tryurl = shift @eurosport_table_queue; unless (defined ($eurosport_table_loaded->{$tryurl})) { get_html_eurosport($tryurl); $eurosport_table_loaded->{$tryurl} = 1; } } if (defined ($eurosport_dates->{$date})) { return $eurosport_dates->{$date}; } else { # print "FAILED: $date\n"; return undef; } } # Download, index and return HTML. sub get_html_eurosport { my( $url ) = @_; unless (defined ($eurosport_html->{$url})) { load_url_eurosport($url); } return $eurosport_html->{$url}; } # The programme table parser. sub get_programmes_eurosport { my( $channel, $date, $base, $html ) = @_; my $doc = parse_xml( $html ); if( not defined( $doc ) ) { return undef; } my $ns = $doc->find("//td[\@class='tvschtitre']/parent::tr/parent::table/parent::td/parent::tr/td/table/descendant::td[\@class='tvschtitre' or \@class='tvschtitrelive' or \@class='tvschtext']" ); if( $ns->size() == 0 ) { return undef; } my @programmes = (); my $starttime = ""; my $title = ""; my $progdesc = ""; my $progurl = ""; my $progdate = $date; my $nextdate = $progdate; foreach my $node ($ns->get_nodelist) { my $nstr = $node->textContent; if ($nstr =~ /^\s*([0-9][0-9]):([0-9][0-9])$/) { my $nextstart = "$1$2"; if (($nextstart cmp $starttime) == -1) { $nextdate = increase_date($nextdate, 1); } if ($title =~ /\S/) { # Ok, we have a programme. my %prog = ( channel => $channel, start => utc_offset( "$progdate$starttime", LOCAL_TZ ), stop => utc_offset( "$nextdate$nextstart", LOCAL_TZ ), title => [ [$title, LANG ] ], ); $prog{url} = [URI->new($progurl)->abs($base)] if $progurl =~ /\S/; if ($progdesc =~ /\S/) { $progdesc =~ s/^\s*//; $progdesc =~ s/\s*$//; $prog{desc} = [ [ $progdesc, LANG ] ]; } push @programmes, \%prog; } $starttime = $nextstart; $progdate = $nextdate; $title = ""; $progdesc = ""; $progurl = ""; } elsif ($title eq "") { $title = $nstr; $title =~ s/^\s*//gs; $title =~ s/\s*$//gs; my $links = $doc->find("a"); if( $links->size() > 0 ) { foreach my $linknode ($links->get_nodelist) { my $attributelist = $linknode->attributes; my $href = $attributelist->getNamedItem("href"); if (defined $href) { $progurl = $href->textContent; } } } } else { my $desc = $nstr; $desc =~ s/^\s*//gs; $desc =~ s/\s*$//gs; $progdesc .= $desc . "\n"; } } # There's never an end time, so we'll just skip # the last programme. return \@programmes; } # Find the right page, parse the table on the page and output the # result. sub get_data_eurosport { my( $w, $channel, $date ) = @_; # Initialize the URL queue if it hasn't been done yet. get_html_eurosport("http://www.eurosport.com/"); my $url = get_url_by_date_eurosport($date); unless (defined $url) { warning( "Failed to find url for $channel on $date." ); return; } my $html = get_html_eurosport($url); unless (defined $html) { warning( "Failed to fetch html for $channel on $date." ); return; } my $programmes = get_programmes_eurosport($channel, $date, "http://www.eurosport.com/", $html); unless (defined $programmes) { warning( "Failed to parse html for $channel on $date." ); return; } foreach my $prog (@$programmes) { $w->write_programme( $prog ); } } ########################################## # # mtve.com # ########################################## # Return the URL of the page for the date. sub get_url_by_date_mtve { my( $date ) = @_; return "http://www.nordic.mtve.com/television/schedule.asp?date=".$today."&show_date=" . $date; } # Download and return HTML. sub get_html_mtve { my( $url ) = @_; return ( URI->new( $url ), decode( "ISO-8859-1", get_nice( $url ) ) ); } # The programme table parser. sub get_programmes_mtve { my( $channel, $date, $base, $html ) = @_; my $doc = parse_xml( $html ); if( not defined( $doc ) ) { return undef; } my $ns = $doc->find("//tr[\@bgcolor='']/parent::table/tr/td/node()"); if( $ns->size() == 0 ) { return undef; } my @programmes = (); my $starttime = ""; my $title = ""; my $progurl = ""; my $progdate = $date; my $nextdate = $progdate; foreach my $node ($ns->get_nodelist) { my $nstr = $node->textContent; my $nodename = $node->nodeName; if (defined $nodename and $nodename eq "div" and $nstr =~ /^([0-9][0-9]):([0-9][0-9])\s/) { my $nextstart = "$1$2"; if (($nextstart cmp $starttime) == -1) { $nextdate = increase_date($nextdate, 1); } if ($title =~ /\S/) { # Ok, we have a programme. # LOCAL_TZ should be alright for MTVE Nordic. my %prog = ( channel => $channel, start => utc_offset( "$progdate$starttime", LOCAL_TZ ), stop => utc_offset( "$nextdate$nextstart", LOCAL_TZ ), title => [ [$title, "en" ] ], ); $prog{url} = [URI->new($progurl)->abs($base)] if $progurl =~ /\S/; push @programmes, \%prog; } $starttime = $nextstart; $progdate = $nextdate; $title = ""; $progurl = ""; } else { $title = $nstr; $title =~ s/^\s*//; if ($nodename eq "a") { my $attributelist = $node->attributes; my $href = $attributelist->getNamedItem("href"); if (defined $href) { $progurl = $href->textContent; } } } } # The table seems to always end at midnight. my $nextstart = "0000"; if (($nextstart cmp $starttime) == -1) { $nextdate = increase_date($nextdate, 1); } if ($title =~ /\S/) { # Ok, we have a programme. my %prog = ( channel => $channel, start => utc_offset( "$progdate$starttime", LOCAL_TZ ), stop => utc_offset( "$nextdate$nextstart", LOCAL_TZ ), title => [ [$title, LANG ] ], ); $prog{url} = [URI->new($progurl)->abs($base)] if $progurl =~ /\S/; push @programmes, \%prog; } return \@programmes; } # Find the right page, parse the table on the page and output the # result. sub get_data_mtve { my( $w, $channel, $date ) = @_; my $html = get_html_mtve(get_url_by_date_mtve($date)); unless (defined $html) { warning( "Failed to fetch html for $channel on $date." ); return; } my $programmes = get_programmes_mtve($channel, $date, "http://www.nordic.mtve.com/", $html); unless (defined $programmes) { warning( "Failed to parse html for $channel on $date." ); return; } foreach my $prog (@$programmes) { $w->write_programme( $prog ); } } ########################################## # # yle.fi # ########################################## # Could also use http://www.yle.fi/ohjelmaopas/data/stftoday.htm for # more details. Se also # http://www.yle.fi/tv1/myle/ohjelmakartta/frame_bottom_programs.php?focus_day=0 # Return the URL of the page for the date. sub get_url_by_date_yle { my( $channel, $date ) = @_; if ($channel eq "tvfinland.yle.fi") { return "http://www.yle.fi/ohjelmaopas/index.php?&co[]=tv1&co[]=tv2&co[]=mtv&co[]=nel&co[]=tvf&l=s&span=day&span=day&date=" . $date; } return undef; } # Download and return HTML. sub get_html_yle { my( $url ) = @_; return ( URI->new( $url ), decode( "windows-1252", get_nice( $url ) ) ); } # The programme table parser. sub get_programmes_yle { my( $channel, $date, $base, $html ) = @_; my $doc = parse_xml( $html ); if( not defined( $doc ) ) { return undef; } my $channelwebname; if ($channel eq "tvfinland.yle.fi") { $channelwebname = "TV Finland (CET)"; } my $ns = $doc->find("//b[text()='".$channelwebname."']/parent::td/parent::tr/parent::table/tr/td/table/tr/td/node()"); if( $ns->size() == 0 ) { return undef; } my @programmes = (); my $starttime = ""; my $title = ""; my $progurl = ""; my $progdate = $date; my $nextdate = $progdate; foreach my $node ($ns->get_nodelist) { my $nstr = $node->textContent; my $nodename = $node->nodeName; if ($nstr =~ /^([0-9][0-9]).([0-9][0-9])$/) { my $nextstart = "$1$2"; if (($nextstart cmp $starttime) == -1) { $nextdate = increase_date($nextdate, 1); } if ($title =~ /\S/) { # Ok, we have a programme. # TV Finland is CET, so LOCAL_TZ whould be ok, # but for the others it's not. # "sv" is usually correct for TVF. my %prog = ( channel => $channel, start => utc_offset( "$progdate$starttime", LOCAL_TZ ), stop => utc_offset( "$nextdate$nextstart", LOCAL_TZ ), title => [ [$title, LANG ] ], ); $prog{url} = [URI->new($progurl)->abs($base)] if $progurl =~ /\S/; push @programmes, \%prog; } $starttime = $nextstart; $progdate = $nextdate; $title = ""; $progurl = ""; } else { $title = $nstr; $title =~ s/^\s*//; if ($nodename eq "a") { my $attributelist = $node->attributes; my $href = $attributelist->getNamedItem("href"); if (defined $href) { $progurl = $href->textContent; } } } } # Skip the last programme. Sorry. Should probably check for early # programs the next day. return \@programmes; } # Find the right page, parse the table on the page and output the # result. sub get_data_yle { my( $w, $channel, $date ) = @_; my $url = get_url_by_date_yle($channel, $date); my $html = get_html_yle($url); unless (defined $html) { warning( "Failed to fetch html for $channel on $date." ); return; } my $programmes = get_programmes_yle($channel, $date, $url, $html); unless (defined $programmes) { warning( "Failed to parse html for $channel on $date." ); return; } foreach my $prog (@$programmes) { $w->write_programme( $prog ); } } ########################################## # # viasat # ########################################## sub get_html_viasat { my( $site, $date, $url_addon) = @_; my( $year, $month, $day ) = ($date =~ /(\d{4})(\d{2})(\d{2})/); my $url = "http://$site/index.phtml?page_type=tvchart$url_addon\&" . "start_year=$year\&start_month=$month\&start_day=$day"; return ( URI->new( $url ), decode( "ISO-8859-1", get_nice( $url ) ) ); } sub get_data_viasat { my( $w, $channel, $date, $site, $language, $url_addon ) = @_; $url_addon = "" unless defined $url_addon; my( $base, $html ) = get_html_viasat( $site, $date, $url_addon ); if( not defined( $html ) ) { warning( "Failed to fetch html for $channel on $date." ); return; } my $doc = parse_xml( $html ); if( not defined( $doc ) ) { warning( "Failed to parse html for $channel on $date." ); return; } my $ns = $doc->find( '//table/tr[@class="bgcolorBoxGeneral" or @class="bgcolorBoxGeneral2"]' ); if( $ns->size() == 0 ) { warning "No data available for $channel on $date."; return; } my @programmes = (); my $lasttime = "0000"; foreach my $node ($ns->get_nodelist) { my $starttime = $node->findvalue( "td[1]" ); $starttime =~ tr/\n\r //d; my $title = $node->findvalue( "td[2]/a" ); # Fallback in case there is no link for this programme. $title = $node->findvalue( "td[2]" ) if ($title =~ /^\s*$/); my $url = $node->findvalue( '(td[2]/a[@class="show"])[1]/@href' ); $starttime =~ tr/\://d; if( $starttime < $lasttime ) { $date = increase_date( $date, 1 ); } if( $title =~ /^\s*SLUT\s*$/ ) { # This entry signals that there is nothing on TV. # Use the starttime of this entry as the end-time of the # previous entry. $programmes[-1]->{stop} = utc_offset( "$date$starttime", LOCAL_TZ ); } else { my %prog = ( channel => $channel, start => utc_offset( "$date$starttime", LOCAL_TZ ), title => [ [norm($title), $language ] ], ); $prog{url} = [URI->new($url)->abs($base)] if $url =~ /\S/; push @programmes, \%prog; } $lasttime = $starttime; } foreach my $prog (@programmes) { $w->write_programme( $prog ); } } ########################################## # # viasat TV1000 # ########################################## sub utc_offset_shift( $$$ ) { my ($indate, $basetz, $shift) = @_; my $d = date_to_local(parse_local_date($indate, $basetz), $basetz); my $d_shifted = DateCalc($d->[0], "+ $shift hour", my $err); return UnixDate($d_shifted,"%Y%m%d%H%M%S") . " " . $d->[1]; } sub get_data_viasat_tv1000 { my( $w, $channel, $date, $site, $language, $td_id, $timeshift ) = @_; my( $base, $html ) = get_html_viasat( $site, $date, "" ); if( not defined( $html ) ) { warning( "Failed to fetch html for $channel on $date." ); return; } my $doc = parse_xml( $html ); if( not defined( $doc ) ) { warning( "Failed to parse html for $channel on $date." ); return; } my $ns = $doc->find("//td[$td_id]/table/tr/td/div[\@class='txtBlue']" ); if( $ns->size() == 0 ) { warning "No data available for $channel on $date."; return; } my $lasttime = "0000"; foreach my $node ($ns->get_nodelist) { my $starttime = $node->findvalue( "." ); $starttime =~ tr/\n\r //d; my $title = $node->findvalue( "../a" ); # Fallback in case there is no link for this programme. $title = $node->findvalue( ".." ) if ($title =~ /^\s*$/); my $description = $node->findvalue( ".." ); my @array = split(/\n/,$description); $description = $array[$#array]; my $url = $node->findvalue( '(../a)[1]/@href' ); $starttime =~ tr/\://d; if( $starttime < $lasttime ) { $date = increase_date( $date, 1 ); } my %d = ( channel => $channel, start => utc_offset_shift( "$date$starttime", LOCAL_TZ, $timeshift ), title => [ [ norm($title), $language ] ], ); $d{desc} = [ [ norm($description), $language ] ] if $description =~ /\S/; $d{url} = [URI->new($url)->abs($base)] if $url =~ /\S/; $w->write_programme( \%d ); $lasttime = $starttime; } } ### Setup indentation in Emacs ## Local Variables: ## perl-indent-level: 4 ## perl-continued-statement-offset: 4 ## perl-continued-brace-offset: 0 ## perl-brace-offset: -4 ## perl-brace-imaginary-offset: 0 ## perl-label-offset: -2 ## cperl-indent-level: 2 ## cperl-brace-offset: 0 ## cperl-continued-brace-offset: 0 ## cperl-label-offset: -2 ## cperl-extra-newline-before-brace: t ## cperl-merge-trailing-else: nil ## cperl-continued-statement-offset: 2 ## End: