#!/usr/bin/perl
#
# CGI script that scrapes a RepubblicaTV category listing page and
# re-publishes it as an RSS 2.0 feed with Media RSS (thumbnail + video)
# elements. The generated feed is cached on disk for 60 minutes, keyed by
# the listing URL.
#
# CGI parameters:
#   cat  - numeric category id (default 15, see table below)
#   page - listing page number (default 1)
#
# With $DEBUG set, category and page are taken from @ARGV instead and
# progress information is printed along with the feed.

use warnings;
use strict;    # Don't allow sloppy syntax

use CGI;
use LWP::Simple;
use XML::RSS;
use CHI;

my $DEBUG   = 0;
my $BASEURL = 'http://tv.repubblica.it/';
my $LISTURL = 'php/crontab/ssi_list_scelti_per_voi.php?cat_id=';

# Italian month names (lower case) mapped to month numbers; used by
# dateConvert().
my %months = (
    "gennaio"   => 1,
    "febbraio"  => 2,
    "marzo"     => 3,
    "aprile"    => 4,
    "maggio"    => 5,
    "giugno"    => 6,
    "luglio"    => 7,
    "agosto"    => 8,
    "settembre" => 9,
    "ottobre"   => 10,
    "novembre"  => 11,
    "dicembre"  => 12,
);

my $cache = CHI->new( driver => 'File', root_dir => '/home/mala/reprssCache' );

# What did the user ask for?
my $params = CGI->new;

# note: categories are
#   Copertina          15
#   Politica           36
#   Cronaca            17
#   Mondo              18
#   Sport              19
#   Spett&Cultura      20
#   Tecno&Scienze      21
#   Politica/Dossier   16
my $category = $params->param('cat')  || 15;
my $pagenum  = $params->param('page') || 1;
$category = $ARGV[0] || 15 if $DEBUG;
$pagenum  = $ARGV[1] || 1  if $DEBUG;

# Both values come from the request and end up in the fetched URL and in the
# cache key, so accept plain digits only and fall back to the defaults
# otherwise.
$category = 15 unless $category =~ /\A\d+\z/;
$pagenum  = 1  unless $pagenum  =~ /\A\d+\z/;

print "Content-type: text/xml\n\n";

die "Usage: perl repubblica.pl <category> <page>\n\n" unless $category;

my $URL = $BASEURL . $LISTURL . $category . "&page=$pagenum";

my $rss = $cache->get($URL);
if ( defined $rss ) {
    # if the RSS feed has already been cached just print it
    print $rss;
    exit;
}

# create base contents for RSS feed
$rss = XML::RSS->new( version => '2.0' );
$rss->add_module( prefix => 'media', uri => 'http://search.yahoo.com/mrss/' );
$rss->add_module( prefix => 'dc',    uri => 'http://purl.org/dc/elements/1.1/' );
$rss->channel(
    title       => "RepubblicaTV",
    link        => "http://tv.repubblica.it",
    description => "An RSS feed for RepubblicaTV videos",
    generator   => "+mala's perl script",
    dc          => {
        date     => '2011-10-15T00:00+00:00',
        subject  => "News",
        creator  => 'malattia@gmx.net',
        language => 'it',
    }
);

print "Connecting to $URL\n" if $DEBUG;
my $page = get($URL) or die "Can't connect to $URL\n\n";

# Every <div class="mediaItem"> in the listing is one candidate feed item.
while ( $page =~ /<div class="mediaItem">(.*?)<\/div>/gsi ) {
    my %newsItem;
    getBaseInfo( \%newsItem, $1 );
    next unless isValidUrl( $newsItem{'URL'} );

    # getDetailedInfo connects to $newsItem{'URL'} to get the video URL and a
    # description
    getDetailedInfo( \%newsItem );
    next unless $newsItem{'videoURL'};    # only show if you got the video URL right

    dumpNewsItem( \%newsItem ) if $DEBUG;
    addNewsItem( \%newsItem, $rss );
}

# Serialise once; the same string is cached and sent to the client.
my $feed = $rss->as_string;
$cache->set( $URL, $feed, "60 minutes" );
print $feed;
exit 0;

# getBaseInfo(\%item, $snippet)
# Extracts the link, title and thumbnail from one mediaItem HTML snippet and
# stores them in the item hash as 'URL' (prefixed with $BASEURL), 'Title' and
# 'Thumb'. The hash is left untouched when the snippet does not match.
sub getBaseInfo {
    my ( $newsRef, $snippet ) = @_;
    if ( $snippet =~ /a href="([^"]+)".*?title="([^"]+)".*?src="([^"]+)"/si ) {
        $$newsRef{'URL'}   = $BASEURL . $1;
        $$newsRef{'Title'} = $2;
        $$newsRef{'Thumb'} = $3;
    }
    return;
}

# getDetailedInfo(\%item)
# Downloads the page at $item{'URL'} and fills in 'videoURL', 'description'
# and 'date' for whichever of them can be found. When the download fails the
# hash is left unchanged, so the caller's check on 'videoURL' skips the item.
sub getDetailedInfo {
    my ($newsRef) = @_;

    my $content = get( $$newsRef{'URL'} );
    return unless defined $content;

    # get video URL
    if ( $content =~ /'pcUrl',\s*'([^']+)'/ ) {
        $$newsRef{'videoURL'} = $1;
    }

    # get description
    if ( $content =~ /<div id="vi_abstract">\s*(.*)\s*<\/div>/ ) {
        $$newsRef{'description'} = $1;

        # Strip markup. The class excludes '>' so that a tag ends at its own
        # closing bracket and text after it is never swallowed.
        $$newsRef{'description'} =~ s/<[^>]+>//gs;
    }

    # get date
    if ( $content =~ /<p class="date-player">\(([^<]+)\)</ ) {
        $$newsRef{'date'} = dateConvert($1);
    }
    return;
}

# dumpNewsItem(\%item)
# Debug helper: prints every field of the item to STDOUT. Fields that were
# not found are shown as empty strings.
sub dumpNewsItem {
    my ($newsRef) = @_;
    my @fields = (
        [ 'URL  ', 'URL' ],
        [ 'Title', 'Title' ],
        [ 'Thumb', 'Thumb' ],
        [ 'Video', 'videoURL' ],
        [ 'Descr', 'description' ],
        [ 'Date ', 'date' ],
    );

    print "--------------------------------------------\n";
    for my $field (@fields) {
        my ( $label, $key ) = @$field;
        my $value = defined $$newsRef{$key} ? $$newsRef{$key} : '';
        print "$label: $value\n";
    }
    print "\n";
    return;
}

# isValidUrl($url)
# Returns 1 when $url is usable, 0 otherwise. getBaseInfo() always prefixes
# $BASEURL, so an href that was already absolute ends up with a second scheme
# embedded after a slash; those are rejected, as are empty/undefined URLs.
sub isValidUrl {
    my ($url) = @_;
    return 0 unless $url;
    return 0 if $url =~ m{/https?://};
    return 1;
}

# dateConvert($oldDate)
# Converts an Italian "day month year" string (month name looked up in
# %months, case-insensitively) into "YYYY-MM-DDT00:00+00:00".
# Returns nothing (undef in scalar context) when the string cannot be parsed.
sub dateConvert {
    my ($oldDate) = @_;
    return unless defined $oldDate;

    my ( $day, $month, $year ) = split ' ', $oldDate;
    return unless defined $year;
    return unless $day =~ /\A\d+\z/ && $year =~ /\A\d+\z/;

    my $monthNum = $months{ lc $month };
    return unless $monthNum;

    return sprintf '%s-%02d-%02dT00:00+00:00', $year, $monthNum, $day;
}

# addNewsItem(\%item, $rss)
# Appends the item to the XML::RSS feed object, including Media RSS thumbnail
# and content elements. pubDate is only emitted when a date was parsed.
sub addNewsItem {
    my ( $newsRef, $rss ) = @_;

    my %item = (
        title       => $$newsRef{'Title'},
        link        => $$newsRef{'URL'},
        description => $$newsRef{'description'},
        media       => {
            thumbnail => {
                url    => $$newsRef{'Thumb'},
                height => "100",
                width  => "100",
            },
            content => {
                url  => $$newsRef{'videoURL'},
                type => "video/mp4",
            },
        },
    );

    # NOTE(review): RSS 2.0 expects RFC 822 dates in pubDate; the ISO 8601
    # value produced by dateConvert() is kept so existing consumers see the
    # same format as before - confirm whether it should be converted.
    $item{pubDate} = $$newsRef{'date'} if defined $$newsRef{'date'};

    $rss->add_item(%item);
    return;
}