#!/usr/bin/env perl
use strict;
local $^W=1;

use LWP::Simple;
use HTML::TokeParser;
use XML::RSS;

# Butchered version of Chris Ball's RSS screenscraper tutorial
# at http://www.perl.com/pub/a/2001/11/15/creatingrss.html

# Download the page to scrape. LWP::Simple::get signals failure only by
# returning undef; $! is not documented to carry the cause, so the error
# message names the URL instead of printing errno.

my $page_url = "http://news.bbc.co.uk/2/hi.html";
my $content = get ($page_url);
unless (defined $content and length $content) {
	die "Could not fetch any content from $page_url";
}

# Make a tokeparser from the html. A scalar reference makes the parser
# read from the in-memory document rather than from a file.

my $stream = HTML::TokeParser->new( \$content )
	or die "Could not create an HTML::TokeParser for $page_url";

# Create the RSS object (RSS version 0.9).

my $rss = XML::RSS->new( version=> '0.9' );

# Prepare the RSS channel metadata.

$rss->channel(
	title		=> "news.bbc.co.uk",
	link		=> "http://news.bbc.co.uk/",
	description	=> "news.bbc.co.uk -- BBC world news",
);

my ($tag,$headline,$url);

# Walk every <div> in the page. A story is introduced by a div with
# class="bodytext", which is expected to contain, in order:
#   1. an <a></a> pair that just hyperlinks the story image,
#   2. the <a> whose href is the story link,
#   3. a <span class="h1"> or <span class="h2"> holding the headline text.
# Stories that do not match this layout are skipped rather than added
# with a missing link or a stale headline.

while( $tag = $stream->get_tag("div") ) {
	# Only class="bodytext" divs introduce a story.

	next unless defined $tag->[1]{class} and $tag->[1]{class} eq 'bodytext';

	# Skip the <a></a> set that just hyperlinks the image, then fetch the
	# <a> tag carrying the story link. get_tag returns undef once the
	# document is exhausted, in which case there is nothing left to scan.

	$stream->get_tag('a') or last;
	$tag = $stream->get_tag('a') or last;

	# Put the contents of the href attribute in $url; a link-less anchor
	# cannot produce a usable item.

	$url = $tag->[1]{href};
	next unless defined $url and length $url;

	print "URL is $url\n";

	# The headline is the text of the following <span class="h1|h2">,
	# grabbed with get_trimmed_text up to the closing </span>.
	# Reset $headline first so a non-matching span cannot re-use the
	# previous story's headline.

	$headline = undef;
	$tag = $stream->get_tag('span') or last;
	if (defined $tag->[1]{class} and $tag->[1]{class} =~ /^h[12]$/) {
		$headline = $stream->get_trimmed_text('/span') ;
	}
	next unless defined $headline and length $headline;

	# Escape ampersands.
	# NOTE(review): XML::RSS may itself encode entities on output (see its
	# encode_output option) -- confirm this does not double-encode.

	$url =~ s/&/&amp;/g;

	# Change relative URLs to absolute. Links that already carry a scheme
	# are left alone; protocol-relative links only need the scheme; anything
	# else is treated as relative to the site root.

	if ($url =~ m{^[a-z][a-z0-9+.-]*:}i) {
		# already absolute
	}
	elsif ($url =~ m{^//}) {
		$url = 'http:'.$url;
	}
	elsif ($url =~ m{^/}) {
		$url = 'http://news.bbc.co.uk'.$url;
	}
	else {
		$url = 'http://news.bbc.co.uk/'.$url;
	}

	# Add to RSS channel.

	$rss->add_item( title=>$headline, link=>$url);

}

# Write the finished feed to disk.

$rss->save("bbcnews.rss");

