#!/usr/bin/env python2.4 # # Copyright (c) 2007 Media Standards Trust # Licensed under the Affero General Public License # (http://www.affero.org/oagpl.html) # # Scraper for BBC News site # # TODO: # make use of: # http://news.bbc.co.uk/rss/feeds.opml # (~1600 feeds, multiple languages) # # NOTE: we scrape the low-graphics version of the page - much easier. # some pages give 404 errors for their low-graphics counterpart... # I _think_ these are video pages (only text is a small caption) import re from datetime import datetime import sys import urlparse import site site.addsitedir("../pylib") from BeautifulSoup import BeautifulSoup, Comment from JL import ukmedia, ScraperUtils # sources used by FindArticles OLDrssfeeds = { 'News Front Page': 'http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/front_page/rss.xml', 'World': 'http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/world/rss.xml', 'UK': 'http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/uk/rss.xml', 'England': 'http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/rss.xml', 'Northern Ireland': 'http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/northern_ireland/rss.xml', 'Scotland': 'http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/scotland/rss.xml', 'Business': 'http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/business/rss.xml', 'Politics': 'http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/uk_politics/rss.xml', 'Health': 'http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/health/rss.xml', 'Education': 'http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/education/rss.xml', 'Science/Nature': 'http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/sci/tech/rss.xml', 'Technology': 'http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/technology/rss.xml', 'Entertainment': 'http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/entertainment/rss.xml' } rssfeeds = { "BBC News | Also in the news | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/also_in_the_news/rss.xml", "BBC News | Business | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/business/rss.xml", "BBC News | Africa | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/business/africa/rss.xml", "BBC News | Americas | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/business/americas/rss.xml", "BBC News | Asia-Pacific | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/business/asia-pacific/rss.xml", "BBC News | Business | Companies | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/business/companies/rss.xml", "BBC News | Business | Economy | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/business/economy/rss.xml", "BBC News | Europe | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/business/europe/rss.xml", # "BBC News | Business | Market Data | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/business/market_data/rss.xml", "BBC News | Middle East | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/business/middle_east/rss.xml", "BBC News | South Asia | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/business/south_asia/rss.xml", "BBC News | Business | Your Money | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/business/your_money/rss.xml", "BBC News | Education | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/education/rss.xml", "BBC News | Education | League Tables | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/education/league_tables/rss.xml", "BBC News | England | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/rss.xml", "BBC News | England | Beds/Bucks/Herts | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/beds/bucks/herts/rss.xml", "BBC News | England | Berkshire | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/berkshire/rss.xml", "BBC News | England | Bradford | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/bradford/rss.xml", "BBC News | England | Bristol | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/bristol/rss.xml", "BBC News | England | Cambridgeshire | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/cambridgeshire/rss.xml", "BBC News | England | Cornwall | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/cornwall/rss.xml", "BBC News | England | Coventry/Warwickshire | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/coventry_warwickshire/rss.xml", "BBC News | England | Cumbria | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/cumbria/rss.xml", "BBC News | England | Derbyshire | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/derbyshire/rss.xml", "BBC News | England | Devon | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/devon/rss.xml", "BBC News | England | Dorset | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/dorset/rss.xml", "BBC News | England | Essex | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/essex/rss.xml", "BBC News | England | Gloucestershire | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/gloucestershire/rss.xml", "BBC News | England | Hampshire | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/hampshire/rss.xml", "BBC News | England | Hereford/Worcs | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/hereford/worcs/rss.xml", "BBC News | England | Humber | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/humber/rss.xml", "BBC News | England | Kent | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/kent/rss.xml", "BBC News | England | Lancashire | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/lancashire/rss.xml", "BBC News | England | Leicestershire | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/leicestershire/rss.xml", "BBC News | England | Lincolnshire | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/lincolnshire/rss.xml", "BBC News | England | London | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/london/rss.xml", "BBC News | England | Manchester | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/manchester/rss.xml", "BBC News | England | Merseyside | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/merseyside/rss.xml", "BBC News | England | Norfolk | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/norfolk/rss.xml", "BBC News | England | Northamptonshire | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/northamptonshire/rss.xml", "BBC News | England | North Yorkshire | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/north_yorkshire/rss.xml", "BBC News | England | Nottinghamshire | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/nottinghamshire/rss.xml", "BBC News | England | Oxfordshire | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/oxfordshire/rss.xml", "BBC News | England | Shropshire | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/shropshire/rss.xml", "BBC News | England | Somerset | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/somerset/rss.xml", "BBC News | England | Southern Counties | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/southern_counties/rss.xml", "BBC News | England | South Yorkshire | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/south_yorkshire/rss.xml", "BBC News | England | Staffordshire | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/staffordshire/rss.xml", "BBC News | England | Suffolk | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/suffolk/rss.xml", "BBC News | England | Surrey | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/surrey/rss.xml", "BBC News | England | Sussex | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/sussex/rss.xml", "BBC News | England | Tees | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/tees/rss.xml", "BBC News | Travel | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/travel/rss.xml", "BBC News | England | Tyne | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/tyne/rss.xml", "BBC News | England | Wear | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/wear/rss.xml", "BBC News | England | West Midlands | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/west_midlands/rss.xml", "BBC News | England | West Midlands | Black country | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/west_midlands/black_country/rss.xml", "BBC News | England | West Yorkshire | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/west_yorkshire/rss.xml", "BBC News | England | Wiltshire | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/wiltshire/rss.xml", "BBC News | Entertainment | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/entertainment/rss.xml", "BBC News | News Front Page | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/front_page/rss.xml", "BBC News | Health | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/health/rss.xml", # "BBC News | Health | Medical notes | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/health/medical_notes/rss.xml", "BBC News | Special Reports | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/in_depth/rss.xml", "BBC News | Latest Published Stories | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/latest_published_stories/rss.xml", # "BBC News | Most Emailed Stories | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/livestats/most_emailed/rss.xml", # "BBC News | Most Popular Stories | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/livestats/most_read/rss.xml", "BBC News | Magazine | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/magazine/rss.xml", "BBC News | Magazine | A Point of View | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/magazine/views/a_point_of_view/rss.xml", "BBC News | Northern Ireland | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/northern_ireland/rss.xml", "BBC News | Media reports | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/not_in_website/syndication/monitoring/media_reports/rss.xml", "BBC News | Science/Nature | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/sci/tech/rss.xml", "BBC News | Sci/Tech | Climate Change | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/sci/tech/portal/climate_change/rss.xml", "BBC News | Scotland | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/scotland/rss.xml", "BBC News | Scotland | Edinburgh, East and Fife | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/scotland/edinburgh_and_east/rss.xml", "BBC News | Scotland | Glasgow, Lanarkshire and West | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/scotland/glasgow_and_west/rss.xml", "BBC News | Scotland | Highlands and Islands | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/scotland/highlands_and_islands/rss.xml", "BBC News | Scotland | North East/N Isles | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/scotland/north_east/rss.xml", "BBC News | Scotland | Scotland Video and Audio | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/scotland/scotland_video_and_audio/rss.xml", "BBC News | Scotland | South of Scotland | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/scotland/south_of_scotland/rss.xml", "BBC News | Scotland | Tayside and Central | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/scotland/tayside_and_central/rss.xml", "BBC News | Technology | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/technology/rss.xml", "BBC News | UK | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/uk/rss.xml", "BBC News | Politics | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/uk_politics/rss.xml", "BBC News | UK Politics | Northern Ireland politics | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/uk_politics/northern_ireland/rss.xml", "BBC News | UK Politics | Scotland politics | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/uk_politics/scotland/rss.xml", "BBC News | UK Politics | Wales politics | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/uk_politics/wales/rss.xml", "BBC News | Wales | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/wales/rss.xml", "BBC News | Wales | Mid Wales | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/wales/mid/rss.xml", "BBC News | Wales | North East Wales | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/wales/north_east/rss.xml", "BBC News | Wales | North West Wales | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/wales/north_west/rss.xml", "BBC News | Wales | South East Wales | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/wales/south_east/rss.xml", "BBC News | Wales | South West Wales | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/wales/south_west/rss.xml", "BBC News | World | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/world/rss.xml", "BBC News | World | Africa | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/world/africa/rss.xml", # "BBC News | World | Africa | Country profiles | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/world/africa/country_profiles/rss.xml", "BBC News | World | Americas | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/world/americas/rss.xml", # "BBC News | World | Americas | Country profiles | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/world/americas/country_profiles/rss.xml", "BBC News | World | Asia-Pacific | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/world/asia-pacific/rss.xml", # "BBC News | World | Asia-Pacific | Country profiles | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/world/asia-pacific/country_profiles/rss.xml", "BBC News | World | Europe | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/world/europe/rss.xml", # "BBC News | World | Europe | Country profiles | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/world/europe/country_profiles/rss.xml", "BBC News | World | Europe | Guernsey | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/world/europe/guernsey/rss.xml", "BBC News | World | Europe | Isle of Man | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/world/europe/isle_of_man/rss.xml", "BBC News | World | Europe | Jersey | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/world/europe/jersey/rss.xml", "BBC News | World | Middle East | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/world/middle_east/rss.xml", # "BBC News | World | Middle East | Country profiles | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/world/middle_east/country_profiles/rss.xml", "BBC News | World | South Asia | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/world/south_asia/rss.xml", # "BBC News | World | South Asia | Country profiles | UK Edition": "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/world/south_asia/country_profiles/rss.xml", # "BBC Sport | 606 | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/606/rss.xml", "BBC Sport | Sport Academy | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/academy/rss.xml", "BBC Sport | Athletics | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/athletics/rss.xml", "BBC Sport | Boxing | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/boxing/rss.xml", "BBC Sport | Cricket | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/cricket/rss.xml", "BBC Sport | Berkshire | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/berkshire/rss.xml", "BBC Sport | Birmingham | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/birmingham/rss.xml", "BBC Sport | Black Country | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/black_country/rss.xml", "BBC Sport | Bristol | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/bristol/rss.xml", "BBC Sport | Cornwall | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/cornwall/rss.xml", "BBC Sport | Coventry and Warwickshire | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/coventry_and_warwickshire/rss.xml", "BBC Sport | Cumbria | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/cumbria/rss.xml", "BBC Sport | Derbyshire | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/derbyshire/rss.xml", "BBC Sport | Devon | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/devon/rss.xml", "BBC Sport | Gloucestershire | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/gloucestershire/rss.xml", "BBC Sport | Hampshire | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/hampshire/rss.xml", "BBC Sport | Hereford and Worcestershire | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/hereford_and_worcestershire/rss.xml", "BBC Sport | Isle of Man | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/isle_of_man/rss.xml", "BBC Sport | Kent | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/kent/rss.xml", "BBC Sport | Leicestershire | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/leicestershire/rss.xml", "BBC Sport | Nottinghamshire | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/nottinghamshire/rss.xml", "BBC Sport | Oxfordshire | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/oxfordshire/rss.xml", "BBC Sport | Shropshire | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/shropshire/rss.xml", "BBC Sport | Somerset | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/somerset/rss.xml", "BBC Sport | Southern Counties | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/southern_counties/rss.xml", "BBC Sport | Stoke and Staffordshire | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/stoke_and_staffordshire/rss.xml", "BBC Sport | Tees | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/tees/rss.xml", "BBC Sport | Tyne | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/tyne/rss.xml", "BBC Sport | Wear | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/wear/rss.xml", "BBC Sport | Wiltshire | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/england/wiltshire/rss.xml", "BBC Sport | Football | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/football/rss.xml", "BBC Sport | Sport Homepage | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/front_page/rss.xml", "BBC Sport | Front page features | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/front_page_features/rss.xml", "BBC Sport | Golf | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/golf/rss.xml", "BBC Sport | Latest Published Stories | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/latest_published_stories/rss.xml", "BBC Sport | Motorsport | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/motorsport/rss.xml", "BBC Sport | Northern Ireland | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/northern_ireland/rss.xml", "BBC Sport | Olympics & Olympic sport | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/olympics/rss.xml", "BBC Sport | Other sport... | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/rss.xml", "BBC Sport | Other Sports | American Football | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/american_football/rss.xml", "BBC Sport | Other Sports | Basketball | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/basketball/rss.xml", "BBC Sport | Other Sports | Bowls | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/bowls/rss.xml", "BBC Sport | Other Sports | Cycling | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/cycling/rss.xml", "BBC Sport | Other Sports | Darts | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/darts/rss.xml", "BBC Sport | Other Sports | Disability sport | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/disability_sport/rss.xml", "BBC Sport | Other Sports | Horse Racing | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/horse_racing/rss.xml", "BBC Sport | Other Sports | Ice Hockey | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/ice_hockey/rss.xml", "BBC Sport | Other Sports | Snooker | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/snooker/rss.xml", "BBC Sport | Other Sports | Squash | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/squash/rss.xml", "BBC Sport | Other Sports | Winter Sports | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/winter_sports/rss.xml", "BBC Sport | Rugby League | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_league/rss.xml", "BBC Sport | Rugby Union | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_union/rss.xml", "BBC Sport | Syndication | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_union/syndication/rss.xml", "BBC Sport | Scotland | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/scotland/rss.xml", "BBC Sport | Tennis | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/tennis/rss.xml", "BBC Sport | Wales | UK Edition": "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/wales/rss.xml", } # example bbc news url: # "http://news.bbc.co.uk/1/hi/world/africa/7268903.stm" idpat = re.compile( '/(\d+)\.stm$' ) def CalcSrcID( url ): """ Extract unique srcid from url. Returns None if this scraper doesn't handle it.""" o = urlparse.urlparse(url) if o[1] != 'news.bbc.co.uk': return None # blogs are all at "blogs.bbc.co.uk" (old?) and # "www.bbc.co.uk/blogs/", but we leave that to blogs.py for now... # also blogs have .html or .shtml extension m = idpat.search( url ) if not m: return None # suppress this article (probably a blog) return 'bbcnews_' + m.group(1) def Extract( html, context ): if '/low/' in context['srcurl']: return Extract_low( html, context ) else: # NOTE: hi-graphics extract version needs work to handle # embedded video - at the moment these pages confuse it and # cause it to fail (and lots of pages have embedded video now) raise Exception( 'poo' ) # return Extract_hi( html, context ) def Extract_low( html, context ): """parse html of a low-graphics page""" art = context page_enc = 'ISO-8859-1' # pubdate pubdate_pat = re.compile( r'' ) m = pubdate_pat.search( html ) art['pubdate'] = ukmedia.ParseDateTime( m.group(1) ) # title headline_pat = re.compile( r'\s*(.*?)', re.DOTALL ) m = headline_pat.search(html) art['title'] = m.group(1).strip().decode( page_enc ) # byline byline = u'' byline_pat = re.compile( r'(.*?)', re.DOTALL ) m = byline_pat.search( html ) if m: byline = m.group(1).decode( page_enc ) # trim off possible leading all-caps cruft (eg "WHO, WHAT, WHY?
") byline = re.sub( r'[^a-z]+\s*', '', byline ) # replace
with a comma to retain a little more context when we strip html tags byline = re.sub( ur'', u',', byline ) byline = ukmedia.FromHTMLOneLine(byline) byline = re.sub( u'\s+,', u',', byline ) byline = re.sub( u',$', u'', byline ) byline = byline.strip() html = byline_pat.sub( '', html ) art['byline'] = byline # images # NOTE: low-graphics version of page has no caption, but alt attr is OKish. art['images'] = [] image_pat = re.compile( r'(.*?)', re.DOTALL ) for im in image_pat.finditer( html ): imtxt = im.group(1) m = re.search( r'src="(.*?)"', imtxt ) img_url = m.group(1) m = re.search( r'alt="(.*?)"', imtxt ) img_caption = unicode( m.group(1), page_enc ) art['images'].append( { 'url': img_url, 'caption': img_caption, 'credit': u'' } ) html = image_pat.sub( '', html ) # main text main_pat = re.compile( r'(?:)+(.*?)', re.DOTALL ) m = main_pat.search(html) art['content'] = m.group(1).decode( page_enc ) art['description'] = ukmedia.FirstPara( art['content'] ) # if description came up blank, maybe it's because it was a gallery page if art['description'] == u'': picpage = False for foo in ( r'\bpictures\b',r'\bphotos\b', r'\bgallery\b' ): pat = re.compile( foo, re.IGNORECASE ) if pat.search( art['title'] ): picpage = True break if picpage: ukmedia.DBUG2( "IGNORE pictures/photos page ( %s )\n" %( art['srcurl'] ) ) return None return art def Extract_hi( html, context ): """Parse the html of a single article (in hi-graphics form) html -- the article html context -- any extra info we have about the article (from the rss feed) """ art = context soup = BeautifulSoup( html ) meta = soup.find( 'meta', { 'name': 'Headline' } ) if meta: art['title'] = ukmedia.DescapeHTML( meta[ 'content' ] ).strip() if soup.find('title').renderContents(None).startswith( "BBC News | In pictures:"): ukmedia.DBUG2( "IGNORE 'in pictures' gallery ( %s )\n" %( art['srcurl'] ) ) return None gal = soup.find( 'div', {'class': 'galMain' } ) if gal: ukmedia.DBUG2( "IGNORE picture gallery '%s' ( %s )\n" %( art['title'], art['srcurl'] ) ) return None meta = soup.find( 'meta', { 'name': 'OriginalPublicationDate' } ) if meta: art['pubdate'] = ukmedia.ParseDateTime( meta['content'] ) # TODO: could use first paragraph for a more verbose description meta = soup.find( 'meta', { 'name': 'Description' } ) if meta and 'content' in meta: art['description'] = ukmedia.FromHTML( meta[ 'content' ] ) else: art['description'] = u'' # byline byline = u'' spanbyl = soup.find( 'span', {'class':'byl'} ) if spanbyl: # eg "By Paul Rincon" byline = spanbyl.renderContents(None).strip() spanbyd = soup.find( 'span', {'class':'byd'} ) if spanbyd: # eg "Science reporter, BBC News, Houston" byline = byline + u', ' + spanbyd.renderContents(None).strip() byline = ukmedia.FromHTML( byline ) byline = u' '.join( byline.split() ) art['byline'] = byline # just use regexes to extract the article text storybody = soup.find( "td", {'class':'storybody'} ) if not storybody: # uh-oh... is it a video page? av = soup.find( 'div', {'class':'wideav'} ) if av: ukmedia.DBUG2( "IGNORE video-only page ( %s )\n" %( art['srcurl'] ) ) return None # if storybody: # txt = storybody.renderContents(None) # else: # txt = unicode( html, soup.originalEncoding ) txt = unicode( html, soup.originalEncoding ) m = re.search( u'(.*)', txt, re.UNICODE|re.DOTALL ) txt = m.group(1) # bbcnews has blocks denoted in comments, eg: # # ...html... # # try to extract images (in IIMA block) # TODO: could also get images from IBOX blocks? art['images'] = [] imgblock_pat = re.compile( r"(.*?)", re.DOTALL ) for iima in imgblock_pat.finditer( html ): imhtml = unicode( iima.group(2), soup.originalEncoding ) m = re.compile( ur'(.*?)', re.IGNORECASE|re.UNICODE|re.DOTALL ).search( imhtml ) if m: im['caption'] = m.group(1) # else could try and pull out img alt attr... art['images'].append( im ) # zap assorted extra blocks from the text # (could be problems with nesting... but seems ok) # IIMA - image? # IBOX - quote? # IBYL - byline # IANC - anchor # ILIN # IFOR - form # IMED - embedded media link blockkillerpat = re.compile( r".*?", re.UNICODE|re.DOTALL ) txt = blockkillerpat.sub( u'', txt ) # ITAB - table # IROW - table row # ICOL - table column # ICEE - preformatted? from cfax? (used for sports fixtures/results) # IINC - included text/image (but from where?) allowedblocks = ('SF','BO','ITAB', 'ICOL', 'IROW', 'ICEE', 'IINC' ) # sanity check (might not know all block types) m = re.search( u'', txt, re.UNICODE ) if m: if m.group(1) not in allowedblocks: raise Exception, ("unknown block type encountered ('%s')" % m.group(1)) txt = ukmedia.SanitiseHTML( txt ) art['content'] = txt if art['description'] == u'': art['description'] = ukmedia.FromHTML( ukmedia.FirstPara( art['content'] ) ) return art def ScrubFunc( context, entry ): """ per-article callback for processing RSS feeds """ # a story can have multiple paths (eg uk vs international version) srcid = CalcSrcID( context['srcurl'] ) if not srcid: return None # suppress it if '/in_pictures/' in context['srcurl']: return None context['srcid'] = srcid # scrape the low-graphics version of the page context['srcurl'] = re.sub( '/hi/', '/low/', context['srcurl'] ) return context def FindArticles(): """ get a set of articles to scrape from the bbc rss feeds """ # TODO: filter out "Your Stories" page return ScraperUtils.FindArticlesFromRSS( rssfeeds, u'bbcnews', ScrubFunc ) def ContextFromURL( url ): """Build up an article scrape context from a bare url.""" # NOTE: urls from the rss feed have a couple of extra components which # we _could_ strip out here... context = {} context['permalink'] = url context['srcurl'] = url # scrape the low-graphics version of the page # NOTE: a few pages give 404 errors for their low-graphics counterpart... # I _think_ these are video pages (only text is a small caption) context['srcurl'] = re.sub( '/hi/', '/low/', context['srcurl'] ) context['srcid'] = CalcSrcID( url ) context['srcorgname'] = u'bbcnews' context['lastseen'] = datetime.now() return context if __name__ == "__main__": # high maxerrors to cope with some video-only pages which give 404 errors if you try to get the low-graphics version ScraperUtils.RunMain( FindArticles, ContextFromURL, Extract, maxerrors=50 )