#!/usr/bin/env python2.4
#
# Scraper for The Scotsman and Scotland on Sunday
#
# Copyright (c) 2007 Media Standards Trust
# Licensed under the Affero General Public License
# (http://www.affero.org/oagpl.html)
#
# NOTES:
# Same article urls work on both thescotsman.scotsman.com and
# scotlandonsunday.scotsman.com.
#

import sys
import re
from datetime import datetime
import sys
import urlparse

import site
site.addsitedir("../pylib")
from BeautifulSoup import BeautifulSoup,BeautifulStoneSoup,Comment
from JL import ukmedia, ScraperUtils

# RSS feeds for The Scotsman: section name -> feed url.
#
# Several sections share one display name. A dict literal keeps only the
# last value given for a repeated key, so the earlier ones carry a
# "(section NNNN)" suffix to stop them being silently dropped. The
# unsuffixed name still maps to the url it has always resolved to.
scotsman_rssfeeds = {
    "Aberdeen - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=6995",
    "Arts - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7096",
    "Athletics - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7087",
    "Banking & Insurance - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7079",
    "Banking - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7058",
    "Books - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7006",
    "Boxing - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7098",
    "Breaking News - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7055",
    "BT Cups - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7099",
    "Business - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=6982",
    "Business Top Stories - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=6983",
#    "Cartoon - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7031",
    "Celebrities - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7016",
    "Comedy - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7001",
    "Comment - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7007",
    "Credit Cards - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7105",
    "Cricket - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7032",
    "Critique": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=9817",
    "Culture - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7048",
    "Digital - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7067",
    "Division 1 - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7034",
    "Division 2 - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7023",
    "Division 3 - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7035",
    "Drink - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7065",
    "Dundee - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7030",
    "e-business - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7080",
    "Economics - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7012",
    "Edinburgh - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7029",
    "Education - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=6988",
    "Energy & Utilities - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7014",
    "English - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7026",
    "Entertainment - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7010",
    "Environment - Scotland (section 10193)": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=10193",
    "Environment - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=10336",
    "European Club - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7027",
    "Fashion - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7111",
    "Features - Scotland (section 6996)": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=6996",
    "Features - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7114",
    "Festival - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7100",
    "Film - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7003",
    "Food - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7109",
    "Food, Drink & Agriculture - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7077",
    "Football - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=6991",
    "Formula One - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7020",
    "Gadgets - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7068",
    # don't support other languages yet...
#    "Gaelic - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7004",
    "Games - Scotland (section 6999)": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=6999",
    "Games - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7112",
    "Genealogy - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7070",
    "Glasgow - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7009",
    "Golf - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7072",
    "Great Scots - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7102",
    "Health - Scotland (section 6989)": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=6989",
    "Health - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7066",
    "Heritage - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7045",
    "Historic Sites - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7103",
    "Homes & Gardens - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7069",
    "Horse Racing - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=6984",
    "Industry - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7081",
    "Ingenuity - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7047",
    "Insurance - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7107",
    "Int'l Football - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7093",
    "International - Scotland (section 7000)": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7000",
    "International - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=11293",
    "Inverness - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7085",
    # These are all Press Association articles:
#    "Latest East Anglia News - National": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=10966",
#    "Latest East Midlands News - National": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=10965",
#    "Latest Entertainment Video - National": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=6438",
#    "Latest Irish News - National": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=5909",
#    "Latest London News - National": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=10968",
#    "Latest National News - National": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=4068",
#    "Latest National Sport - National": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=4069",
#    "Latest North East News - National": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=10964",
#    "Latest Scottish News - National": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=5908",
#    "Latest South East News - National": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=10967",
#    "Latest South West News - National": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=5905",
#    "Latest Sport Video - National": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=6437",
#    "Latest UK News Video - National": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=6436",
#    "Latest West Midlands News - National": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=5906",
#    "Latest York and Humberside News - National": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=5907",
    "Leaders - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7076",
    "League Cup - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7092",
#    "Letters - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7008",
    "Life Insurance - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7060",
    "Loans - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7106",
    "Management - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7082",
    "Market Reports - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7086",
    "Media & Leisure - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7078",
    "Mortgages - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7056",
    "Motorbikes - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7021",
    "Motorsport - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=6994",
    "Movies - Scotland (section 7097)": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7097",
    "Movies - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=11291",
    "Music - Scotland (section 7002)": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7002",
    "Music - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7018",
    "My Story - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7115",
    "Myths - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7104",
    "Natural - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7049",
    "Nature - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7041",
    "News - Scotland (section 6985)": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=6985",
    "News - Scotland (section 7024)": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7024",
    "News - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7050",
    "Odd - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7108",
    "Olympics - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7073",
    "Online - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7113",
    "Opinion - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7074",
    "Other sports - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7101",
    "Outdoors - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7017",
    "Pensions - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7059",
    "People - Scotland (section 7046)": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7046",
    "People - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7095",
    "Performing Arts - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7063",
    "Personal Finance - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7075",
    "Politics - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=6990",
    "Premiership 1 - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7088",
    "Premiership 2 - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7036",
    "Premiership 3 - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7089",
    "Previews - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7040",
    "Profiles - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7043",
    "Rallying - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7033",
#    "Reader Offers - National": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=10083",
    "Recipes - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7110",
    "Restaurants - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7064",
    "Retail - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7083",
    "Reviews - Scotland (section 7005)": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7005",
    "Reviews - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7039",
    "Rugby - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=6993",
    "Savings - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7057",
    "Sci-Tech - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=6998",
    "Science - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7091",
    "Scotland - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7042",
    "Scotsman Magazine": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=9819",
    "Scottish Cup - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=6987",
    "Showbiz - National": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=4070",
    "Six Nations/Int'l - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7037",
    "Snooker - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7019",
    "SoS Review - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=9821",
    "Spectrum - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=9820",
    "SPL - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7022",
    "Sport - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=6980",
    "Sport Top Stories - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=6981",
    "Superteams - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7090",
    "Tax - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7061",
    "Technology - Scotland (section 7038)": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7038",
    "Technology - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7084",
    "Tennis - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=6992",
    "Top Stories - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=6986",
    "Traditions - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7051",
    "Transport - Scotland (section 7013)": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7013",
    "Transport - Scotland (section 10186)": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=10186",
    "Transport - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=10337",
    "Travel - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7015",
    "TV & Radio - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7094",
    "UK - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7071",
    "Visual Arts - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7062",
    "World - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=12007",
    "World Cup - Scotland": "http://thescotsman.scotsman.com/getFeed.aspx?Format=rss&sectionid=7028",
}

# RSS feeds for Scotland on Sunday: section name -> feed url.
#
# Repeated section names are disambiguated with a "(section NNNN)"
# suffix on the earlier entries, exactly as in scotsman_rssfeeds, so no
# feed is lost to a duplicate dict key.
#
# NOTE(review): unlike scotsman_rssfeeds, "Cartoon - Scotland" and
# "Gaelic - Scotland" are active in this table - confirm that is intended.
scotlandonsunday_rssfeeds = {
    "Aberdeen - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=6995",
    "Arts - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7096",
    "Athletics - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7087",
    "Banking & Insurance - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7079",
    "Banking - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7058",
    "Books - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7006",
    "Boxing - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7098",
    "Breaking News - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7055",
    "BT Cups - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7099",
    "Business - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=6982",
    "Business Top Stories - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=6983",
    "Cartoon - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7031",
    "Celebrities - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7016",
    "Comedy - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7001",
    "Comment - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7007",
    "Credit Cards - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7105",
    "Cricket - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7032",
    "Critique - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=9817",
    "Culture - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7048",
    "Digital - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7067",
    "Division 1 - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7034",
    "Division 2 - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7023",
    "Division 3 - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7035",
    "Drink - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7065",
    "Dundee - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7030",
    "e-business - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7080",
    "Economics - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7012",
    "Edinburgh - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7029",
    "Editor's Choice": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=11162",
    "Education - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=6988",
    "Energy & Utilities - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7014",
    "English - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7026",
    "Entertainment - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7010",
    "Environment - Scotland (section 10193)": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=10193",
    "Environment - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=10336",
    "European Club - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7027",
    "Fashion - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7111",
    "Features - Scotland (section 6996)": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=6996",
    "Features - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7114",
    "Festival - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7100",
    "Film - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7003",
    "Food - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7109",
    "Food, Drink & Agriculture - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7077",
    "Football - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=6991",
    "Formula One - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7020",
    "Gadgets - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7068",
    "Gaelic - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7004",
    "Games - Scotland (section 6999)": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=6999",
    "Games - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7112",
    "Genealogy - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7070",
    "Glasgow - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7009",
    "Golf - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7072",
    "Great Scots - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7102",
    "Health - Scotland (section 6989)": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=6989",
    "Health - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7066",
    "Heritage - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7045",
    "Historic Sites - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7103",
    "Homes & Gardens - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7069",
    "Horse Racing - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=6984",
    "Industry - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7081",
    "Ingenuity - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7047",
    "Insurance - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7107",
    "Int'l Football - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7093",
    "International - Scotland (section 7000)": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7000",
    "International - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=11293",
    "Inverness - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7085",
    # These are all Press Association articles:
#    "Latest East Anglia News - National": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=10966",
#    "Latest East Midlands News - National": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=10965",
#    "Latest Entertainment Video - National": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=6438",
#    "Latest Irish News - National": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=5909",
#    "Latest London News - National": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=10968",
#    "Latest National News - National": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=4068",
#    "Latest National Sport - National": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=4069",
#    "Latest North East News - National": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=10964",
#    "Latest Scottish News - National": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=5908",
#    "Latest South East News - National": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=10967",
#    "Latest South West News - National": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=5905",
#    "Latest Sport Video - National": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=6437",
#    "Latest UK News Video - National": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=6436",
#    "Latest West Midlands News - National": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=5906",
#    "Latest York and Humberside News - National": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=5907",
    "Leaders - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7076",
    "League Cup - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7092",
#    "Letters - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7008",
    "Life Insurance - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7060",
    "Loans - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7106",
    "Management - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7082",
    "Market Reports - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7086",
    "Media & Leisure - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7078",
    "Mortgages - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7056",
    "Motorbikes - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7021",
    "Motorsport - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=6994",
    "Movies - Scotland (section 7097)": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7097",
    "Movies - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=11291",
    "Music - Scotland (section 7002)": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7002",
    "Music - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7018",
    "My Story - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7115",
    "Myths - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7104",
    "Natural - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7049",
    "Nature - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7041",
    "News - Scotland (section 6985)": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=6985",
    "News - Scotland (section 7024)": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7024",
    "News - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7050",
    "Odd - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7108",
    "Olympics - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7073",
    "Online - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7113",
    "Opinion - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7074",
    "Other sports - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7101",
    "Outdoors - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7017",
    "Pensions - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7059",
    "People - Scotland (section 7046)": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7046",
    "People - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7095",
    "Performing Arts - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7063",
    "Personal Finance - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7075",
    "Politics - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=6990",
    "Premiership 1 - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7088",
    "Premiership 2 - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7036",
    "Premiership 3 - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7089",
    "Previews - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7040",
    "Profiles - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7043",
    "Rallying - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7033",
#    "Reader Offers - National": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=10083",
    "Recipes - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7110",
    "Restaurants - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7064",
    "Retail - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7083",
    "Reviews - Scotland (section 7005)": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7005",
    "Reviews - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7039",
    "Rugby - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=6993",
    "Savings - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7057",
    "Sci-Tech - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=6998",
    "Science - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7091",
    "Scotland - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7042",
    "Scotsman Magazine - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=9819",
    "Scottish Cup - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=6987",
    "Showbiz - National": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=4070",
    "Six Nations/Int'l - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7037",
    "Snooker - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7019",
    "SoS Review": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=9821",
    "Spectrum": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=9820",
    "SPL - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7022",
    "Sport - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=6980",
    "Sport Top Stories - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=6981",
    "Superteams - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7090",
    "Tax - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7061",
    "Technology - Scotland (section 7038)": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7038",
    "Technology - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7084",
    "Tennis - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=6992",
    "Top Stories - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=6986",
    "Traditions - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7051",
    "Transport - Scotland (section 7013)": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7013",
    "Transport - Scotland (section 10186)": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=10186",
    "Transport - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=10337",
    "Travel - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7015",
    "TV & Radio - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7094",
    "UK - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7071",
    "Visual Arts - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7062",
    "World - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=12007",
    "World Cup - Scotland": "http://scotlandonsunday.scotsman.com/getFeed.aspx?Format=rss&sectionid=7028",
}


def Extract( html, context ):
    art = context
    # for some reason BeautifulSoup gets the encoding as ISO-8859-1...
# but we know it's utf-8 soup = BeautifulSoup( html, fromEncoding='utf-8' ) # check for and ignore broken pages artdiv = soup.find( 'div', {'id':'viewarticle'} ) if not artdiv and html.find( "The article has been unable to display.") != -1: ukmedia.DBUG2( "IGNORE article ('unable to display') (%s)\n" % ( art['srcurl']) ); return None # pull out source publication # eg"