#!/usr/bin/env python2.4
#
# Copyright (c) 2007 Media Standards Trust
# Licensed under the Affero General Public License
# (http://www.affero.org/oagpl.html)
#
# Scraper for assorted news blog sites
#
# TODO:
#
import getopt
import re
from datetime import datetime
import sys
import time
import os
#print sys.argv
import site
# Put ../pylib on sys.path before the imports below. The path is relative, so
# it is resolved against the current working directory at startup.
# NOTE(review): presumably ../pylib is where BeautifulSoup and JL live -- confirm.
site.addsitedir("../pylib")
from BeautifulSoup import BeautifulSoup, Comment
from JL import ArticleDB,ukmedia,ScraperUtils
#10 bbcnews BBC News
#11 observer The Observer
#12 sundaymirror The Sunday Mirror
#13 sundaytelegraph The Sunday Telegraph
#3 express The Daily Express
#1 independent The Independent
#2 dailymail The Daily Mail
#4 guardian The Guardian
#5 mirror The Mirror
#6 sun The Sun
#8 times The Times
#9 sundaytimes The Sunday Times
#7 telegraph The Daily Telegraph
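# Sources used by FindArticles, grouped by outlet short name. Each group has
# 'rssfeeds' (feed label -> RSS feed URL) and 'regexp' (a list of patterns).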
rssfeedGroups = {
u'bbcnews':
{
'rssfeeds':
{
u'(The Editors- split out by name)': 'http://www.bbc.co.uk/blogs/theeditors/rss.xml', # 'http://www.bbc.co.uk/blogs/theeditors/',
u'Evan Davies': 'http://www.bbc.co.uk/blogs/thereporters/evandavis/rss.xml', # 'http://www.bbc.co.uk/blogs/thereporters/evandavis/',
u'(Five Live Breakfast-split out by name)': 'http://www.bbc.co.uk/blogs/fivelivebreakfast/index.xml', # 'http://www.bbc.co.uk/blogs/fivelivebreakfast/',
u'Mark Mardell': 'http://www.bbc.co.uk/blogs/thereporters/markmardell/rss.xml', # 'http://www.bbc.co.uk/blogs/thereporters/markmardell/',
u'Mihir Bose': 'http://www.bbc.co.uk/blogs/thereporters/mihirbose/rss.xml', # 'http://www.bbc.co.uk/blogs/thereporters/mihirbose/',
u'Nick Robinson': 'http://blogs.bbc.co.uk/nickrobinson/rss.xml', # 'http://www.bbc.co.uk/blogs/nickrobinson/',
u'Mark Devenport': 'http://www.bbc.co.uk/blogs/thereporters/markdevenport/rss.xml',# 'http://www.bbc.co.uk/blogs/thereporters/markdevenport/',
u'Robert Peston': 'http://www.bbc.co.uk/blogs/thereporters/robertpeston/rss.xml', # 'http://www.bbc.co.uk/blogs/thereporters/robertpeston/',
u'(PM Blog[Eddie Mair] et al)': 'http://www.bbc.co.uk/blogs/pm/index.xml', # 'http://www.bbc.co.uk/blogs/pm/',
u'Martin Rosenbaum': 'http://www.bbc.co.uk/blogs/opensecrets/rss.xml', # 'http://www.bbc.co.uk/blogs/opensecrets/',
u'Brian Taylor': 'http://www.bbc.co.uk/blogs/thereporters/briantaylor/rss.xml', # 'http://www.bbc.co.uk/blogs/thereporters/briantaylor/',
u'(Sports editors blogs[Roger Mosey] et al)':'http://www.bbc.co.uk/blogs/sporteditors/index.xml', # 'http://www.bbc.co.uk/blogs/sporteditors/',
u'(Newsnight blog[Peter Barron] et al)': 'http://www.bbc.co.uk/blogs/newsnight/index.xml', # 'http://www.bbc.co.uk/blogs/newsnight/',
u'Betsan Powys': 'http://www.bbc.co.uk/blogs/thereporters/betsanpowys/rss.xml', # 'http://www.bbc.co.uk/blogs/thereporters/betsanpowys/',
u'(World Have Your Say[Ros Atkins] et al)': 'http://blogs.bbc.co.uk/worldhaveyoursay/index.xml' # 'http://www.bbc.co.uk/blogs/worldhaveyoursay/'
},
'regexp':
[
# BBC News blogs pattern:
u'''
)
|
(?:You\ can\ comment\ on\ this\ entry)
)
'''
]
},
u'telegraph':
{
'rssfeeds':
{
u'(Telegraph Blogs)': 'http://blogs.telegraph.co.uk/Feed.rss', # 'http://blogs.telegraph.co.uk/',
# From Our Bloggers: http://blogs.telegraph.co.uk/
#UK Correspondents
u'Holy Smoke by Damian Thompson': 'http://blogs.telegraph.co.uk/ukcorrespondents/holysmoke/feed.rss',
u'Christopher Howse on language': 'http://blogs.telegraph.co.uk/ukcorrespondents/christopherhowse/feed.rss',
u'Andrew McKie Obituaries Editor': 'http://blogs.telegraph.co.uk/ukcorrespondents/andrewmckie/feed.rss',
u'Neil Midgley on television': 'http://blogs.telegraph.co.uk/ukcorrespondents/neilmidgley/feed.rss',
u'Home Truths': 'http://blogs.telegraph.co.uk/ukcorrespondents/hometruths/feed.rss',
u'Julia Haileson the environment': 'http://blogs.telegraph.co.uk/ukcorrespondents/juliahailes/feed.rss',
u'Web TV hits': 'http://blogs.telegraph.co.uk/ukcorrespondents/webtvhits/feed.rss',
#Foreign Correspondents
u'Catherine Elsworth in Los Angeles': 'http://blogs.telegraph.co.uk/foreign/catherineelsworth/feed.rss',
u'Peter Fosterin New Delhi': 'http://blogs.telegraph.co.uk/foreign/peterfoster/feed.rss',
u'Richard Spencer in Beijing': 'http://blogs.telegraph.co.uk/foreign/richardspencer/feed.rss',
u'David Blair Diplomatic Correspondent': 'http://blogs.telegraph.co.uk/foreign/davidblair/feed.rss',
u'Toby Harnden in Washington DC': 'http://blogs.telegraph.co.uk/foreign/tobyharnden/feed.rss',
u'Harry de Quetteville in Berlin': 'http://blogs.telegraph.co.uk/foreign/harrydequetteville/feed.rss',
u'Adrian Blomfield in Moscow': 'http://blogs.telegraph.co.uk/foreign/adrianblomfield/feed.rss',
#Business
u'Tales from the high streetby Fletcher and Hall': 'http://blogs.telegraph.co.uk/business/talesofthehighstreet/feed.rss',
u'Your Business Blogby Richard Tyler': 'http://blogs.telegraph.co.uk/business/yourbusiness/feed.rss',
u'Ambrose Evans-Pritchard': 'http://blogs.telegraph.co.uk/business/ambrosevanspritchard/feed.rss',
u'Market forcesby Ben Bland': 'http://blogs.telegraph.co.uk/business/marketforces/feed.rss',
#Technology
u'Shane Richmond': 'http://blogs.telegraph.co.uk/technology/shanerichmond/feed.rss',
u'Ian Douglas': 'http://blogs.telegraph.co.uk/technology/iandouglas/feed.rss',
#Politics
u'Gimson Unbound': 'http://blogs.telegraph.co.uk/politics/gimsonunbound/feed.rss',
u'Daniel Hannan': 'http://blogs.telegraph.co.uk/politics/danielhannan/feed.rss',
u'Christopher Hope': 'http://blogs.telegraph.co.uk/politics/christopherhope/feed.rss',
u'Brassneck by Mick Fealty': 'http://blogs.telegraph.co.uk/politics/brassneck/feed.rss',
#Arts
u'Ceri Radford': 'http://blogs.telegraph.co.uk/arts/ceriradford/feed.rss',
u'Reel Life with Davies and Gray': 'http://blogs.telegraph.co.uk/arts/reellife/feed.rss',
u'The Slaughtered Lamb by Sally Peck': 'http://blogs.telegraph.co.uk/arts/slaughteredlamb/feed.rss',
u'Frame of mind by Lucy Davies': 'http://blogs.telegraph.co.uk/arts/frameofmind/feed.rss',
#Sport
u'Mick Cleary on rugby': 'http://blogs.telegraph.co.uk/sport/mickcleary/feed.rss',
u'Gareth A. Davies on boxing': 'http://blogs.telegraph.co.uk/sport/garethdavies/feed.rss',
u'Kevin Garside on Formula 1': 'http://blogs.telegraph.co.uk/sport/kevingarside/feed.rss',
u'Oliver Brown on football': 'http://blogs.telegraph.co.uk/sport/oliverbrown/feed.rss',
u'In Sport': 'http://blogs.telegraph.co.uk/sport/insport/feed.rss',
u'Fantasy Football': 'http://blogs.telegraph.co.uk/sport/fantasyfootball/feed.rss',
u'Nick Houlton sporting history': 'http://blogs.telegraph.co.uk/sport/nickhoult/feed.rss',
u'Patrick Nathanson': 'http://blogs.telegraph.co.uk/sport/patricknathanson/feed.rss',
#Society
u'Bryony Gordon': 'http://blogs.telegraph.co.uk/society/bryonygordon/feed.rss',
#Travel
u'Francisca Kellett': 'http://blogs.telegraph.co.uk/travel/franciscakellett/feed.rss',
u'Charles Starmer-Smith': 'http://blogs.telegraph.co.uk/travel/charlesstarmersmith/feed.rss',
#Motoring
u'Erin Baker': 'http://blogs.telegraph.co.uk/motoring/erinbaker/feed.rss',
#Gardening
u'The Rake\'s Progress by Lila Das Gupta': 'http://blogs.telegraph.co.uk/gardening/rakes-progress/'
},
'regexp':
[
u'''