#!/usr/bin/env python2.4
#
# Copyright (c) 2008 Media Standards Trust
# Licensed under the Affero General Public License
# (http://www.affero.org/oagpl.html)
'''
Scrapes articles from The Telegraph per-journo pages linked from COLUMNISTS_URL.
'''
import sys
import re
import urllib
import urlparse
from telegraph import ScraperUtils, ContextFromURL, Extract, ukmedia
from telegraph import BeautifulSoup
# Page that FindColumnistURLs() fetches and searches for the links to the
# individual columnist pages.
COLUMNISTS_URL = (
    'http://www.telegraph.co.uk/opinion/main.jhtml'
    '?menuId=6795&menuItemId=-1&view=DISPLAYCONTENT&grid=A1&targetRule=0'
)
def absurl(url, base_url):
    '''
    Returns url as an absolute URL, treating it as a link found on the
    page at base_url.

    Only the scheme and host of base_url are kept: the link is resolved
    against the root of that site, not against the page's own path.
    '''
    parts = urlparse.urlparse(base_url)
    scheme = parts[0]
    host = parts[1]
    site_root = '%s://%s/' % (scheme, host)
    return urlparse.urljoin(site_root, url)
def FindArticles():
    '''
    Visits every columnist page returned by FindColumnistURLs(), gathers
    the article links found there, and returns a list holding the result
    of ContextFromURL() for each article URL.

    For each columnist page the heading and bio are written to the debug
    log and the bio is passed to SaveJournoBio(). A page without a bio
    div is still scanned for articles; its bio is simply skipped.
    '''
    ukmedia.DBUG2("*** telegraph_journo ***: looking for articles...\n")

    foundarticles = []
    for url in FindColumnistURLs():
        # FindColumnistURLs() rewrites each URL to ask for an unpaginated
        # listing, so one fetch per columnist is all that is done here.
        html = ukmedia.FetchURL(url)
        soup = BeautifulSoup.BeautifulSoup(html)

        # The bio div is optional: find() gives None when it is absent,
        # and one such page must not abort the whole run.
        bio = None
        bio_div = soup.find('div', {'class': 'summarytrue'})
        if bio_div is not None:
            bio = bio_div.renderContents(None)
            # NOTE(review): the original literal contained a raw line
            # break; written here as '\n' -- confirm '\r' was not intended.
            bio = ukmedia.SanitiseHTML(bio).replace('\n', '')

        journo_upper = soup.find('div', {'class': 'boxhdnolink'})
        if journo_upper:
            journo_upper = journo_upper.renderContents(None).encode('utf-8')
            ukmedia.DBUG2(journo_upper + ':')

        if bio is not None:
            ukmedia.DBUG2(bio.encode('utf-8'))
            SaveJournoBio(bio)

        # Anchors lacking an href cannot be turned into article URLs.
        article_links = soup('a', {'class': 'main'})
        articles = [absurl(a['href'], url)
                    for a in article_links if a.get('href')]
        ukmedia.DBUG2('(%d articles) ' % len(articles))
        foundarticles += articles
        ukmedia.DBUG2('\n')

    ukmedia.DBUG2("Found %d articles\n" % len(foundarticles))
    return [ContextFromURL(article_url) for article_url in foundarticles]
def FindColumnistURLs():
    '''
    Searches the page at COLUMNISTS_URL for the list of links to
    pages about each columnist, returns the URLs.

    Each link is taken from the first anchor inside a 'menu2' div; divs
    with no anchor, or whose anchor has no href, are skipped. The
    returned URLs are absolute and have their targetRule parameter set
    to 9999.
    '''
    html = ukmedia.FetchURL(COLUMNISTS_URL)
    soup = BeautifulSoup.BeautifulSoup(html)
    urls = []
    for div in soup('div', {'class': 'menu2'}):
        link = div.a
        if link is None:
            # Menu entry without a link: nothing to follow.
            continue
        href = link.get('href')
        if not href:
            continue
        # no pagination
        href = re.sub(r'targetRule=\d+', 'targetRule=9999', href)
        urls.append(absurl(href, COLUMNISTS_URL))
    return urls
def SaveJournoBio(bio):
    '''
    Placeholder for storing a columnist's bio; currently does nothing
    with bio and returns None.
    '''
    # TODO: Implement SaveJournoBio.
    # journo_bio table needs columns for context['srcorgname'] ("srctype"?)
    # and context['srcurl'].
    return None
# Run as a script: hand the scraper's three entry points to the shared runner.
if __name__ == '__main__':
    ScraperUtils.RunMain(
        FindArticles,
        ContextFromURL,
        Extract)