#!/usr/bin/env python2.4
#
# Copyright (c) 2007 Media Standards Trust
# Licensed under the Affero General Public License
# (http://www.affero.org/oagpl.html)
#
# Scraper for Mirror and Sunday Mirror
#
import re
from datetime import datetime
import time
import string
import sys
import urlparse
import site
site.addsitedir("../pylib")
from BeautifulSoup import BeautifulSoup
from JL import ukmedia, ScraperUtils
def FindRSSFeeds():
    """Fetch the list of RSS feeds linked from the mirror.co.uk sitemap.

    Returns a list of (name, url) tuples, one for each feed whose url
    doesn't contain any of the blacklisted path fragments.
    """
    # miriam blacklisted for now, as her column is redirected to blogs
    # (and picked up by our blog rss list)
    url_blacklist = ( '/fun-games/', '/pictures/', '/video/', '/miriam/' )

    # the feed name is the path between the domain and the trailing "rss.xml"
    name_pat = re.compile( r'mirror[.]co[.]uk/(.*)/rss[.]xml' )

    ukmedia.DBUG2( "Fetching list of rss feeds\n" )
    sitemap_url = 'http://www.mirror.co.uk/sitemap/'
    html = ukmedia.FetchURL( sitemap_url )
    soup = BeautifulSoup( html )

    feeds = []
    for a in soup.findAll( 'a', {'class':'sitemap-rss' } ):
        url = a['href']

        m = name_pat.search( url )
        if m:
            title = m.group(1)
        else:
            # url isn't in the expected shape: keep the feed anyway and
            # use the url itself as its name rather than crashing
            title = url

        banned = False
        for fragment in url_blacklist:
            if fragment in url:
                banned = True
                break   # one hit is enough (and avoids duplicate log lines)

        if banned:
            ukmedia.DBUG2( " ignore feed '%s' [%s]\n" % (title,url) )
            continue

        feeds.append( (title,url) )

    return feeds
# Feedburner feeds for the blogs listed at "http://www.mirror.co.uk/opinion/blogs/".
# Each entry is a (name, url) tuple.
#
# Blogs deliberately left out:
#   - Christopher Hitchens: rss link is broken
#   - Kevin O'Sullivan (tv blog), Shelleyvision (jim shelly), Sue Carroll:
#     handled by the main site
#   - Science, Health and the Environment: feed is broken too
#     (was "http://feeds.feedburner.com/investigations")
blog_rssfeeds = [
    ( "blog: 3pm",
      "http://feeds.feedburner.com/mirror-3pm" ),
    ( "blog: Amber & friends",
      "http://feeds.feedburner.com/mirrorfashion" ),
    # NOTE(review): the trailing apostrophe in this name looks like a typo - confirm before changing
    ( "blog: Big Brother'",
      "http://feeds.feedburner.com/big-brother/" ),
    ( "blog: Cricket",
      "http://feeds.feedburner.com/mirror/cricket" ),
    ( "blog: Dear Miriam",
      "http://feeds.feedburner.com/dear-miriam" ),
    ( "blog: Football Spy",
      "http://feeds.feedburner.com/FootballSpy" ),
    ( "blog: Kevin Maguire & Friends",
      "http://feeds.feedburner.com/KevinMaguire" ),
    ( "Mirror Investigates",
      "http://feeds.feedburner.com/mirror/investigations" ),
    ( "Showbiz with Zoe",
      "http://feeds.feedburner.com/showbiz-with-zoe" ),
    ( "The Sex Doctor",
      "http://feeds.feedburner.com/sex-doctor/" ),
]
def Extract( html, context ):
    """Extract an article from html, picking the extractor by context['srcurl'].

    Pages on blogs.mirror.co.uk or fashion.mirror.co.uk go to Extract_Blog();
    everything else goes to Extract_MainSite().
    """
    on_blog = re.search( r'/(blogs|fashion)[.]mirror[.]co[.]uk/', context['srcurl'] )
    if not on_blog:
        return Extract_MainSite( html, context )
    return Extract_Blog( html, context )
# eg "Some caption (pic: Someone)" - group 1 is the caption, group 2 the credit
_mainsite_caption_pat = re.compile(
    r"\s*(.*?)\s*[(]\s*(?:pic\s*:|pics\s*:)?\s*(.*)[)]\s*$",
    re.UNICODE|re.IGNORECASE )

# eg "By Jeremy Armstrong 24/07/2008" - group 1 is the byline, group 2 the date
_mainsite_byline_pat = re.compile( r'\s*(.*?)\s*(\d{1,2}/\d{1,2}/\d{4})\s*' )


def _MainSite_SplitCaption( text ):
    """Split "caption (pic: credit)" into (caption, credit); credit is u'' if there is none."""
    m = _mainsite_caption_pat.match( text )
    if m:
        return ( m.group(1), m.group(2) )
    return ( text, u'' )


def _MainSite_Images( maindiv ):
    """Return the list of image dicts for an article (gallery images last).

    Each dict has 'url', 'caption' and 'credit' keys.
    Side effect: the gallery divs are detached from maindiv.
    """
    images = []

    # pick out gallery images first
    galimages = []
    for galdiv in maindiv.findAll( 'div', {'class': 'galleryembed' } ):
        for picdiv in galdiv.findAll( 'div', {'id': re.compile(r'gallery_\d+_pic_\d+') } ):
            img = picdiv.img
            if img is None or img.get( 'src' ) is None:
                continue
            # the credit only ever comes from the alt text...
            caption, credit = _MainSite_SplitCaption( img.get( 'alt', u'' ) )
            # ...but use a proper caption if there is one
            p = picdiv.find( 'p', {'class':'gallery-caption'} )
            if p is not None:
                caption = ukmedia.FromHTMLOneLine( p.renderContents(None) )
            galimages.append( {'url':img['src'], 'caption':caption, 'credit':credit } )
        galdiv.extract()

    # now get any non-gallery images
    for imgdiv in maindiv.findAll( 'div', {'class': re.compile('article-image|art-o')} ):
        img = imgdiv.img
        if img is None or img.get( 'src' ) is None:
            continue
        # special exception to avoid star rating on review pages :-)
        # (.get() because not every img has a height attribute)
        if img.get( 'height' ) == "15":
            continue

        text = img.get( 'alt', u'' )
        p = imgdiv.find( 'p', {'class': 'article-date'} )
        if p is not None:
            text = ukmedia.FromHTMLOneLine( p.renderContents(None) )
        caption, credit = _MainSite_SplitCaption( text )
        images.append( {'url':img['src'], 'caption':caption, 'credit':credit } )

    # add the gallery images last (ordering probably will get lost at some point, but hey)
    images.extend( galimages )
    return images


def Extract_MainSite( html, context ):
    """Extract an article from a main-site (non-blog) mirror.co.uk page.

    Fills in 'srcorgname', 'title', 'byline', 'pubdate', 'images',
    'content' and 'description' on the context (which is modified in
    place) and returns it.

    Returns None for pages which are deliberately ignored: gallery pages,
    and pages with no text whose title marks them as a video or a
    dummy/holding story.

    Raises Exception if the expected article markup can't be found.
    """
    art = context
    soup = BeautifulSoup( html )

    if '/sunday-mirror/' in art['srcurl']:
        art['srcorgname'] = u'sundaymirror'
    else:
        art['srcorgname'] = u'mirror'

    maindiv = soup.find( 'div', { 'id': 'three-col' } )
    if maindiv is None:
        # pages without the article div are expected to be picture galleries,
        # recognised by their "You are viewing:" caption (matched as plain
        # text, ignoring whatever markup surrounds it)
        if "You are viewing:" in html:
            # the context might not have a title yet (eg when built from a bare url)
            ukmedia.DBUG2("IGNORE gallery page '%s' [%s]\n" % (art.get('title',u''),art['srcurl']) )
            return None
        raise Exception( "Couldn't find main article div ('%s')" % (art['srcurl'],) )

    h1 = maindiv.h1
    if h1 is None:
        raise Exception( "Couldn't find headline ('%s')" % (art['srcurl'],) )
    art['title'] = ukmedia.FromHTMLOneLine( h1.renderContents(None) )

    # eg "By Jeremy Armstrong 24/07/2008"
    bylinepara = maindiv.find( 'p', {'class': 'article-date' } )
    if bylinepara is None:
        raise Exception( "Couldn't find byline/date paragraph ('%s')" % (art['srcurl'],) )
    bylinetxt = ukmedia.FromHTMLOneLine( bylinepara.renderContents( None ) )
    m = _mainsite_byline_pat.match( bylinetxt )
    if m is None:
        raise Exception( "Couldn't parse byline/date ('%s')" % (art['srcurl'],) )
    art['byline'] = m.group(1)
    art['pubdate'] = ukmedia.ParseDateTime( m.group(2) )

    # sometimes, only sundaymirror.co.uk in byline is only indicator
    if u'sundaymirror' in art['byline'].lower():
        art['srcorgname'] = u'sundaymirror'

    # look for images (this also detaches the gallery divs, so it has to
    # happen before the content is rendered)
    art['images'] = _MainSite_Images( maindiv )

    # get the main content.
    # sometimes there is an 'article-body' div, but not always
    contentdiv = maindiv.find( 'div', {'id':'article-body'} )
    if contentdiv is None:
        # use main div as the content...
        contentdiv = maindiv
        # ...trying to remove everything except for article text
        h1.extract()
        bylinepara.extract()

    # kill adverts, photos etc...
    for cruft in contentdiv.findAll( 'div' ):
        cruft.extract()
    # sometimes a misplaced "link" element!
    for cruft in contentdiv.findAll( 'link' ):
        cruft.extract()

    content = contentdiv.renderContents(None)
    art['content'] = content
    art['description'] = ukmedia.FirstPara( content )

    if art['description'].strip() == u'':
        # check for obvious reasons we might get empty content
        t = art['title'].lower()
        if re.search( r'^video:', t ):
            ukmedia.DBUG2("IGNORE video page '%s' [%s]\n" % (art['title'],art['srcurl']) )
            return None
        if re.search( r'\bdummy story\b', t ) or re.search( r'\bholding story\b', t ):
            ukmedia.DBUG2("IGNORE dummy story '%s' [%s]\n" % (art['title'],art['srcurl']) )
            return None

    return art
def Extract_Blog( html, context ):
    """Extract an article from a blogs.mirror.co.uk or fashion.mirror.co.uk page.

    Fills in 'title', 'content', 'description', 'byline' and 'pubdate' on
    the context (which is modified in place) and returns it.

    Raises Exception if the expected blog markup can't be found or the
    byline/date line can't be parsed.
    """
    art = context
    soup = BeautifulSoup( html )

    h1 = soup.find( 'h1', { 'class':'asset-name' } )
    body = soup.find( 'div', { 'class': 'asset-body' } )
    # meta contains byline and date and permalink...
    meta = soup.find( 'div', { 'class': 'asset-meta' } )
    if h1 is None or body is None or meta is None:
        raise Exception( "Couldn't find blog post markup ('%s')" % (art['srcurl'],) )

    art['title'] = ukmedia.FromHTML( h1.renderContents( None ) )

    # strip enclosures, images and embedded objects out of the text
    # (in that order, one kind at a time)
    cruft_kinds = (
        ( 'span', {'class':re.compile("mt-enclosure")} ),
        ( 'img', {} ),
        ( 'object', {} ),
    )
    for tagname, attrs in cruft_kinds:
        for cruft in body.findAll( tagname, attrs ):
            cruft.extract()

    art['content'] = body.renderContents( None )
    art['description'] = ukmedia.FirstPara( art['content'] )

    # eg: "By Ann Gripper on Jul 21, 08 10:00 AM in Golf"
    metatxt = ukmedia.FromHTML( meta.renderContents( None ) )
    metatxt = u' '.join( metatxt.split() )
    # \b makes sure "on" is a word of its own - otherwise names ending
    # in "on" (Simon, Johnson...) get chopped in half
    metapat = re.compile( r"\s*(.*?)\s*\bon\s+(.*?(AM|PM))\s*", re.UNICODE )
    m = metapat.search( metatxt )
    if m is None:
        raise Exception( "Couldn't parse blog byline/date ('%s')" % (art['srcurl'],) )
    art['byline'] = m.group(1)
    art['pubdate'] = ukmedia.ParseDateTime( m.group(2) )

    return art
# to get unique id out of url
# (patterns are tried in order; group 1 of the first one to match is the id)
srcid_patterns = [
    # new-style:
    #  http://www.mirror.co.uk/news/top-stories/2008/07/24/exclusive-anne-darwin-vows-to-flee-to-panama-and-1million-fortune-when-out-of-jail-115875-20668758/
    # old-style (mirror):
    #  http://www.mirror.co.uk/news/topstories/2008/02/29/prince-harry-to-be-withdrawn-from-afghanistan-89520-20335665/
    # old-style (sunday mirror):
    #  http://www.sundaymirror.co.uk/news/sunday/2008/02/24/commons-speaker-michael-martin-in-new-expenses-scandal-98487-20329121/
    re.compile( r"-([-0-9]+)(/([?].*)?)?$" ),
    # really old style:
    re.compile( r"%26(objectid=[0-9]+)%26" ),
    # blogs:
    #  http://blogs.mirror.co.uk/maguire/2008/07/beauty-and-the-beast.html
    #  http://fashion.mirror.co.uk/2008/04/sun-and-sandal.html
    re.compile( r"((blogs|fashion)[.]mirror[.]co[.]uk/.*[.]html)" )
]

# only pages on these domains (or any of their subdomains) get a srcid
_srcid_domains = ( 'mirror.co.uk', 'sundaymirror.co.uk' )


def CalcSrcID( url ):
    """Calculate a unique srcid from a url.

    Returns 'mirror_' followed by the id picked out of the url by the
    first matching entry in srcid_patterns.

    Returns None if the url isn't on mirror.co.uk or sundaymirror.co.uk
    (subdomains such as blogs.mirror.co.uk are fine), or if none of the
    patterns match.
    """
    host = urlparse.urlparse( url )[1].lower()

    # match whole domain labels, so lookalikes such as "notmirror.co.uk"
    # are rejected
    on_mirror = False
    for domain in _srcid_domains:
        if host == domain or host.endswith( '.' + domain ):
            on_mirror = True
            break
    if not on_mirror:
        return None

    for pat in srcid_patterns:
        m = pat.search( url )
        if m:
            return 'mirror_' + m.group(1)
    return None
def ScrubFunc( context, entry ):
    """Tidy up an article context built from an RSS entry.

    Cleans up the title, swaps feedburner/feedsportal redirect urls for
    the direct article url taken from the entry, and fills in 'srcid',
    'srcurl' and 'permalink'.

    Returns the updated context, or None for video pages.
    Raises Exception if the resulting url isn't a mirror.co.uk one.
    """
    title = context['title']
    title = ukmedia.DescapeHTML( title )
    title = ukmedia.UncapsTitle( title )   # all mirror headlines are caps. sigh.
    context['title'] = title

    url = context['srcurl']
    host = urlparse.urlparse( url )[1]
    # I think they've switched from feedburner.com... to feedsportal.com...
    if host == 'feeds.feedburner.com':
        # Luckily, feedburner feeds have a special entry
        # which contains the original link
        url = entry.feedburner_origlink
    elif host == 'rss.feedsportal.com':
        # Luckily the guid has proper link (marked as non-permalink)
        url = entry.guid

    # sanity check - make sure we've got a direct link
    if 'mirror.co.uk' not in url:
        raise Exception( "URL not from mirror.co.uk or sundaymirror.co.uk ('%s')" % (url,) )

    if '/video/' in url:
        ukmedia.DBUG2( "ignore video '%s' [%s]\n" % (title,url) )
        # actually skip it, as the log message says.
        # NOTE(review): assumes ScraperUtils.FindArticlesFromRSS drops
        # entries for which this returns None - confirm
        return None

    context[ 'srcid' ] = CalcSrcID( url )
    context[ 'srcurl' ] = url
    context[ 'permalink'] = url
    return context
def ContextFromURL( url ):
    """Build up an article scrape context from a bare url.

    The returned dict has 'srcurl', 'permalink', 'srcid', 'srcorgname'
    and 'lastseen' (set to the current time) filled in.
    """
    # looks like sundaymirror.co.uk domainname has been deprecated,
    # so the path is checked as well as the domain
    orgname = u'mirror'
    if ( 'sundaymirror.co.uk' in url ) or ( '/sunday-mirror/' in url ):
        orgname = u'sundaymirror'

    return {
        'srcurl': url,
        'permalink': url,
        'srcid': CalcSrcID( url ),
        'srcorgname': orgname,
        'lastseen': datetime.now(),
    }
def FindArticles():
    """Return the articles found in the main-site and blog RSS feeds."""
    # main-site feeds are scraped from the sitemap; the blog feeds are a fixed list
    all_feeds = FindRSSFeeds() + blog_rssfeeds

    # feedsportal.com has lots of HTTP Error 503:
    #  "Feed is currently being prepared; try again real soon"
    # hence the large maxerrors
    return ScraperUtils.FindArticlesFromRSS( all_feeds, u'mirror', ScrubFunc, maxerrors=30 )
if __name__ == "__main__":
    # hand the scraper's entry points over to the shared scraper driver
    ScraperUtils.RunMain(
        FindArticles,
        ContextFromURL,
        Extract,
        maxerrors=50 )