#!/usr/bin/env python2.4
#
# Scraper for The Herald (http://www.theherald.co.uk)
#
# Copyright (c) 2007 Media Standards Trust
# Licensed under the Affero General Public License
# (http://www.affero.org/oagpl.html)
#
#
# TODO:
# - could get journo email addresses from bylines

import sys
import re
from datetime import datetime
import urlparse
import urllib2

import site
site.addsitedir("../pylib")
import BeautifulSoup
from JL import ukmedia, ScraperUtils
from SpiderPig import SpiderPig


# pattern to extract unique id from urls
# main news site urls:
# "http://www.theherald.co.uk/news/news/display.var.2036423.0.Minister_dismisses_more_tax_power_for_Holyrood.php"
# blog urls:
# "http://www.theherald.co.uk/features/bookblog/index.var.9706.0.at_home_in_a_story.php"
idpat = re.compile( "/((display|index)[.]var[.].*[.]php)" )

# pattern to find blog rss feeds on the blog index pages
blogrsspat = re.compile( "http://www.theherald.co.uk/(.*)/rss.xml" )

# link text which marks a blog index page worth following (see blog_url_handler)
bloglinkpat = re.compile( 'LINK' )

# pattern to pull the page title out of raw html (see Extract)
titlepat = re.compile( '<title>(.*?)</title>' )

# pattern to spot blog entry pages in raw html (see Extract).
# NOTE(review): the marker string originally used for this test was not
# readable in the copy of the file this was written from. This pattern is
# a reconstruction: it looks for the 'entry2' div which blog_Extract()
# requires. Please confirm against a live blog page.
blogmarkerpat = re.compile( r'''<div[^>]*\bclass\s*=\s*["']entry2["']''', re.IGNORECASE )


def CalcSrcID( url ):
    """Extract unique srcid from url.

    Returns 'herald_' followed by the display.var/index.var part of the
    (lowercased) url path, or None if the host doesn't end with
    'theherald.co.uk' or the path doesn't match idpat.
    """
    url = url.lower()
    o = urlparse.urlparse( url )
    # o[1] is the network location, o[2] is the path
    if not o[1].endswith( 'theherald.co.uk' ):
        return None
    m = idpat.search( o[2] )
    if m:
        return 'herald_' + m.group(1)
    return None


def FindArticles():
    """Gather articles to scrape from the herald website.

    Returns a list of scrape contexts, one for each article: the blog
    entries found via rss feeds, followed by the articles found by
    spidering the main site.
    """
    ukmedia.DBUG2( "*** herald ***: spidering for blog rss feeds...\n" )
    found = FindBlogEntries()

    ukmedia.DBUG2( "*** herald ***: spidering for article links...\n" )
    found = found + FindArticlesBySpidering()

    ukmedia.DBUG2( "found %d articles in total\n" % (len(found)) )
    return found


def blog_url_handler( feedlist, url, depth, a ):
    """SpiderPig callback for searching out links of blog rss feeds.

    feedlist is the dict passed to SpiderPig as userdata; rss feed urls
    are stored in it, keyed by blog name (first feed seen for a name wins).
    url is the link target, depth how many links in we are, and a is the
    anchor element of the link.

    Returns url for links which look like blog index pages, otherwise None.
    (presumably a returned url tells SpiderPig to follow that link - see
    SpiderPig to confirm)
    """
    if depth > 2:
        return None

    if a.find( text=bloglinkpat ):
        #print "%d BLOGINDEX: %s" %(depth, url)
        return url

    m = blogrsspat.search( url )
    if m:
        blogname = m.group(1)
        if blogname not in feedlist:
            # print "%d RSS '%s': %s" %(depth,blogname,url)
            feedlist[blogname] = url
    return None


def FindBlogEntries():
    """Spider to find blog RSS feeds, then use the articles from the feeds.

    Returns whatever ScraperUtils.FindArticlesFromRSS() returns for the
    feeds found (FindArticles() treats it as a list of scrape contexts).
    """
    feeds = {}
    pig = SpiderPig( blog_url_handler, userdata=feeds, logfunc=ukmedia.DBUG2 )
    pig.AddSeed( 'http://www.theherald.co.uk/heraldblogs' )
    pig.Go()

    found = ScraperUtils.FindArticlesFromRSS( feeds, u'herald', ScrubFunc )
    return found


def art_url_handler( arturls, url, depth, a ):
    """SpiderPig callback for searching out article links.

    arturls is the set passed to SpiderPig as userdata; article urls are
    added to it. url is the link target, depth how many links in we are,
    and a is the anchor element of the link.

    Returns url for links to pages listing more articles, otherwise None.
    """
    # follow up to three links in
    if depth > 3:
        return None

    # we use the class attribute to decide what sort of link it is
    if not a.has_key('class'):
        return None
    classes = a['class'].split()

    # links to articles...
    # We record them but don't follow them.
    if ('headlineLink' in classes) or ('sectTopHeadline' in classes):
        if idpat.search( url ):
            # Might actually be a link to a page listing more articles...
            # NOTE(review): the dots in this pattern are unescaped, so they
            # match any three characters, not just a literal "...".
            # Left as-is because the intent isn't certain.
            if re.match( u'\\s*More\\s*...\\s*', a.renderContents(None) ):
                return url
            # OK we think it's an article!
            arturls.add(url)
        return None

    # links to other lists of articles
    if ('channelLink' in classes) or ('navLink' in classes):
        return url

    return None


def FindArticlesBySpidering():
    """Spider through the site looking for articles.

    Returns a list of scrape contexts, one per article url found.
    """
    urls = set()
    pig = SpiderPig( art_url_handler, userdata=urls, logfunc=ukmedia.DBUG2 )
    pig.AddSeed( 'http://www.theherald.co.uk' )
    pig.Go()

    found = [ ContextFromURL( url ) for url in urls ]

    ukmedia.DBUG( "spidering found %d articles\n" %(len(found)) )
    return found


def Extract( html, context ):
    """Extract an article from raw html.

    Decides whether the page is a blog entry or a main-site news article
    and hands off to blog_Extract() or news_Extract() accordingly.

    Returns the filled-in context, or None if the page should be ignored
    (Press Association items and known non-article pages).
    """
    url = context['srcurl']

    if re.search( 'Copyright Press Association Ltd \\d{4}, All Rights Reserved', html ):
        ukmedia.DBUG2( "IGNORE Press Association item (%s)\n" % (url) )
        return None

    # TODO: skip NEWS COMPILER pages instead?
    badtitles = (
        "The Herald : Features: LETTERS",
        "Poetry Blog (from The Herald )",
        "Arts Blog (from The Herald )",
        "The Herald : Business: MAIN BUSINESS",
        "The Herald : Motors videos",
        "The Herald - Scotland's Leading Quality Daily Newspaper",
        )

    # a page without a title can't be on the bad list, so just carry on
    pagetitle = None
    m = titlepat.search( html )
    if m:
        pagetitle = m.group(1)
    if pagetitle in badtitles:
        ukmedia.DBUG2( "IGNORE page '%s' (%s)\n" % ( pagetitle, url) )
        return None

    # blog or article?
    # (an alternative would be to go by url: '/display[.]var[.]\d+' for
    # news and '/index[.]var[.]\d+' for blogs)
    if blogmarkerpat.search( html ):
        return blog_Extract( html, context )
    return news_Extract( html, context )


def news_Extract( html, context ):
    """Extract function for handling main news site articles.

    Fills in 'images', 'commentlinks', 'pubdate', 'byline', 'title',
    'content' and 'description' on context and returns it.

    Raises Exception if the headline or article text can't be found.
    """
    art = context
    soup = BeautifulSoup.BeautifulSoup( html )

    # TODO: skip NEWS COMPILER pages?

    headlinediv = soup.find( 'div', {'class':'artHeadline'} )
    bylinediv = soup.find( 'td', {'class':'artByline'} )
    datediv = soup.find( 'td', {'class':'artDate'} )
    # PA items seem to use a different format... sigh...
    itdatespan = soup.find( 'span', {'class':'itdate'} )
    contentdiv = soup.find( 'div', {'class':'articleText'} )

    if headlinediv is None or contentdiv is None:
        raise Exception( "can't find headline and/or article text (%s)" % (art['srcurl']) )

    # images
    art['images'] = []
    for img in soup.findAll( src=re.compile( r"http://images[.]newsquest[.]co[.]uk/.*" ) ):
        # not every image is guaranteed to have alt text
        caption = u''
        if img.has_key( 'alt' ):
            caption = img['alt']
        im = { 'url': img['src'], 'caption': caption, 'credit': u'' }
        art['images'].append( im )

    # comments
    art['commentlinks'] = []
    comment_pat = re.compile( r"Read Comments\s+[(]\s*(\d+)\s*[)]" )
    for marker in soup.findAll( text=comment_pat ):
        a = marker.parent
        if a.name != 'a':
            continue
        comment_url = urlparse.urljoin( art['srcurl'], a['href'] )
        num_comments = None
        m = comment_pat.search( marker )
        if m:
            num_comments = int( m.group(1) )
        art['commentlinks'].append( {'num_comments':num_comments, 'comment_url':comment_url} )
        break   # just the one.

    # byline
    byline = u''
    if bylinediv:
        byline = bylinediv.renderContents( None )
        byline = ukmedia.FromHTML( byline )

    # look for press association notice (a div with 'paNews' in its class)
    if byline == u'' and soup.find( 'div', {'class': re.compile('paNews') } ):
        # it's from the Press Association
        byline = u'PA'

    # sometimes byline is first line of article text, in bold...
    # but not obituaries (they always have a bit of bold at the top)...
    if byline == u'' and 'obituaries' not in art['srcurl']:
        n = None
        firstpara = contentdiv.p
        if firstpara is not None and len( firstpara.contents ) > 0:
            n = firstpara.contents[0]
        if isinstance( n, BeautifulSoup.Tag ):
            # Want bold elements, with no <br>s inside them, but
            # followed directly by a <br>...
            if n.name == 'b' and not n.find( "br" ):
                if isinstance( n.nextSibling, BeautifulSoup.Tag ) and n.nextSibling.name == 'br':
                    byline = n.renderContents(None)
                    byline = ukmedia.FromHTML( byline )
                    byline = u' '.join( byline.split() )
                    # remove the byline from the article text
                    n.extract()
                    # TODO: sometimes followed by place... (eg "in Paris<br>")

    headline = headlinediv.renderContents( None )
    headline = ukmedia.FromHTML( headline )

    for cruft in contentdiv.findAll( 'div', {'id':'midpagempu'} ):
        cruft.extract()
    content = contentdiv.renderContents(None)

    desc = ukmedia.FirstPara( content )
    desc = ukmedia.FromHTML( desc )

    pubdatetxt = u''
    if datediv:
        pubdatetxt = datediv.renderContents(None).strip()
    elif itdatespan:
        pubdatetxt = itdatespan.renderContents(None).strip()
        # replace 'today' with current date
        today = datetime.now().strftime( '%a %d %b %Y' )
        pubdatetxt = pubdatetxt.replace( 'today', today )

    if pubdatetxt == u'':
        # if still no date, try the web issue date at top of page...
        # (which will be todays date, rather than real date... but best we can do)
        issuedate = soup.find( 'td', {'align':'right', 'class':'issueDate'} )
        if issuedate:
            pubdatetxt = issuedate.renderContents(None)

    pubdatetxt = ukmedia.FromHTML( pubdatetxt )

    art['pubdate'] = ukmedia.ParseDateTime( pubdatetxt )
    art['byline'] = byline
    art['title'] = headline
    art['content'] = content
    art['description'] = desc

    return art


def blog_Extract( html, context ):
    """Extract function for handling blog entries.

    Fills in 'title', 'byline', 'pubdate', 'content' and 'description'
    on context and returns it, or returns None if the page says the blog
    entry is missing.

    Raises Exception if the expected page structure can't be found.
    """
    if html.find( "No blog entries found." ) != -1:
        ukmedia.DBUG2( "IGNORE missing blog entry (%s)\n" % (context['srcurl']) )
        return None

    art = context
    soup = BeautifulSoup.BeautifulSoup( html )

    entdiv = soup.find( 'div', {'class':'entry2'} )
    if entdiv is None:
        raise Exception( "can't find blog entry text (%s)" % (art['srcurl']) )

    # the box holding headline, author and date sits before the entry text
    headbox = entdiv.findPreviousSibling( 'div', {'class':'b_box'} )
    if headbox is None or headbox.a is None:
        raise Exception( "can't find blog entry headline (%s)" % (art['srcurl']) )

    headline = headbox.a.renderContents(None).strip()
    headline = ukmedia.FromHTML( headline )
    art['title'] = headline

    byline = u''
    postedby = headbox.find( text=re.compile('Posted by') )
    if postedby:
        # author name is in the element directly after the "Posted by" text
        author = postedby.nextSibling
        if isinstance( author, BeautifulSoup.Tag ):
            byline = author.renderContents(None).strip()
    art['byline'] = byline

    datespan = headbox.find( 'span', {'class':'itdate'} )
    if datespan is None:
        raise Exception( "can't find blog entry date (%s)" % (art['srcurl']) )
    # replace 'today' with current date
    today = datetime.now().strftime( '%a %d %b %Y' )
    datetxt = ukmedia.FromHTML( datespan.renderContents(None) )
    datetxt = datetxt.replace( 'today', today )
    art['pubdate'] = ukmedia.ParseDateTime( datetxt )

    content = entdiv.renderContents(None)
    art['content'] = content

    desc = ukmedia.FirstPara( content )
    desc = ukmedia.FromHTML( desc )
    art['description'] = desc

    return art


def ScrubFunc( context, entry ):
    """Scrub function passed to ScraperUtils.FindArticlesFromRSS().

    Sets the srcid of the context from its srcurl. entry is not used.
    """
    context['srcid'] = CalcSrcID( context['srcurl'] )
    return context


def ContextFromURL( url ):
    """Build up an article scrape context from a bare url."""
    context = {}
    context['srcurl'] = url
    context['permalink'] = url
    context['srcid'] = CalcSrcID( url )
    context['srcorgname'] = u'herald'
    context['lastseen'] = datetime.now()
    return context


if __name__ == "__main__":
    ScraperUtils.RunMain( FindArticles, ContextFromURL, Extract )