#!/usr/bin/env python2.4
#
# Copyright (c) 2007 Media Standards Trust
# Licensed under the Affero General Public License
# (http://www.affero.org/oagpl.html)
#
# Scraper for the independent
#
# NOTES:
#
# Indy runs eScenic CMS and has blogs at typepad.com
#
# They changed over to a new system around the end of 2007/beginning of
# 2008.
# Old format urls look like:
#   http://news.independent.co.uk/world/middle_east/article2790961.ece
# New ones look like:
#   http://www.independent.co.uk/news/uk/home-news/harry-set-to-be-pulled-out-of-afghanistan-789513.html
#
# Unfortunately it doesn't look like they redirect the old format ones to
# the new format, so they've broken a lot of our permalinks :-(
#
# For blogs, hostnames indyblogs.typepad.com and blogs.independent.co.uk
# are interchangeable.
#

import getopt
import re
from datetime import datetime
import sys
import urlparse

import site
# the project libraries live in ../pylib, so it has to be on the path
# before BeautifulSoup and JL can be imported.
site.addsitedir("../pylib")
from BeautifulSoup import BeautifulSoup
from JL import ukmedia, ScraperUtils

# things that this scraper might mistakenly use as a byline:
dudbylines = [u'leading article', u'leadinga article', u'the third leader']

# hostnames which serve the (typepad-hosted) blogs
_BLOG_HOSTS = ('indyblogs.typepad.com', 'blogs.independent.co.uk')


def FindRSSFeeds():
    """Fetch the list of RSS feeds for the indy.

    Scrapes the paper's "list of rss feeds" page and collects every link
    in the body div whose href looks like a feed.

    Returns a list of (name, url) tuples, one for each feed.
    """
    rsspage = "http://www.independent.co.uk/service/list-of-rss-feeds-775086.html"

    ukmedia.DBUG2("Fetching list of rss feeds\n")
    html = ukmedia.FetchURL(rsspage)
    soup = BeautifulSoup(html)

    feeds = []
    bodydiv = soup.find('div', {'class': 'body'})
    # two kinds of link: "/rss" for main paper, ".xml" for blogs
    feedpat = re.compile('(/rss)|([.]xml)$')
    for a in bodydiv.findAll('a', {'href': feedpat}):
        url = a['href']
        # the page has some borked urls...
        url = url.replace("http://http://", "http://")
        title = ukmedia.FromHTMLOneLine(a.renderContents(None))
        feeds.append((title, url))

    ukmedia.DBUG2("found %d rss feeds to fetch\n" % (len(feeds)))
    return feeds


# new-format url (the number is the important bit - the text you
# can fiddle with and still get the same article :-)
# http://www.independent.co.uk/news/uk/home-news/harry-set-to-be-pulled-out-of-afghanistan-789513.html
srcidpat_newformat = re.compile('/[^/]+-(\d+)[.]html$')

# old-format url
# http://news.independent.co.uk/world/middle_east/article2790961.ece
srcidpat_oldformat = re.compile('/(article\d+[.]ece)$')

# blog urls:
# http://indyblogs.typepad.com/independent/2007/11/terrorism-whos-.html
# http://blogs.independent.co.uk/independent/2007/12/the-fife-diet.html


def CalcSrcID(url):
    """Calculate a unique srcid from a url.

    Returns 'independent_' followed by:
      - the url path, for blog urls (either blog hostname)
      - the article number, for new-format urls
      - 'articleNNNN.ece', for old-format urls
    Returns None if the url is not on an independent.co.uk subdomain
    or matches neither article format.
    """
    o = urlparse.urlparse(url)
    host = o[1]
    path = o[2]

    # blog posts have no article number, so the path is the id
    if host in _BLOG_HOSTS:
        return 'independent_' + path

    if not host.endswith(".independent.co.uk"):
        return None

    m = srcidpat_newformat.search(path)
    if m:
        return 'independent_' + m.group(1)

    # probably never encounter the old format urls (they seem to
    # have been turned off now)... but just in case:
    m = srcidpat_oldformat.search(path)
    if m:
        return 'independent_' + m.group(1)

    return None


def Extract(html, context):
    """Extract article from html.

    Dispatches on the hostname of context['srcurl']: blog posts go to
    Extract_typepad(), everything else to Extract_eScenic().
    Returns the article dict produced by the chosen extractor.
    """
    url = context['srcurl']
    o = urlparse.urlparse(url)
    if o[1] in _BLOG_HOSTS:
        return Extract_typepad(html, context)
    else:
        return Extract_eScenic(html, context)


def Extract_eScenic(html, context):
    """Extract fn for main paper (eScenic CMS).

    Fills in 'title', 'pubdate', 'byline', 'images', 'content' and
    'description' on the context dict and returns it (the same object).

    Raises AttributeError if the expected page structure (article div,
    headline, info para, body div) is missing.
    """
    art = context

    soup = BeautifulSoup(html)
    articlediv = soup.find('div', {'id': 'article'})

    # the headline
    headline = articlediv.find('h1')
    art['title'] = ukmedia.FromHTMLOneLine(headline.renderContents(None))

    # some articles have taglines
    taglinepara = articlediv.find('p', {'class': 'tagline'})

    # "info" para contains byline, date
    infopara = articlediv.find('p', {'class': 'info'})

    # date fmt: "Thursday, 24 January 2008"
    pubdatetext = infopara.em.renderContents(None)
    art['pubdate'] = ukmedia.ParseDateTime(pubdatetext)

    # a couple of ways to get byline...
    byline = u''
    authorelement = infopara.find('author')
    if authorelement:
        # it's got a _proper_ byline!
        byline = authorelement.renderContents(None)

    # Big names have their own sections which makes bylining them easy
    if not byline:
        try:
            # NOTE: this local used to be called "as", which is a reserved
            # word from python 2.6 onward.
            crumbs = soup.find('div', id='breadcrumbs').findAll('a')
            if crumbs[-2].string in ('Commentators', 'Columnists'):
                # .string is None when the link contains nested markup;
                # treat that as "no byline" so the fallbacks below run.
                byline = crumbs[-1].string or u''
        except (IndexError, AttributeError):
            # no breadcrumbs, or too few of them - not a columnist page
            pass

    if not byline and taglinepara:
        # if there's a tagline, try the byline-o-matic on it:
        byline = ukmedia.ExtractAuthorFromParagraph(taglinepara.renderContents(None))

    if not byline:
        # a lot of stories (particularly comment pieces) have
        # name in title...
        # eg "Janet Street-Porter: Our politicians know nothing of real life"
        m = re.match("([\\w\\-']+\\s+[\\w\\-']+(\\s+[\\w\\-']+)?\\s*):", art['title'], re.UNICODE)
        if m:
            byline = m.group(1)

    # cull out duds (strip first - the title pattern can capture
    # trailing whitespace before the colon)
    if not byline or byline.strip().lower() in dudbylines:
        byline = u''

    art['byline'] = ukmedia.FromHTML(byline)

    # look for images
    art['images'] = []
    for imgdiv in articlediv.findAll('div', {'class': 'photoCaption'}):
        img = imgdiv.img
        img_url = img['src']
        # alt text is the default caption, overridden by a caption para
        img_caption = img['alt']
        img_credit = u''
        p = imgdiv.find('p', {'class': 'caption'})
        if p:
            img_caption = p.renderContents(None)
        p = imgdiv.find('p', {'class': 'credits'})
        if p:
            img_credit = p.renderContents(None)

        img_caption = ukmedia.FromHTMLOneLine(img_caption)
        img_credit = ukmedia.FromHTMLOneLine(img_credit)

        art['images'].append({'url': img_url, 'caption': img_caption, 'credit': img_credit})

    # article text is in "body" div
    bodydiv = articlediv.find('div', {'class': 'body'})

    # Kill cruft:
    # "Click here to have your say"
    for cruft in bodydiv.findAll('a', {'href': 'http://indyblogs.typepad.com/openhouse/have_your_say/index.html'}):
        cruft.extract()

    # "Interesting? Click here to explore further"
    for cruft in bodydiv.findAll('a', {'title': 'Click here to explore further'}):
        cruft.extract()

    contenttext = bodydiv.renderContents(None)
    contenttext = ukmedia.SanitiseHTML(contenttext)
    contenttext = contenttext.strip()
    art['content'] = contenttext

    # description from tagline
    if taglinepara:
        art['description'] = ukmedia.FromHTML(taglinepara.renderContents(None))
    else:
        # use first para of main text
        art['description'] = ukmedia.FromHTML(ukmedia.FirstPara(contenttext))

    return art


def Extract_typepad(html, context):
    """Extract fn for indy blogs (on typepad.com).

    Fills in 'title', 'pubdate', 'byline', 'description' and 'content'
    on the context dict and returns it (the same object).

    Raises AttributeError if the expected page structure (entry header,
    date header, post footer, entry body) is missing.
    """
    art = context

    soup = BeautifulSoup(html)

    # the headline
    headlinediv = soup.find('h3', {'class': 'entry-header'})
    art['title'] = ukmedia.FromHTMLOneLine(headlinediv.renderContents(None))

    # timestamp
    # some blogs have a little rdf block with an iso timestamp (dc:date),
    # but some don't, so we can't rely on it.
    # date and time in separate places. sigh.
    dateheader = soup.find('h2', {'class': 'date-header'})
    d = dateheader.renderContents(None)    # "Thursday, 05 June 2008"

    postfooter = soup.find('span', {'class': 'post-footers'})
    t = postfooter.renderContents(None)    # "Posted at 02:22 PM in "...
    m = re.compile(r"Posted at\s+(\d+:\d+\s+\w\w)\s+").search(t)
    if m:
        d = d + u' ' + m.group(1)
    # (if there's no recognisable time, settle for the date alone)
    art['pubdate'] = ukmedia.ParseDateTime(d)

    # description, byline, content

    # byline (if present) is in first para
    byline = u''
    bodydiv = soup.find('div', {'class': 'entry-body'})
    bylinep = bodydiv.p
    if bylinep:
        firstpara = ukmedia.FromHTMLOneLine(bylinep.renderContents(None))
        # a short para starting with "By" is taken to be the byline
        if firstpara.startswith(u"By") and len(firstpara.split()) <= 4:
            byline = firstpara
            # remove it so it doesn't end up in the content
            bylinep.extract()

    if not byline:
        # try the RDF block (raw regex search in the html)
        creatorpat = re.compile(r'dc:creator="(.*?)"')
        m = creatorpat.search(html)
        if m:
            byline = unicode(m.group(1))

    content = bodydiv.renderContents(None)

    # the rest of a long post lives in a separate "entry-more" div
    morediv = soup.find('div', {'class': 'entry-more'})
    if morediv:
        if art['title'].startswith(u'Cyberclinic:'):
            # Cyberclinic posts carry a promo blurb we don't want
            cruft = morediv.find('span', {'style': 'color: #cccc00;'})
            if cruft:
                if u'CONFUSED ABOUT TECHNOLOGY?' in cruft.renderContents(None):
                    cruft.extract()
        content = content + morediv.renderContents(None)

    content = ukmedia.SanitiseHTML(content)

    desc = ukmedia.FirstPara(content)

    art['byline'] = byline
    art['description'] = desc
    art['content'] = content

    return art


def ScrubFunc(context, entry):
    """Tidy up a context built from an rss entry.

    The description contains html entities and tags... scrub it!
    Also swaps feedsportal redirect urls for the real article url (taken
    from the entry's guid) and recalculates 'srcid', 'srcurl' and
    'permalink' from it.

    Returns the (modified) context.
    """
    context['description'] = ukmedia.FromHTML(context['description'])

    url = context['srcurl']
    if 'rss.feedsportal.com' in url:
        # luckily, the guid (marked as non-permalink) has the real url
        url = entry.guid

    context['srcid'] = CalcSrcID(url)
    context['srcurl'] = url
    context['permalink'] = url

    return context


def FindArticles():
    """Get a set of articles to scrape from the rss feeds."""
    rssfeeds = FindRSSFeeds()
    return ScraperUtils.FindArticlesFromRSS(rssfeeds, u'independent', ScrubFunc)


def ContextFromURL(url):
    """Build up an article scrape context from a bare url."""
    context = {}
    context['srcurl'] = url
    context['permalink'] = url
    context['srcid'] = CalcSrcID(url)
    context['srcorgname'] = u'independent'
    context['lastseen'] = datetime.now()
    return context


if __name__ == "__main__":
    ScraperUtils.RunMain(FindArticles, ContextFromURL, Extract, maxerrors=50)