#!/usr/bin/env python2.4
#
# Copyright (c) 2007 Media Standards Trust
# Licensed under the Affero General Public License
# (http://www.affero.org/oagpl.html)
#
# KNOWN ISSUES:
#
# - the sun sometimes embeds multiple stories on the same webpage,
#   for now we just process only the "main" story and discard the substory.
# - we miss subheadings for the occasional article
#   (they sometimes skip the "article" class we look for...)
# - pages with flash video leave some cruft in the content text
#   ("You need Flash Player 8 or higher..." etc)
#

import re
import urllib2
import sys
import traceback
from datetime import date,datetime
import urlparse

import site
site.addsitedir("../pylib")
from BeautifulSoup import BeautifulSoup
from JL import ukmedia,ScraperUtils

# current url format:
# http://www.thesun.co.uk/sol/homepage/news/royals/article862982.ece
srcidpat_ecestyle = re.compile( '/(article\\d+[.]ece)$' )

# Old url format, no longer used (vignette storyserver cms, I think)
# http://www.thesun.co.uk/article/0,,2007400986,00.html
srcidpat_oldstyle = re.compile( '/(article/[^/]+[.]html)$' )

# names of columnists and indicators in urls, for last resort bylines
columnist_lookups = [
    {'url': '/columnists/fergus_shanahan/', 'name':u'Fergus Shanahan' },
    {'url': '/columnists/ally_ross/', 'name': u'Ally Ross' },
    {'url': '/columnists/jane_moore/', 'name': u'Jane Moore' },
    {'url': '/columnists/blunkett/', 'name': u'David Blunkett' },
    {'url': '/columnists/kelvin_mackenzie/', 'name': u'Kelvin MacKenzie' },
    {'url': '/columnists/john_gaunt/', 'name': u'John Gaunt' },
    {'url': '/columnists/lorraine_kelly/', 'name': u'Lorraine Kelly' },
    {'url': '/columnists/clarkson/', 'name': u'Jeremy Clarkson' },
    {'url': '/columnists/kavanagh/', 'name': u'Trevor Kavanagh' },
]


def CalcSrcID( url ):
    """Extract a unique srcid from url

    Returns 'sun_' plus the trailing article part of the url path
    (eg 'sun_article862982.ece'), or None if the url is not hosted on
    thesun.co.uk or its path matches neither article url pattern.
    """
    o = urlparse.urlparse( url )

    # Accept thesun.co.uk itself or a genuine subdomain of it. A bare
    # endswith('thesun.co.uk') would also let through lookalike hosts
    # such as 'notthesun.co.uk'. Hostnames are case-insensitive.
    host = o[1].lower()
    if host != 'thesun.co.uk' and not host.endswith( '.thesun.co.uk' ):
        return None

    # try the current url format first, then the retired one
    for pat in ( srcidpat_ecestyle, srcidpat_oldstyle ):
        m = pat.search( o[2] )
        if m:
            return 'sun_' + m.group(1)

    return None


def _SkipReason( title, url ):
    """Return a short label if the link is a known non-article page, else None.

    The label is what gets slotted into the "SKIP ... page" debug message.
    """
    lowtitle = title.lower()
    if '/video/' in url:
        return 'video'
    if '/sportvideos/' in url:
        return 'sportvideos'
    if lowtitle == 'photo casebook' and '/deidre/' in url:
        return 'photo casebook'
    if lowtitle == 'dream team' and '/football/fantasy_games/' in url:
        return 'dreamteam'
    return None


# NEW version
def FindArticles():
    """ Scrapes the Sun-Lite pages to get article list for the week

    Fetches one sun-lite index page per day of the week (the current
    weekday is requested as 'today') and collects every 'black-link'
    anchor whose href looks like an article url.

    Returns a list of scrape contexts (as built by ContextFromURL()),
    one per article link kept, each with 'title' set from the link text.
    """

    baseurl = "http://www.thesun.co.uk"

    days = [ 'Monday', 'Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday' ]
    # the page for the current day is requested as 'today', not by name
    dow = datetime.now().weekday()
    days[dow] = 'today'

    ukmedia.DBUG2( "*** sun ***: looking for articles...\n" )

    found = []
    for day in days:
        sunlite_url = baseurl + "/sol/homepage/?sunlite=" + day
        ukmedia.DBUG2( "fetching %s\n" % (sunlite_url) )
        html = ukmedia.FetchURL( sunlite_url )
        soup = BeautifulSoup(html)

        cnt = 0
        for a in soup.findAll( 'a', { 'href': srcidpat_ecestyle, 'class':'black-link' } ):
            # Resolve against the page we fetched. Site-relative and
            # http:// links come out exactly as before; https:// and
            # slash-less relative links no longer get mangled.
            url = urlparse.urljoin( sunlite_url, a['href'] )
            title = ukmedia.FromHTML( a.renderContents( None ) )

            reason = _SkipReason( title, url )
            if reason is not None:
                ukmedia.DBUG2( "SKIP %s page '%s' [%s]\n" % (reason,title,url) )
                continue

            art = ContextFromURL( url )
            art['title'] = title
            found.append( art )
            cnt = cnt+1
            #print "'%s' [%s]" % (title,url)

        ukmedia.DBUG2( " %d articles\n" % (cnt) )

    return found


# OLD VERSION - Crawls the whole site, rather than just sun-lite pages
# Sun RSS feeds are rubbish, and the section pages look like they
# could change often (according to whatever campaigns the sun is
# banging on about at any one time)... so to get articles to scrape
# we do a shallow crawl of the site looking for likely links...
#
def OLD_FindArticles():
    """Gather articles to scrape from the sun website.

    Crawls outward from the /sol/homepage/ page and wraps every article
    url found with ContextFromURL().

    Returns a list of scrape contexts, one for each article.
    """

    ukmedia.DBUG2( "*** sun ***: looking for articles...\n" )

    article_urls = Crawl( 'http://www.thesun.co.uk/sol/homepage/' )
    return [ ContextFromURL( u ) for u in article_urls ]


# keep track of pages visited by Crawl(), so we don't process them
# multiple times
crawled = set()

# matches article page urls under /sol/ (optionally with a query string)
articleurlpat = re.compile( "http:[/][/]www[.]thesun[.]co[.]uk[/]sol[/].*[/]article\\d+[.]ece([?].*)?" )


# OLD VERSION - Crawls the whole site, rather than just sun-lite pages
def Crawl( url, depth=0 ):
    """Recursively crawl the sun website looking for article links.

    url is the page to fetch; depth is the current recursion level
    (callers starting a crawl leave it at 0, which also resets the
    module-level set of visited pages).

    Pages that fail with an HTTP error are reported on stderr and
    contribute no links.

    Returns a set containing article urls.
    """
    global crawled

    maxdepth = 1    # Very shallow. We only go 1 level down.

    if depth == 0:
        # Starting a new crawl? Forget whatever the last one visited.
        crawled = set()

    article_hits = set()
    index_hits = set()

    if url in crawled:
        ukmedia.DBUG2( "(already visited '%s')\n" % (url) )
        return article_hits

    try:
        html = ukmedia.FetchURL( url )
    except urllib2.HTTPError:
        # continue even if we get http errors (bound to be a borked
        # link or two)
        err = sys.exc_info()[1]
        traceback.print_exc()
        print >>sys.stderr, "SKIP '%s' (%d error)\n" %(url, err.code)
        return article_hits

    site_root = 'http://www.thesun.co.uk'
    soup = BeautifulSoup( html )
    for anchor in soup.findAll( 'a' ):
        if not anchor.has_key( 'href' ):
            continue

        link = anchor['href'].strip()
        if link.startswith('/'):
            # handle relative links
            link = site_root + link

        # Only links under /sol/ are kept. This discards external sites,
        # discussion pages, login pages etc...
        if link.startswith( 'http://www.thesun.co.uk/sol/' ):
            if articleurlpat.match( link ):
                article_hits.add( link )
            else:
                index_hits.add( link )

    crawled.add( url )
    ukmedia.DBUG2( "Crawled '%s' (depth=%d), found %d articles\n" % ( url, depth, len( article_hits ) ) )

    # follow the non-article links one level further down
    if depth < maxdepth:
        for nextpage in index_hits:
            if nextpage in crawled:
                ukmedia.DBUG2( " [already visited '%s']\n" % (nextpage) )
            else:
                article_hits.update( Crawl( nextpage, depth+1 ) )

    return article_hits


def Extract( html, context ):
    art = context
    soup = BeautifulSoup( html )

#    rt = soup.find( 'roottag' )
#    if not rt:
#        print "no roottag '%s' [%s]" %(art['title'], art['srcurl'])
#    else:
#        print "roottag '%s' [%s]" %(art['title'], art['srcurl'])
#    return None

    # main column is column2 div - we can exclude a lot of nav cruft by starting here.
    col2 = soup.find( 'div', {'id':"column2"} )

    # sigh.... the sun sometimes embed multiple stories on the same page...
    # For now we'll just discard the sub-story. Unhappy about this, but
    # it just makes things too complicated.
    # TODO: something better.
    col3 = col2.find('div', { 'id':re.compile("\\bcolumn3\\b") } )
    if col3:
        col3.extract()

    # get headline
    h1 = col2.h1
    if not h1:
        # their html is so messed up that sometimes BeautifulSoup mistakenly
        # closes the column2 div before the main article. If that is the
        # case, just use the whole soup instead...
        col2 = soup
        # need to skip the h1 banner at top of page
        artmodule = soup.find( text=re.compile(".*BEGIN: Module - Main Article.*"))
        if artmodule:
            h1 = artmodule.findNext('h1')
        else:
            #sigh... sometimes they have "roottag" at the start of the article.
            # What is "roottag"? good question...
            roottag = soup.find( 'roottag' )
            h1 = roottag.findPrevious( 'h1' )
            # NOTE(review): nesting of this check was reconstructed from
            # whitespace-collapsed source - confirm it belongs in this branch.
            if 'small' in h1['class']:
                h1 = None # sometimes there is no