#!/usr/bin/env python2.4
#
# Copyright (c) 2007 Media Standards Trust
# Licensed under the Affero General Public License
# (http://www.affero.org/oagpl.html)
#
#
# Telegraph seems to have three different formats:
# .xml, .html and blogs (.htm).
# The blog ones are done by blogs.py
#
# TODO:
#
# - see if the telegraph update their master rss feed page. Currently the
#   list of feeds is a manually-cobbled-together mix of old and new feeds.
#   At time of writing (2008-07-23) telegraph rss feeds page only lists
#   old-style feeds.
#
# - Some RSS feeds for old-style (xml) sections no longer seem to work. Columnists is hard hit, so see FindColumnistArticles() cheesy hack.
#   Remove it when opinion section moves over to new format.
#
# - better sundaytelegraph handling
#
# - tidy URLs ( strip jsessionid etc)
#   http://www.telegraph.co.uk/earth/main.jhtml?view=DETAILS&grid=&xml=/earth/2007/07/19/easeabird119.xml
#   (strip view param)
#
# - handle multi-page articles (currently only pick up first page) (is this a problem with new website format too?)
#

import re
from datetime import datetime
import sys
import os
import urlparse

import site
site.addsitedir("../pylib")
import BeautifulSoup
from JL import ukmedia, ScraperUtils

# these were obtained by manually going through each section and noting down the feeds.
new_rssfeeds = { "Home feed": "http://www.telegraph.co.uk/rss", "News feed": "http://www.telegraph.co.uk/news/rss", " UK News feed": "http://www.telegraph.co.uk/news/uknews/rss", " World News feed": "http://www.telegraph.co.uk/news/worldnews/rss", " Politics feed": "http://www.telegraph.co.uk/news/newstopics/politics/rss", " Conservative feed": "http://www.telegraph.co.uk/news/newstopics/politics/conservative/rss", " Labour feed": "http://www.telegraph.co.uk/news/newstopics/politics/labour/rss", " Liberal Democrats feed": "http://www.telegraph.co.uk/news/newstopics/politics/liberaldemocrats/rss", " Celebrity News feed": "http://www.telegraph.co.uk/news/newstopics/celebritynews/rss", " Obituaries feed": "http://www.telegraph.co.uk/news/obituaries/rss", " How About That? feed": "http://www.telegraph.co.uk/news/newstopics/howaboutthat/rss", # science section is still old style, and mixed in with earth section " News Topics feed": "http://www.telegraph.co.uk/news/newstopics/rss", "Sport feed": "http://www.telegraph.co.uk/sport/rss", " Football feed": "http://www.telegraph.co.uk/sport/football/rss", " Leagues feed": "http://www.telegraph.co.uk/sport/football/leagues/rss", # There are actually feeds for every team... but I'm not entering them all by hand :-) # And the leagues feed should cover them all anyway. 
" European feed": "http://www.telegraph.co.uk/sport/football/european/rss", " International feed": "http://www.telegraph.co.uk/sport/football/international/rss", " Cricket feed": "http://www.telegraph.co.uk/sport/cricket/rss", " International feed": "http://www.telegraph.co.uk/sport/cricket/international/rss", " Counties feed": "http://www.telegraph.co.uk/sport/cricket/counties/rss", " Olympics feed": "http://www.telegraph.co.uk/sport/othersports/olympics/rss", " Rubgy Union feed": "http://www.telegraph.co.uk/sport/rugbyunion/rss", " International feed": "http://www.telegraph.co.uk/sport/rugbyunion/international/rss", " Club feed": "http://www.telegraph.co.uk/sport/rugbyunion/club/rss", " Formula One feed": "http://www.telegraph.co.uk/sport/motorsport/formulaone/rss", " Golf feed": "http://www.telegraph.co.uk/sport/golf/rss", " Tennis feed": "http://www.telegraph.co.uk/sport/tennis/rss", " Horse Racing feed": "http://www.telegraph.co.uk/sport/horseracing/rss", " Other Sports feed": "http://www.telegraph.co.uk/sport/othersports/rss", " sports columnists": "http://www.telegraph.co.uk/sport/columnists/rss", "Finance": "http://www.telegraph.co.uk/finance/rss", " Finance - News By Sector": "http://www.telegraph.co.uk/finance/newsbysector/rss", " Finance - News By Sector - Banks and Finance": "http://www.telegraph.co.uk/finance/newsbysector/banksandfinance/rss", " Finance - News By Sector - Construction and Property": "http://www.telegraph.co.uk/finance/newsbysector/constructionandproperty/rss", " Finance - News By Sector - Energy": "http://www.telegraph.co.uk/finance/newsbysector/energy/rss", " Finance - News By Sector - Industry": "http://www.telegraph.co.uk/finance/newsbysector/industry/rss", " Finance - News By Sector - media tech and telecoms": "http://www.telegraph.co.uk/finance/newsbysector/mediatechnologyandtelecoms/rss", " Finance - News By Sector - Pharmaceuticals and Chemicals": "http://www.telegraph.co.uk/finance/newsbysector/pharmaceuticalsandchemicals/rss", 
" Finance - News By Sector - Retail and Consumer": "http://www.telegraph.co.uk/finance/newsbysector/retailandconsumer/rss", " Finance - News By Sector - Support Services": "http://www.telegraph.co.uk/finance/newsbysector/supportservices/rss", " Finance - News By Sector - Transport": "http://www.telegraph.co.uk/finance/newsbysector/transport/rss", " Finance - News By Sector - Utilities": "http://www.telegraph.co.uk/finance/newsbysector/utilities/rss", " Finance - Comment": "http://www.telegraph.co.uk/finance/comment/rss", # TODO: also include individual columist feeds? " Finance - Personal Finance": "http://www.telegraph.co.uk/finance/personalfinance/rss", # TODO - subsections for personal finance... " Finance - Markets": "http://www.telegraph.co.uk/finance/markets/rss", # TODO - subsections " Finance - Economics": "http://www.telegraph.co.uk/finance/economics/rss", " Finance - Your Business": "http://www.telegraph.co.uk/finance/yourbusiness/rss", " Finance - Topics": "http://www.telegraph.co.uk/finance/financetopics/rss", # TODO - subsections # Comment section is still old style (See FindColumnistArticles() hack) "Travel feed": "http://www.telegraph.co.uk/travel/rss", # I think "Types of Trips" and "Destinations" feeds might always be empty " Types of Trips feed": "http://www.telegraph.co.uk/travel/typesoftrips/rss", " Destinations feed": "http://www.telegraph.co.uk/travel/destinations/rss", " Hotels feed": "http://www.telegraph.co.uk/travel/hotels/rss", " UK Hotel reviews feed": "http://www.telegraph.co.uk/travel/hubs/ukhotelreviews/rss", " Europe Hotel reviews Feed": "http://www.telegraph.co.uk/travel/hubs/europehotelreviews/rss", " Travel News feed": "http://www.telegraph.co.uk/travel/travelnews/rss", " Columnists feed": "http://www.telegraph.co.uk/travel/columnists/rss", # Lifestyle section is still old style # Culture section is still old style } old_rssfeeds = { "Telegraph | Arts": "http://www.telegraph.co.uk/newsfeed/rss/arts.xml", "Telegraph | Books": 
"http://www.telegraph.co.uk/newsfeed/rss/arts-books.xml", "Telegraph | Digital Life": "http://www.telegraph.co.uk/newsfeed/rss/connected.xml", "Telegraph | Earth": "http://www.telegraph.co.uk/newsfeed/rss/earth.xml", "Telegraph | Science news": "http://www.telegraph.co.uk/newsfeed/rss/earth-science.xml", "Telegraph | Education": "http://www.telegraph.co.uk/newsfeed/rss/education.xml", "Telegraph | Expat": "http://www.telegraph.co.uk/newsfeed/rss/global.xml", "Telegraph | Fashion": "http://www.telegraph.co.uk/newsfeed/rss/fashion.xml", "Telegraph | Gardening": "http://www.telegraph.co.uk/newsfeed/rss/gardening.xml", "Telegraph | Health": "http://www.telegraph.co.uk/newsfeed/rss/health.xml", "Telegraph | Motoring": "http://www.telegraph.co.uk/newsfeed/rss/motoring.xml", "Telegraph | News | All": "http://www.telegraph.co.uk/newsfeed/rss/news.xml", "Telegraph | News | Major": "http://www.telegraph.co.uk/newsfeed/rss/news-major.xml", "Telegraph | News | UK": "http://www.telegraph.co.uk/newsfeed/rss/news-uk_news.xml", "Telegraph | News | International": "http://www.telegraph.co.uk/newsfeed/rss/news-international_news.xml", # BLOGS: # "Telegraph | News | Blog Yourview": "http://www.telegraph.co.uk/newsfeed/rss/news-blog-yourview.xml", # "Telegraph Business RSS": "http://www.telegraph.co.uk/newsfeed/rss/money_city.xml", # "Telegraph Business | Markets RSS": "http://www.telegraph.co.uk/newsfeed/rss/money_markets.xml", # "Telegraph Money | Personal Finance RSS": "http://www.telegraph.co.uk/newsfeed/rss/money_pf.xml", # "Telegraph | News | Business": "http://www.telegraph.co.uk/newsfeed/rss/money-city_news.xml", # "Telegraph | Your Money": "http://www.telegraph.co.uk/newsfeed/rss/money-personal_finance.xml", # blogs? 
# "Telegraph | Opinion": "http://www.telegraph.co.uk/newsfeed/rss/opinion-dt_opinion.xml", "Telegraph Opinion RSS": "http://www.telegraph.co.uk/newsfeed/rss/opinion.xml", "Telegraph Opinion | Leaders RSS": "http://www.telegraph.co.uk/newsfeed/rss/leaders.xml", "Telegraph | Leaders": "http://www.telegraph.co.uk/newsfeed/rss/opinion-dt_leaders.xml", "Telegraph | Property": "http://www.telegraph.co.uk/newsfeed/rss/property.xml", "Telegraph | Sport": "http://www.telegraph.co.uk/newsfeed/rss/sport.xml", "Telegraph | Sport | Football": "http://www.telegraph.co.uk/newsfeed/rss/sport-football.xml", "Telegraph | Sport | Premiership Football": "http://www.telegraph.co.uk/newsfeed/rss/sport-football-premiership.xml", "Telegraph | Sport | Cricket": "http://www.telegraph.co.uk/newsfeed/rss/sport-cricket.xml", # doesn't work? # "Telegraph | Sport | International Cricket": "http://www.telegraph.co.uk/newsfeed/rss/sport-international_cricket.xml", "Telegraph | Sport | Rugby Union": "http://www.telegraph.co.uk/newsfeed/rss/sport-rugby_union.xml", "Telegraph | Sport | Golf": "http://www.telegraph.co.uk/newsfeed/rss/sport-golf.xml", "Telegraph | Sport | Tennis": "http://www.telegraph.co.uk/newsfeed/rss/sport-tennis.xml", "Telegraph | Sport | Motor Sport": "http://www.telegraph.co.uk/newsfeed/rss/sport-motor_sport.xml", "Telegraph | Travel": "http://www.telegraph.co.uk/newsfeed/rss/travel.xml", "Telegraph | Wine": "http://www.telegraph.co.uk/newsfeed/rss/wine.xml", # "Telegraph | Podcast": "http://www.telegraph.co.uk/newsfeed/rss/podcast.xml", # "Telegraph | Podcast | mp3": "http://www.telegraph.co.uk/newsfeed/rss/podcastmp3.xml", # seems to cause an error: # "Telegraph | Top Ten Stories": # "http://stats.telegraph.co.uk/rss/topten.xml", # type="rss" language="en-gb" /> # blogs style? 
# "Telegraph | My Telegraph": # "http://my.telegraph.co.uk/feed.rss" # type="rss" language="en-gb" /> # "Telegraph | Blogs | All Posts": # "http://blogs.telegraph.co.uk/Feed.rss" } rssfeeds = new_rssfeeds rssfeeds.update( old_rssfeeds ) def Extract( html, context ): # blog url format: (handled by blogs.py) # http://blogs.telegraph.co.uk/politics/threelinewhip/feb/speakerfurorenotclasswarfare.htm o = urlparse.urlparse( context['srcurl'] ) if o[2].endswith( ".html" ): # HTML article url format: # http://www.telegraph.co.uk/travel/africaandindianocean/maldives/759764/Maldives-family-holiday-Game-Boys-v-snorkels.html return Extract_HTML_Article( html, context ) if o[2].endswith( ".jhtml" ): # XML article url format: # http://www.telegraph.co.uk/news/main.jhtml?xml=/news/2008/02/25/ncameron125.xml return Extract_XML_Article( html, context ) # if o[1] == "blogs.telegraph.co.uk": # ukmedia.DBUG2( "IGNORE: blog ('%s')\n" % ( context['srcurl']) ) # return None raise Exception, "Uh-oh... don't know how to handle url '%s'" % (context['srcurl']) def Extract_HTML_Article( html, context ): art = context # cull out video section before we do anything vidpat = re.compile( r".*?", re.DOTALL ) html = vidpat.sub( '', html ) soup = BeautifulSoup.BeautifulSoup( html ) # 'storyHead' div contains headline and description storyheaddiv = soup.find( 'div', {'class': 'storyHead' } ) title = storyheaddiv.h1.renderContents( None ) title = ukmedia.FromHTML( title ) title = u' '.join( title.split() ) art['title'] = title desctxt = u'' h2 = storyheaddiv.find('h2') if h2: desctxt = h2.renderContents(None) desctxt = ukmedia.FromHTML( desctxt ) desctxt = u' '.join( desctxt.split() ) # 'story' div contains byline and main article text storydiv = soup.find( 'div', {'class': 'story' } ) bylinediv = storydiv.find( 'div', {'class':'byline'} ) # byline div contains both byline and pubdate txt = bylinediv.renderContents(None) txt = ukmedia.FromHTML( txt ) txt = u' '.join( txt.split() ) m = re.match( 
r"\s*(.*?)\s*Last Updated:\s+(.*)", txt ) art['byline'] = m.group(1) pubdatetxt = m.group(2) # eg "11:52PM BST 22 Jul 2008" art['pubdate'] = ukmedia.ParseDateTime( pubdatetxt ) # cull out cruft from the story div: bylinediv.extract() for cruft in storydiv.findAll( 'div', {'class': re.compile(r'\bslideshow\b') } ): cruft.extract() for cruft in storydiv.findAll( 'ul', {'class': 'storylist'} ): cruft.extract() contenttxt = storydiv.renderContents(None) contenttxt = ukmedia.SanitiseHTML( contenttxt ) art['content'] = contenttxt if desctxt == u'': desctxt = ukmedia.FirstPara( art['content'] ) art['description'] = desctxt return art def Extract_XML_Article( html, context ): # Sometimes the telegraph has missing articles. # But the website doesn't return proper 404 (page not found) errors. # Instead, it redirects to an error page which has a 200 (OK) code. # Sigh. # there do seem to be a few borked pages on the site, so we'll treat it # as non-fatal (so it won't contribute toward the error count/abort) if re.search( """