#!/usr/bin/env python2.4 # # Scraper for Financial Times # # Copyright (c) 2007 Media Standards Trust # Licensed under the Affero General Public License # (http://www.affero.org/oagpl.html) # # # NOTES: # This scraper handles both blogs and news articles # Login form is a separate page, in an iframe: # http://media.ft.com/h/subs2.html # # TODO: # - how to handle NY Times articles on FT site? # - handle FT Lex articles? # import sys import re from datetime import datetime import sys import urllib import urllib2 import cookielib import urlparse import site site.addsitedir("../pylib") from BeautifulSoup import BeautifulSoup,BeautifulStoneSoup from JL import ukmedia, ScraperUtils def FetchRSSFeeds( masterpage='http://www.ft.com/servicestools/newstracking/rss' ): feeds = {} feedpat = re.compile( "http://.*" ) f = urllib2.urlopen( masterpage ) html = f.read() f.close() soup = BeautifulSoup( html ) #