#!/usr/bin/env python2.4 ''' Scrapes articles from The Times per-journo pages such as http://www.timesonline.co.uk/tol/comment/columnists/anatole_kaletsky/ ''' import sys import re import urllib import urlparse from times import ScraperUtils, ContextFromURL, Extract, ukmedia ARTICLE_LIST_URL = None def FindArticles(): ukmedia.DBUG2( "*** times_journo ***: looking for articles...\n" ) foundarticles = [] http, computer = urlparse.urlparse(ARTICLE_LIST_URL)[:2] BASEURL = '%s://%s/' % (http, computer) html = ukmedia.FetchURL(ARTICLE_LIST_URL) start_marker = '' end_marker = '' html = html.replace(end_marker + start_marker, '') start = html.find(start_marker) end = html.find(end_marker) assert start>-1 and end>-1 html = html[start + len(start_marker):end] for url in re.findall(r"""