python提取网页数据
# coding: utf-8
#
# Scrape the Mtime Beijing theater page and list the films it advertises.
#
# The page embeds a JavaScript array literal introduced by
# ``hotplaySvList = [`` and terminated by the next ``;``.  Each record is cut
# apart with plain string splitting (exactly as the original script did): the
# id is the value of the first field, the title is the last double-quoted
# value of the last field.  The page also contains summaries of the form
# "<n>家影院上映<m>场"; the i-th summary is printed next to the i-th film.
#
# Output: one "<id> <title> <summary>" line per film, then the number of
# distinct film ids.  The process exits with status 1 when the film list
# cannot be located in the page.
#
# The code runs unchanged on Python 2 and Python 3.
import os  # unused here; retained because the original script imported it
import re
import sys
from contextlib import closing

try:  # Python 2
    import urllib2
except ImportError:  # Python 3: urllib.request offers the same Request/urlopen
    import urllib.request as urllib2

PAGE_URL = 'http://theater.mtime.com/China_Beijing'
USER_AGENT = 'Magic Browser'
LIST_MARKER = u'hotplaySvList = ['
# Backslashes are doubled because a ``ur''`` literal is not valid on Python 3.
SHOWINGS_PATTERN = re.compile(u'\\d+家影院上映\\d+场')


def dow(url, headers=None, timeout=30):
    """Download *url* and return the raw response body (bytes).

    Args:
        url: Address to fetch.
        headers: Optional dict of extra HTTP request headers.
        timeout: Socket timeout in seconds, so a stalled server cannot hang
            the script forever.

    Returns:
        The undecoded response body.

    Raises:
        Whatever ``urlopen`` raises on network or HTTP errors.
    """
    request = urllib2.Request(url, headers=headers or {})
    # closing() releases the connection even if read() fails; the Python 2
    # response object is not a context manager on its own.
    with closing(urllib2.urlopen(request, timeout=timeout)) as response:
        return response.read()


def find_showings(page):
    """Return every "<n>家影院上映<m>场" summary in *page*, in page order."""
    return SHOWINGS_PATTERN.findall(page)


def parse_films(page):
    """Extract ``(film_id, title)`` pairs from the embedded hotplaySvList.

    Args:
        page: Decoded (text) page source.

    Returns:
        A list of ``(film_id, title)`` string tuples in page order.  Records
        without a double-quoted value in their last field (for example the
        lone ``]`` left over from an empty list) are skipped.

    Raises:
        ValueError: If the list marker or its terminating ``;`` is missing.
    """
    start = page.find(LIST_MARKER)
    if start == -1:
        raise ValueError('not find start tag')
    # Search from the marker onwards over the full text; slicing with [:-1]
    # first would hide a ';' that happens to be the page's last character.
    end = page.find(u';', start)
    if end == -1:
        raise ValueError('not find end tag')

    films = []
    for record in page[start + len(LIST_MARKER):end].split(u'},{'):
        fields = record.split(u',')
        film_id = fields[0].split(u':')[-1].strip()
        quoted = fields[-1].split(u'"')
        if len(quoted) < 2:
            continue  # no quoted title in this record
        films.append((film_id, quoted[-2].strip()))
    return films


def emit(text):
    """Print *text*; on Python 2, encode unicode as UTF-8 first.

    Python 2 falls back to ASCII when stdout is piped, which would fail on
    the Chinese titles.
    """
    if sys.version_info[0] < 3 and not isinstance(text, str):
        text = text.encode('utf-8')
    print(text)


def main():
    """Fetch the page, print one line per film, then the distinct-id count.

    Returns:
        0 on success, 1 when the film list cannot be located.
    """
    # A single download feeds both the film list and the showing summaries,
    # so the two are guaranteed to come from the same snapshot of the page.
    raw = dow(PAGE_URL, headers={'User-Agent': USER_AGENT})
    page = raw.decode('utf-8', 'replace')

    showings = find_showings(page)
    try:
        films = parse_films(page)
    except ValueError as error:
        print(error)
        return 1

    dict_film = {}
    for index, (film_id, film) in enumerate(films):
        parts = [film_id, film]
        # The page may list fewer summaries than films; print what exists.
        if index < len(showings):
            parts.append(showings[index])
        emit(u' '.join(parts))
        dict_film[film_id] = film
    print(len(dict_film))
    return 0


if __name__ == '__main__':
    sys.exit(main())
doublekai.com