Python 提取网页数据

#coding:utf-8
import urllib2
import os
import re
def dow(url):
    """Download *url* and return the raw response body.

    The response object is always closed, even if reading fails, so the
    underlying connection is not leaked.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()
str=dow('http://theater.mtime.com/China_Beijing/')
lst=re.findall('\d+家影院上映\d+场',str)
url = 'http://theater.mtime.com/China_Beijing'
req = urllib2.Request(url,headers={'User-Agent' : "Magic Browser"})
webpage = urllib2.urlopen(req)
strw = webpage.read()
#print strw
tg_start = strw.find('hotplaySvList = [')
#print tg_start#开始
if tg_start == -1:
    print 'not find start tag'
    os._exit(0)
tmp = strw[tg_start:-1]
tg_end = tmp.find(';')

if tg_end == -1 :
    print 'not find end tag'
    os._exit(0)
tmp = tmp[len('hotplaySvList = ['):tg_end]

tar_ls = tmp.split("},{")

dict_film = {}
i=0
for t0 in tar_ls:
    ls_t = t0.split(',')
    id = ls_t[0].split(':')[-1].strip()
    film = ls_t[-1].split('"')[-2].strip()
    print id,film,lst[i]
    i=i+1

    dict_film[id] = film
print len(dict_film)

 

posted @ 2017-06-02 14:18  Doublekai  阅读(574)  评论(2)  编辑  收藏  举报