python(初学提取html页面元素,借用老师)
# -*- coding: utf-8 -*-
"""List the films in the ``hotplaySvList`` array of an Mtime theater page.

Run as a script, this downloads the Wuhu (Anhui) theater page, cuts out the
text between ``hotplaySvList = [`` and the next ``;`` and prints every film
id/title pair found in it.  ``mean_audience_score`` can additionally average
the scores shown on a single movie page.

The code runs on Python 2 (``urllib2``) and on Python 3 (``urllib.request``).
"""
import os  # noqa: F401 -- no longer used here; kept from the original imports
import re
import sys
from contextlib import closing

try:
    import urllib2  # Python 2
except ImportError:  # Python 3: same Request/urlopen API lives here
    import urllib.request as urllib2

USER_AGENT = "Magic Browser"
MOVIE_URL = "http://movie.mtime.com/"
THEATER_URL = 'http://theater.mtime.com/China_Anhui_Province_Wuhu/'
LIST_START_TAG = 'hotplaySvList = ['
LIST_END_TAG = ';'
TIMEOUT_SECONDS = 30

# One score element; the group captures the complete decimal number.
_SCORE_RE = re.compile(r'<span class="db_point ml6">(\d+\.\d+)</span>')


def fetch_page(url):
    """Download *url* with the script's User-Agent and return its body.

    The response is always closed.  On Python 3 the raw bytes are decoded
    to ``str`` (assumes the pages are UTF-8 -- TODO confirm; undecodable
    bytes are replaced); on Python 2 the byte string is returned unchanged.
    """
    request = urllib2.Request(url, headers={'User-Agent': USER_AGENT})
    # closing() is used because Python 2 responses are not context managers.
    with closing(urllib2.urlopen(request, timeout=TIMEOUT_SECONDS)) as resp:
        body = resp.read()
    if not isinstance(body, str):
        body = body.decode('utf-8', 'replace')
    return body


def mean_score(html):
    """Return the mean of all ``db_point ml6`` scores found in *html*.

    Returns ``0.0`` when the markup contains no score element.
    """
    # The whole decimal part is parsed; the original kept one digit only.
    scores = [float(text) for text in _SCORE_RE.findall(html)]
    if not scores:
        return 0.0
    return sum(scores) / len(scores)


def mean_audience_score(id):  # noqa: A002 -- name kept for existing callers
    """Fetch the movie page for *id* and return its mean displayed score.

    Args:
        id: movie identifier appended to ``http://movie.mtime.com/``;
            strings and integers are both accepted.

    Returns:
        The average of the scores on the page, or ``0.0`` if none are found.
    """
    return mean_score(fetch_page("%s%s/" % (MOVIE_URL, id)))


def parse_hot_films(payload):
    """Map film id to film title for every entry in *payload*.

    *payload* is the text that follows ``hotplaySvList = [`` up to the
    terminating ``;``.  Entries are separated by ``},{``.  The id is the
    text after the last ``:`` of an entry's first comma-separated field and
    the title is the entry's last double-quoted string.  Entries without a
    quoted value in their last field (for example the lone ``]`` of an empty
    list) are skipped instead of raising ``IndexError``.
    """
    films = {}
    for entry in payload.split("},{"):
        film_id = entry.split(',')[0].split(':')[-1].strip()
        # Splitting the whole entry on quotes (rather than its last
        # comma-separated field) keeps titles that contain commas intact.
        quoted = entry.split('"')
        if len(quoted) < 3 or ',' in quoted[-1]:
            continue
        films[film_id] = quoted[-2].strip()
    return films


def main():
    """Download the theater page and print its film list.

    Returns:
        ``0`` on success, ``1`` when the list markers are not in the page.
    """
    page = fetch_page(THEATER_URL)
    print(page)

    start = page.find(LIST_START_TAG)
    print(start)
    if start == -1:
        print('not find start tag')
        return 1

    # Slice to the very end; "[start:-1]" would drop the last character.
    tail = page[start:]
    print(tail)

    end = tail.find(LIST_END_TAG)
    print(end)
    if end == -1:
        print('not find end tag')
        return 1

    payload = tail[len(LIST_START_TAG):end]
    print(payload)

    films = parse_hot_films(payload)
    for film_id in films:
        print("id:" + film_id + " film:" + films[film_id])
    return 0


if __name__ == "__main__":
    # sys.exit flushes stdout and reports failure with a non-zero status.
    sys.exit(main())
doublekai.com