Python语言学习 (七)1.2

HTMLParser:

feed:向解析器喂数据

handle_starttag(tag,attrs):处理开始标签

handle_data(data):处理标签里的数据体,data数据文本

handle_endtag(tag):处理结束标签

 

#!/usr/bin/python
# encoding: utf-8


from __future__ import print_function
from HTMLParser import HTMLParser
import requests
import os
import time

_MOVIE_PATH = 'html/head/meta/meta/meta/meta/meta/link/link/link/link/body/div/div/div/div/div/div/div/div/table'


class Movie(object):
    """Container for the attributes collected for a single movie.

    Attributes are stored in ``self.attrs`` as an ordered list of
    ``(key, value)`` tuples rather than a dict, so insertion order is kept
    and the same key may appear more than once.
    """

    def __init__(self):
        # Ordered (key, value) pairs describing the movie.
        self.attrs = []

    def __str__(self):
        """Return one ``key = value`` line per attribute, joined with CRLF."""
        return '\r\n'.join('{0} = {1}'.format(k, v) for k, v in self.attrs)

    def downloadImg(self, imgpath, headers):
        """Download the movie's image into the directory ``imgpath``.

        Args:
            imgpath: Directory in which the image file is written.
            headers: HTTP headers to send with the download request.

        Returns:
            The local path of the saved image, or ``None`` when no usable
            ``movie_img_url`` attribute has been recorded.
        """
        # If the key was recorded several times, the last value wins.
        imgurl = None
        for k, v in self.attrs:
            if k == 'movie_img_url':
                imgurl = v
        if imgurl is None:
            return None

        # The file name is the last path segment of the URL.
        imgname = imgurl.split('/')[-1]
        imglocalpath = os.path.join(imgpath, imgname)
        # headers must be passed by keyword: the second positional argument
        # of requests.get() is ``params``, which would turn the header dict
        # into a query string instead of sending it as request headers.
        img = requests.get(imgurl, headers=headers)
        with open(imglocalpath, 'wb') as f:
            f.write(img.content)
        self.attrs.append(('movie_img_localpath', imglocalpath))
        return imglocalpath


class DouBanMovieRankParser(HTMLParser):
    """HTML parser that collects movie entries from a chart page.

    The parser keeps a stack of the tags seen so far and joins it into a
    ``/``-separated path. When that path equals ``_MOVIE_PATH`` a new
    ``Movie`` is started; while inside it, the link, title, image URL and
    ``<p>`` text are appended to the movie's ``attrs`` list.

    Note: tags are pushed on every start tag and popped only when an end
    tag arrives, so elements that never emit an end tag stay on the stack.
    ``_MOVIE_PATH`` appears to be written with that in mind (it contains
    repeated ``meta``/``link`` segments), so this behaviour is kept.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self._tags_stack = []  # stack (last in, first out) of seen tags

        self.movies = []  # Movie objects found so far, in document order
        self._new_movie = False  # True while inside a movie's subtree

    def reset(self):
        """Reset parser state; the collected ``movies`` list is left as is."""
        HTMLParser.reset(self)
        self._tags_stack = []  # stack (last in, first out) of seen tags
        self._new_movie = False

    def handle_starttag(self, tag, attrs):
        """Push ``tag`` and record movie attributes found on it.

        Args:
            tag: Name of the tag being opened.
            attrs: List of ``(name, value)`` pairs for the tag.
        """

        def _getattr(attrname):
            # Value of the first attribute named ``attrname``, else None.
            for k, v in attrs:
                if k == attrname:
                    return v
            return None

        self._tags_stack.append(tag)
        path = '/'.join(self._tags_stack)

        # To discover the path of interesting content, temporarily add:
        #     print(path)

        if path == _MOVIE_PATH:
            self._new_movie = True
            self.movies.append(Movie())

        if not self._new_movie:
            return

        current = self.movies[-1]
        if tag == 'a' and _getattr('class') == 'nbg':
            current.attrs.append(('movie_url', _getattr('href')))
            current.attrs.append(('movie_name', _getattr('title')))
        elif tag == 'img':
            current.attrs.append(('movie_img_url', _getattr('src')))

    def handle_endtag(self, tag):
        """Pop the top of the tag stack, closing the movie if appropriate.

        The top entry is popped regardless of whether it matches ``tag``.
        A stray end tag arriving while the stack is empty is ignored
        instead of raising ``IndexError`` from ``list.pop``.
        """
        if not self._tags_stack:
            return
        if '/'.join(self._tags_stack) == _MOVIE_PATH:
            # Leaving the element that started the current movie.
            self._new_movie = False
        self._tags_stack.pop()

    def handle_data(self, data):
        """Record text found directly inside a ``<p>`` of the current movie."""
        path = '/'.join(self._tags_stack)
        # To discover the path of interesting content, temporarily add:
        #     print(path, data)

        if self._new_movie and path.endswith('/p'):
            self.movies[-1].attrs.append(('movie_intro', data))


if __name__ == '__main__':
    # Fetch the chart page, parse out the movies, download each movie's
    # image next to this script, and print the elapsed time in seconds.
    t1 = time.time()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115'}
    x = requests.get('https://movie.douban.com/chart', headers=headers)
    # Fail loudly on an HTTP error status instead of silently parsing an
    # error page, which would just yield an empty movie list.
    x.raise_for_status()
    movieparser = DouBanMovieRankParser()
    movieparser.feed(x.content)

    # Images go into a 'doubanmovieimg' directory beside this script.
    parent_dir = os.path.dirname(os.path.abspath(__file__))
    imgpath = os.path.join(parent_dir, 'doubanmovieimg')
    if not os.path.exists(imgpath):
        os.makedirs(imgpath)

    for m in movieparser.movies:
        m.downloadImg(imgpath, headers)
        # To inspect what was collected for each movie, temporarily add:
        #     print(m)

    print(time.time() - t1)
posted @ 2016-09-01 22:38  TinaGao  阅读(250)  评论(0)  编辑  收藏  举报