Python语言学习（七）1.2

HTMLParser：

feed：向解析器喂数据

handle_starttag(tag,attrs)：处理开始标签

handle_data(data)：处理标签之间的文本内容，data 为该段文本

handle_endtag(tag)：处理结束标签（结束标签没有属性，因此只有 tag 一个参数）

#!/usr/bin/python
# encoding: utf-8


from __future__ import print_function
from HTMLParser import HTMLParser
import requests
import os
import time

# '/'-joined chain of open tags that marks the start of one movie entry.
# DouBanMovieRankParser compares this against its own tag stack on every
# start tag and on every end tag.
# NOTE(review): the repeated meta/link segments suggest tags that never
# receive an end tag stay on the parser's stack; the value is presumably tied
# to the exact markup of the chart page when this was written. Re-derive it
# with the debug print in handle_starttag if no movies are found.
_MOVIE_PATH = 'html/head/meta/meta/meta/meta/meta/link/link/link/link/body/div/div/div/div/div/div/div/div/table'


class Movie(object):
    """One movie entry, stored as an ordered list of ``(key, value)`` pairs.

    ``attrs`` is a list rather than a dict, so insertion order is kept and
    the same key may appear more than once.
    """

    def __init__(self):
        # Ordered (key, value) tuples; appended to from outside and by
        # downloadImg().
        self.attrs = []

    def __str__(self):
        """Return one ``key = value`` line per attribute, joined with CRLF."""
        return '\r\n'.join('{0} = {1}'.format(k, v) for k, v in self.attrs)

    def downloadImg(self, imgpath, headers):
        """Download the movie's image into the directory *imgpath*.

        The URL is taken from the ``'movie_img_url'`` attribute; if that key
        occurs several times the last value is used.

        Args:
            imgpath: Existing directory the image file is written into.
            headers: Dict of HTTP request headers sent with the download.

        Returns:
            The local file path of the saved image, or ``None`` when the
            movie has no ``'movie_img_url'`` attribute (or its value is
            ``None``). On success a ``('movie_img_localpath', path)`` pair
            is also appended to ``attrs``.
        """
        imgurl = None
        for k, v in self.attrs:
            if k == 'movie_img_url':
                imgurl = v
        if imgurl is None:
            return None

        # The file keeps the last path segment of the URL as its name.
        imgname = imgurl.split('/')[-1]
        imglocalpath = os.path.join(imgpath, imgname)
        # Pass the headers by keyword: the second positional parameter of
        # requests.get() is ``params``, so a positional dict would be sent
        # as query-string parameters instead of HTTP headers.
        img = requests.get(imgurl, headers=headers)
        with open(imglocalpath, 'wb') as f:
            f.write(img.content)
        self.attrs.append(('movie_img_localpath', imglocalpath))
        return imglocalpath


class DouBanMovieRankParser(HTMLParser):
    """Collect movie entries from the HTML of the Douban chart page.

    The parser keeps the tags it has seen opened on a stack. Whenever the
    '/'-joined stack equals the movie path, a new ``Movie`` is appended to
    ``self.movies`` and the "inside a movie" flag is raised. While the flag
    is up, link, title, image URL and ``<p>`` text are appended to the most
    recent movie's ``attrs``. The flag is lowered when an end tag arrives
    while the stack path again equals the movie path.
    """

    def __init__(self, movie_path=None):
        """Create a parser.

        Args:
            movie_path: '/'-joined tag path that marks the start of one
                movie entry. Defaults to the module-level ``_MOVIE_PATH``.
        """
        # Explicit base-class call, as in the original code.
        HTMLParser.__init__(self)
        self._movie_path = _MOVIE_PATH if movie_path is None else movie_path
        self._tags_stack = []  # stack of opened tags, last in first out

        self.movies = []
        self._new_movie = False

    def reset(self):
        """Reset parser state; already collected movies are kept."""
        HTMLParser.reset(self)
        self._tags_stack = []  # stack of opened tags, last in first out
        self._new_movie = False

    def handle_starttag(self, tag, attrs):
        """Push *tag* on the stack and record any movie fields it carries.

        Args:
            tag: Name of the start tag.
            attrs: List of ``(name, value)`` attribute pairs of the tag.
        """

        def _getattr(attrname):
            # First matching attribute wins; None when the tag lacks it.
            for k, v in attrs:
                if k == attrname:
                    return v
            return None

        self._tags_stack.append(tag)
        path = '/'.join(self._tags_stack)

        # Uncomment to inspect the path of the content you are interested in:
        # print (path)

        if path == self._movie_path:
            self._new_movie = True
            self.movies.append(Movie())

        if not self._new_movie:
            return

        current = self.movies[-1]
        if tag == 'a' and _getattr('class') == 'nbg':
            current.attrs.append(('movie_url', _getattr('href')))
            current.attrs.append(('movie_name', _getattr('title')))
        elif tag == 'img':
            current.attrs.append(('movie_img_url', _getattr('src')))

    def handle_endtag(self, tag):
        """Pop the top of the tag stack, leaving movie mode if appropriate.

        The popped entry is whatever is on top of the stack; it is not
        matched against *tag*.
        """
        if '/'.join(self._tags_stack) == self._movie_path:
            self._new_movie = False
        # A stray end tag with nothing open must not crash the parser.
        if self._tags_stack:
            self._tags_stack.pop()

    def handle_data(self, data):
        """Record text found directly inside a ``<p>`` of the current movie."""
        path = '/'.join(self._tags_stack)
        # Uncomment to inspect the path of the content you are interested in:
        # print (path,data)

        if self._new_movie and path.endswith('/p'):
            self.movies[-1].attrs.append(('movie_intro', data))


if __name__ == '__main__':
    # Time the whole run: page fetch, parse, and image downloads.
    started = time.time()

    # Browser-like User-Agent sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115'}

    # Fetch the chart page and feed its raw body to the parser.
    response = requests.get('https://movie.douban.com/chart', headers=headers)
    chart_parser = DouBanMovieRankParser()
    chart_parser.feed(response.content)

    # Images are stored in a 'doubanmovieimg' folder next to this script.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    img_dir = os.path.join(script_dir, 'doubanmovieimg')
    if not os.path.exists(img_dir):
        os.makedirs(img_dir)

    for movie in chart_parser.movies:
        movie.downloadImg(img_dir, headers)
        # print(movie)

    # Elapsed seconds.
    print(time.time() - started)

posted @ 2016-09-01 22:38 TinaGao 阅读(250) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

Python语言学习 （七）1.2

公告

Python语言学习（七）1.2