[案例]豆瓣电影信息爬取

import json
import os

import requests
from lxml import etree
from lxml.etree import _Element


class DoubanMovieSpider(object):
    """Scrape the Douban movie chart page and dump its entries to a JSON file.

    The pipeline is ``get_data`` (download) -> ``parse_data`` (extract one
    dict per movie row) -> ``save_data`` (write JSON); ``run`` chains the
    three steps.
    """

    def __init__(self):
        self.url = "https://movie.douban.com/chart"
        self.headers = {
            "Host": "movie.douban.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
        }
        # Destination of the JSON dump. The file is deliberately NOT opened
        # here: opening it in "w" mode at construction time would truncate
        # any previous result before a single byte has been downloaded, and
        # would leave the handle open until garbage collection.
        self.file_path = "douban.json"
        # Seconds to wait for the server; without a timeout the HTTP call
        # can block indefinitely.
        self.timeout = 10

        # Process-wide setting: asks HTTP clients that honour NO_PROXY to
        # bypass any configured proxy for every host.
        os.environ["NO_PROXY"] = "*"

    def get_data(self) -> bytes:
        """Download the chart page and return the raw response body.

        Returns:
            The undecoded bytes of the HTTP response.

        Raises:
            requests.RequestException: on connection problems, on timeout,
                or when the server answers with a 4xx/5xx status.
        """
        resp = requests.get(self.url, headers=self.headers, timeout=self.timeout)
        # Fail loudly on an error page instead of parsing it into an empty
        # result and overwriting the output file with "[]".
        resp.raise_for_status()
        return resp.content

    def parse_data(self, data: bytes) -> list:
        """Extract the movie entries from the chart page HTML.

        Every ``//table/tr[@class='item']`` row yields one dict with:

        * ``title``      - text of the first link in the ``valign='top'``
          cell, with surrounding newlines, spaces and ``/`` stripped
        * ``href``       - ``href`` attribute of that same link
        * ``img``        - ``src`` of the image inside the ``a.nbg`` link
        * ``rating_num`` - text of ``span.rating_nums``, or ``"暂无评分"``
          when the row has no such span

        Args:
            data: Raw page bytes; decoded with ``bytes.decode()`` defaults.

        Returns:
            A list of dicts, one per row, in document order.

        Raises:
            IndexError: if a row lacks the title link, its ``href`` or the
                poster image.
        """
        etree_html = etree.HTML(data.decode())  # type: _Element

        result_list = []
        for row in etree_html.xpath("//table/tr[@class='item']"):  # type: _Element
            title = row.xpath("./td[@valign='top']/div/a")[0].text.strip("\n /")
            href = row.xpath("./td[@valign='top']/div/a/@href")[0]
            img = str(row.xpath("./td[@valign='top']/a[@class='nbg']/img/@src")[0])

            # Only a missing rating is an expected condition; an empty XPath
            # result raises IndexError on [0]. Anything else should surface.
            try:
                rating_num = row.xpath("./td/div/div/span[@class='rating_nums']/text()")[0]
            except IndexError:
                rating_num = "暂无评分"

            result_list.append({
                "title": title,
                "href": href,
                "img": img,
                "rating_num": rating_num,
            })

        return result_list

    def save_data(self, data) -> None:
        """Write ``data`` to ``self.file_path`` as pretty-printed UTF-8 JSON.

        The file is rewritten on every call, so it always holds exactly one
        JSON document, and it is closed before the method returns.

        Args:
            data: Any JSON-serialisable object (normally the list returned
                by ``parse_data``).
        """
        # Serialise first so a serialisation error cannot leave a truncated
        # file behind.
        payload = json.dumps(data, ensure_ascii=False, indent=4)
        with open(self.file_path, "w", encoding="utf-8") as fh:
            fh.write(payload)

    def run(self):
        """Download the chart page, parse it and save the result."""
        data = self.get_data()
        parsed = self.parse_data(data)
        self.save_data(parsed)


if __name__ == "__main__":
    # Script entry point: build the spider and run the whole
    # download -> parse -> save pipeline once.
    spider = DoubanMovieSpider()
    spider.run()

posted @ 2023-07-17 11:10  蕝戀  阅读(10)  评论(0)  编辑  收藏  举报