Python爬虫练习——爬取豆瓣TOP250

main.py

from time import time
from module.action import Action as action
from module.save import Save as save
from module.tools import Tools as tools
import time

def main():
    """Crawl the Douban Top 250 listing and print the details of every movie.

    The listing is requested 25 entries at a time (``start`` = 0, 25, ..., 225).
    Each movie link found on a listing page is handed to
    ``action.get_movie_info``, followed by a one-second pause.
    """
    url_template = "https://movie.douban.com/top250?start={page}&filter="
    offset = 0
    while offset < 250:
        listing_url = url_template.format(page=offset)
        for detail_url in action.get_movie_list(listing_url):
            action.get_movie_info(detail_url)
            # Pause between detail requests.
            time.sleep(1)
        offset += 25


# Start the crawl only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

action.py

from email import header
from ntpath import join
import requests
from lxml import etree
from module.tools import Tools as tools

from module.tools import Tools
class Action:
    """Scraping helpers for the Douban Top 250 movie pages.

    All methods are static, so they are called on the class itself, e.g.
    ``Action.get_movie_list(url)``.
    """

    # Browser-like User-Agent header sent with every request.
    _HEADERS = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36"}
    # Seconds to wait for the server; without a timeout a stalled
    # connection would block the crawl forever.
    _TIMEOUT = 10

    @staticmethod
    def _fetch_html(url):
        """Download *url* and return the response body parsed by ``etree.HTML``.

        Raises:
            requests.HTTPError: if the server answers with an error status.
            requests.Timeout: if the server does not answer within ``_TIMEOUT`` seconds.
        """
        r = requests.get(url, headers=Action._HEADERS, timeout=Action._TIMEOUT)
        # Fail loudly on an error response instead of parsing an error page
        # as if it were a movie page.
        r.raise_for_status()
        r.encoding = "utf-8"
        return etree.HTML(r.text)

    @staticmethod
    def _first(matches, default=""):
        """Return the first item of *matches*, or *default* when it is empty."""
        return matches[0] if matches else default

    # Fetch the list of movie links
    @staticmethod
    def get_movie_list(base_url):
        """Return the movie detail-page URLs found on one listing page.

        Args:
            base_url: URL of a Top 250 listing page.

        Returns:
            A list with the ``href`` of every link matched inside the
            ``div.info > div.hd`` entries of the page.
        """
        html = Action._fetch_html(base_url)
        return html.xpath('//div[@class="info"]/div[@class="hd"]/a/@href')

    # Fetch the movie details
    @staticmethod
    def get_movie_info(page_url):
        """Fetch one movie detail page and print its main fields.

        Prints the title, director, up to three actors, genres,
        country/region, language and rating. A field that cannot be found
        on the page is printed as an empty string instead of raising.

        Args:
            page_url: URL of a movie detail page.
        """
        html = Action._fetch_html(page_url)

        # Extract the data
        movie_name = Action._first(html.xpath('//div[@id="content"]/h1/span/text()'))
        director = Action._first(html.xpath('//*[@id="info"]/span[1]/span[2]/a/text()'))
        actors = "/".join(html.xpath('//*[@id="info"]/span[3]/span[2]/a/text()')[0:3])
        genres = "/".join(html.xpath('//*[@id="info"]/span[@property="v:genre"]/text()'))
        # Country and language are cut out of the serialized markup between
        # their label and the next "<br/>" (presumably because they are not
        # wrapped in an element of their own -- verify against the page).
        html_txt = etree.tostring(html, encoding="utf-8").decode("utf-8")
        # getmidstring may return None when a marker is missing; fall back to
        # "" so the string concatenation below cannot raise TypeError.
        country = tools.getmidstring(html_txt, "制片国家/地区:</span>", "<br/>") or ""
        language = tools.getmidstring(html_txt, "语言:</span>", "<br/>") or ""
        score = Action._first(html.xpath('//strong[@property="v:average"]/text()'))

        # Print the data
        print("电影名："+movie_name)
        print("导演："+director)
        print("演员："+actors)
        print("类型："+genres)
        print("国家地区："+country)
        print("语言："+language)
        print("豆瓣评分："+score)
        print("++++++++++++++++++++++++++++++++++++++++++++++++++")

tools.py

import os
import glob
class Tools:
    """Small text and filesystem helpers.

    All methods are static, so they are called on the class itself, e.g.
    ``Tools.mkdir(path)``.
    """

    # Extract the text between two markers
    @staticmethod
    def getmidstring(html, start_str, end):
        """Return the stripped text found between *start_str* and *end*.

        Args:
            html: Text to search.
            start_str: Marker that precedes the wanted text.
            end: Marker that follows the wanted text; it is searched for
                only after the first occurrence of *start_str*.

        Returns:
            The text between the two markers with surrounding whitespace
            removed, or ``None`` when either marker is not found.
        """
        start = html.find(start_str)
        if start < 0:
            return None
        start += len(start_str)
        stop = html.find(end, start)
        if stop < 0:
            return None
        return html[start:stop].strip()

    # Create a directory
    # If the name is already taken, create <path>_1 (_2, _3, ...) instead
    # Return the name of the directory
    @staticmethod
    def mkdir(path, root_flag=False):
        """Create directory *path*, or a numbered sibling if it already exists.

        Args:
            path: Directory to create (missing parent directories are
                created as well).
            root_flag: When true and *path* already exists, nothing is
                created and *path* itself is returned.

        Returns:
            *path* when it was created (or already existed with
            ``root_flag=True``); otherwise the newly created
            ``<path>_<n>``, where ``n`` is one more than the highest numeric
            suffix already present next to *path* (``1`` if there is none).
        """
        if not os.path.exists(path):
            os.makedirs(path)
            return path
        if root_flag:
            return path

        # Look only at siblings named exactly "<name>_<digits>". The highest
        # number is taken numerically, so "_10" sorts after "_9", and neither
        # unrelated names ("<name>backup") nor an underscore inside the base
        # name itself can be mistaken for a counter.
        parent, name = os.path.split(path)
        prefix = name + "_"
        highest = 0
        for entry in os.listdir(parent or os.curdir):
            if not entry.startswith(prefix):
                continue
            suffix = entry[len(prefix):]
            if suffix.isdecimal():
                highest = max(highest, int(suffix))

        folder_path = "{}_{}".format(path, highest + 1)
        os.makedirs(folder_path)
        return folder_path

结果：

posted @ 2022-04-06 18:13 波罗斯の程序日记阅读(90) 评论(0) 编辑收藏举报

刷新页面返回顶部

波罗斯の程序日记

Python爬虫练习——爬取豆瓣TOP250

公告