Python爬虫练习——爬取豆瓣TOP250
main.py
from time import time
from module.action import Action as action
from module.save import Save as save
from module.tools import Tools as tools
import time


def main():
    """Crawl the Douban Top 250 list and print the details of every movie.

    The list is served 25 entries per page, so the ``start`` offset walks
    0, 25, ..., 225. Each detail page is fetched in turn, pausing one second
    after each request to keep the request rate low.
    """
    list_url_template = "https://movie.douban.com/top250?start={page}&filter="
    offset = 0
    while offset < 250:
        list_url = list_url_template.format(page=offset)
        detail_urls = action.get_movie_list(list_url)
        for detail_url in detail_urls:
            action.get_movie_info(detail_url)
            time.sleep(1)
        offset += 25


if __name__ == "__main__":
    main()
action.py
from email import header
from ntpath import join

import requests
from lxml import etree

from module.tools import Tools as tools
from module.tools import Tools


class Action:
    """Scrapers for the Douban Top 250 list pages and movie detail pages.

    All methods are static; call them on the class, e.g.
    ``Action.get_movie_list(url)``.
    """

    # Browser-like User-Agent sent with every request.
    _HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36"}

    # Seconds to wait for the server before giving up, so a stalled
    # connection cannot hang the crawl forever.
    _TIMEOUT = 10

    @staticmethod
    def _fetch_tree(url):
        """Download ``url`` and return it parsed as an lxml HTML tree."""
        response = requests.get(url, headers=Action._HEADERS, timeout=Action._TIMEOUT)
        response.encoding = "utf-8"
        return etree.HTML(response.text)

    @staticmethod
    def _first(nodes, default=""):
        """Return the first XPath match, or ``default`` when nothing matched."""
        return nodes[0] if nodes else default

    # Fetch the list of detail-page links
    @staticmethod
    def get_movie_list(base_url):
        """Return the movie detail-page URLs found on one Top 250 list page.

        Args:
            base_url: URL of a list page.

        Returns:
            A list of ``href`` strings; empty when the page could not be
            parsed or contains no movie entries.
        """
        html = Action._fetch_tree(base_url)
        if html is None:
            return []
        movie_urls = html.xpath('//div[@class="info"]/div[@class="hd"]/a/@href')
        return movie_urls

    # Fetch the details of one movie
    @staticmethod
    def get_movie_info(page_url):
        """Fetch one movie detail page and print its main fields.

        Prints title, director, up to three actors, genres, country/region,
        language and rating. A field that is missing from the page is printed
        as an empty string instead of aborting the crawl.

        Args:
            page_url: URL of a movie detail page.
        """
        html = Action._fetch_tree(page_url)
        if html is None:
            # Nothing parseable came back; skip this movie.
            return

        # Extract the fields
        movie_name = Action._first(html.xpath('//div[@id="content"]/h1/span/text()'))
        regisseur = Action._first(html.xpath('//*[@id="info"]/span[1]/span[2]/a/text()'))
        actors_ret = html.xpath('//*[@id="info"]/span[3]/span[2]/a/text()')
        actors = "/".join(actors_ret[0:3])
        genre_ret = html.xpath('//*[@id="info"]/span[@property="v:genre"]/text()')
        genres = "/".join(genre_ret)

        # Country and language are bare text nodes following a label span, so
        # they are cut out of the serialized markup rather than matched by XPath.
        html_txt = etree.tostring(html, encoding="utf-8").decode("utf-8")
        # getmidstring may yield None when a marker is absent; fall back to "".
        country = tools.getmidstring(html_txt, "制片国家/地区:</span>", "<br/>") or ""
        language = tools.getmidstring(html_txt, "语言:</span>", "<br/>") or ""
        score = Action._first(html.xpath('//strong[@property="v:average"]/text()'))

        # Print the fields
        print("电影名:"+movie_name)
        print("导演:"+regisseur)
        print("演员:"+actors)
        print("类型:"+genres)
        print("国家地区:"+country)
        print("语言:"+language)
        print("豆瓣评分:"+score)
        print("++++++++++++++++++++++++++++++++++++++++++++++++++")
tools.py
import os
import glob


class Tools:
    """Small string and filesystem helpers used by the crawler.

    All methods are static; call them on the class, e.g.
    ``Tools.getmidstring(text, "<b>", "</b>")``.
    """

    # Extract the text between two markers
    @staticmethod
    def getmidstring(html, start_str, end):
        """Return the stripped text between ``start_str`` and ``end``.

        Args:
            html: Text to search.
            start_str: Marker that precedes the wanted text.
            end: Marker that follows the wanted text; searched for only
                after the end of ``start_str``.

        Returns:
            The text between the two markers with surrounding whitespace
            removed, or ``None`` when either marker is not found.
        """
        start = html.find(start_str)
        if start < 0:
            return None
        start += len(start_str)
        stop = html.find(end, start)
        if stop < 0:
            return None
        return html[start:stop].strip()

    # Create a directory
    # If the directory already exists, create <path>_1 (_2, _3, ...) instead
    # Return the path of the directory to use
    @staticmethod
    def mkdir(path, root_flag=False):
        """Create ``path``, or a numbered sibling when it already exists.

        Args:
            path: Directory to create.
            root_flag: When true and ``path`` already exists, reuse it
                instead of creating a numbered sibling.

        Returns:
            ``path`` if it was created or reused; otherwise the newly created
            ``<path>_<n>``, where ``n`` is one more than the highest numeric
            suffix already present (starting at 1).
        """
        if not os.path.exists(path):
            os.makedirs(path)
            return path
        if root_flag:
            return path

        prefix = path + "_"
        highest = 0
        # Escape the prefix so glob metacharacters in ``path`` match literally,
        # and compare suffixes numerically because glob order is arbitrary.
        for existing in glob.glob(glob.escape(prefix) + "*"):
            suffix = existing[len(prefix):]
            if suffix.isdecimal():
                highest = max(highest, int(suffix))

        folder_path = prefix + str(highest + 1)
        os.makedirs(folder_path)
        return folder_path
结果: