爬取豆瓣电影Top250信息

心血来潮,爬取豆瓣电影Top250信息,几个课题记录下:

  分两种数据解析方式:

    第一为parsel;

    第二为将html数据转换成python对象,利用xpath进行解析,对xpath进行复习,用到lxml里的etree。

  数据保存方式:

    第一为csv;

    第二为openpyxl;

    第三为pandas;

    第四种为通过pymysql保存到MySQL数据库。

  多线程抓取

首先是单线程版本:用parsel解析数据,用csv保存数据。豆瓣电影Top250的爬虫代码基本不变,各版本大致一样。

 1 import csv
 2 import re
 3 import requests
 4 import parsel
 5 import time
 6 import random
 7 
 8 f = open('豆瓣top250电影信息.csv', mode='a', encoding='utf-8-sig', newline='')
 9 csvWriter = csv.DictWriter(f, fieldnames=[
10     '电影名',
11     '外文名',
12     '港台名',
13     '是否可播放',
14     '上映年份',
15     '详情地址',
16     '导演',
17     '编剧',
18     '主演',
19     '评分',
20     '评分人数',
21 ])
22 
23 csvWriter.writeheader() # 写入头
24 headers = {
25     'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36',
26 }
27 
def _fetch_credits(detail_url):
    """Fetch a movie detail page and return ``(director, scenarist, actors)``.

    All three values are strings; a field that cannot be located on the page
    (or a missing ``detail_url``) yields ``''`` so that values never leak over
    from a previously processed movie.
    """
    director = scenarist = actors = ''
    if not detail_url:
        return director, scenarist, actors

    detail_response = requests.get(url=detail_url, headers=headers, timeout=10)
    detail_selector = parsel.Selector(detail_response.text)

    for info in detail_selector.css('#info'):
        director = info.css('span:nth-child(1) a::text').get() or ''
        scenarist = ' / '.join(info.css('span:nth-child(3) .attrs a::text').getall())
    for cast in detail_selector.css('.actor .attrs'):
        actors = ' / '.join(cast.css('span a::text').getall())

    return director, scenarist, actors


def get_data(url):
    """Scrape one Top250 list page and append one CSV row per movie.

    For every ``#content ol li`` entry on the page at ``url`` this reads the
    titles, playable flag, release year, rating and rating count from the list
    page, then requests the movie's detail page for director, screenwriter and
    cast. Each movie is printed and written through the module-level
    ``csvWriter``.

    Args:
        url: Address of the list page to scrape.

    Raises:
        requests.exceptions.RequestException: if a list or detail request
            fails or exceeds the 10 second timeout.
    """
    response = requests.get(url=url, headers=headers, timeout=10)
    selector = parsel.Selector(response.text)

    for item in selector.css('#content ol li'):
        title = item.css('.info .hd a span:nth-child(1)::text').get()  # Chinese title

        # Foreign-language title; drop the '/' separator and the whitespace
        # that surrounds it.
        raw_foreign = item.css('.info .hd a span:nth-child(2)::text').get()
        if raw_foreign is None:
            foreignTitle = '未定!'
        else:
            foreignTitle = raw_foreign.strip().replace('/', '').strip()

        # Hong Kong / Taiwan title; same separator clean-up as above.
        raw_other = item.css('.info .hd a .other::text').get()
        if raw_other is None:
            gtTitle = '未定!'
        else:
            gtTitle = raw_other.strip().lstrip('/').strip()

        # Playable marker, with its surrounding square brackets removed.
        raw_playable = item.css('.playable::text').get()
        if raw_playable is None:
            canbePlayed = '未知!'
        else:
            canbePlayed = raw_playable.strip('[').strip(']')

        # The second text node of the description paragraph starts with the
        # release year, followed by '/'-separated fields.
        movieinfo = item.css('.bd p::text').getall()
        if len(movieinfo) > 1:
            releaseYear = movieinfo[1].split('/')[0].strip()
        else:
            releaseYear = '未知!'

        # Director, screenwriter and cast are read from the detail page.
        detaiPage = item.css('.hd a::attr(href)').get()
        director, scenarist, actors = _fetch_credits(detaiPage)

        reviewScore = item.css('.star .rating_num::text').get()
        raw_count = item.css('.star span:nth-child(4)::text').get()
        # Strip the trailing "人评价" characters, leaving only the number.
        reviewCount = raw_count.strip('人评价') if raw_count is not None else '未知!'

        print(title, foreignTitle, gtTitle, canbePlayed, releaseYear, detaiPage, director, scenarist, actors, reviewScore
        , reviewCount, sep=' | ')
        dit = {
            '电影名': title,
            '外文名': foreignTitle,
            '港台名': gtTitle,
            '是否可播放': canbePlayed,
            '上映年份': releaseYear,
            '详情地址': detaiPage,
            '导演': director,
            '编剧': scenarist,
            '主演': actors,
            '评分': reviewScore,
            '评分人数': reviewCount,
        }
        csvWriter.writerow(dit)  # one row per movie
93 
# Top250 is 250 entries served 25 per page, so the valid `start` offsets are
# 0, 25, ..., 225 (10 pages). The file is closed in `finally` so buffered rows
# are flushed to disk even if a request fails part-way through.
try:
    for page in range(0, 250, 25):
        print(f'--------------------------------------------正在采集第{page // 25 + 1}页数据--------------------------------------------')
        url = f'https://movie.douban.com/top250?start={page}&filter='
        # Random pause between pages to avoid hammering the site.
        time.sleep(random.uniform(2, 5))
        get_data(url)
finally:
    f.close()

 

posted @ 2021-12-13 15:44  、一叶孤城  阅读(295)  评论(0)  编辑  收藏  举报