1 from bs4 import BeautifulSoup
 2 import requests
 3 import urllib.request as req
 4 import xlwt
 5 
 6 class Spider(object):
 7     def __init__(self):
 8         self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'}
 9         self.proxies = proxies
10         self.url = 'https://movie.douban.com/top250'
11 
12     def get_url(self, num):
13         next_url = 'https://movie.douban.com/top250?start=%d&filter=' % num
14         return next_url
15 
16     def run(self):
17         movie_index_list = []
18         movie_name_list = []
19         hero_name_list = []
20         movie_pic_list = []
21         movie_link_list = []
22         movie_score_list = []
23         movie_content_list = []
24         for num in range(10):
25             next_url = self.get_url(num * 25)
26             response = requests.get(next_url, proxies=self.proxies, headers=self.headers).text
27             ret = BeautifulSoup(response, "html.parser")
28             for tag in ret.find_all(attrs={"class": "item"}):
29                 movie_index = tag.find('em').get_text()
30                 movie_name = tag.find(attrs={"class": "title"}).get_text()
31                 hero_name = tag.find('p').get_text().strip().split(' ')[1]
32                 movie_pic = tag.find('img').get('src')
33                 movie_link = tag.find('a').get("href")
34                 movie_score = tag.find(attrs={"class": "rating_num"}).get_text()
35                 movie_content = tag.find_all('span')[-2].get_text()
36                 movie_index_list.append(movie_index)
37                 movie_name_list.append(movie_name)
38                 hero_name_list.append(hero_name)
39                 movie_pic_list.append(movie_pic)
40                 movie_link_list.append(movie_link)
41                 movie_score_list.append(movie_score)
42                 movie_content_list.append(movie_content)
43 
44         return movie_index_list, movie_name_list, hero_name_list, movie_pic_list, movie_link_list, movie_score_list, movie_content_list
45 
46     def write_to_excel(self):
47         movie_index, movie_name, hero_name, movie_pic, movie_link, movie_score, movie_content = self.run()
48         workbook = xlwt.Workbook(encoding='utf-8')
49         worksheet = workbook.add_sheet('sheet1')
50 
51         for i in range(250):
52             for j in range(7):
53                 if j == 0:
54                     worksheet.write(i, 0, movie_index[i])
55                 elif j == 1:
56                     worksheet.write(i, 1, movie_name[i])
57                 elif j == 2:
58                     worksheet.write(i, 2, hero_name[i])
59                 elif j == 3:
60                     worksheet.write(i, 3, movie_pic[i])
61                 elif j == 4:
62                     worksheet.write(i, 4,  movie_link[i])
63                 elif j == 5:
64                     worksheet.write(i, 5, movie_score[i])
65                 else:
66                     worksheet.write(i, 6, movie_content[i])
67         workbook.save('./spider.xls')
68 
if __name__ == '__main__':
    # Script entry point: build a spider and immediately run the full
    # scrape-and-export pipeline through write_to_excel().
    Spider().write_to_excel()

运行结果如下图:

 

posted on 2020-12-09 17:22  风雨无阻!  阅读(205)  评论(0)  编辑  收藏  举报