"""Scrape the Douban Top 250 movie list and save the results to an .xls file."""

import urllib.request as req

import requests
import xlwt
from bs4 import BeautifulSoup


class Spider(object):
    """Download the Douban Top 250 listing pages and export them to Excel.

    ``run()`` fetches the listing pages and extracts seven values per item;
    ``write_to_excel()`` writes those values to ``./spider.xls``.
    """

    # Pagination used by run(): PAGE_COUNT pages, offset by PAGE_SIZE each.
    PAGE_SIZE = 25
    PAGE_COUNT = 10

    def __init__(self, proxies=None, timeout=10):
        """Set up request headers, proxy configuration and the base URL.

        Args:
            proxies: Optional proxies mapping passed straight to
                ``requests.get``; ``None`` means no explicit proxy.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'}
        # The original read an undefined global ``proxies`` here (NameError);
        # it is now an optional constructor argument.
        self.proxies = proxies
        self.timeout = timeout
        self.url = 'https://movie.douban.com/top250'

    def get_url(self, num):
        """Return the listing-page URL whose ``start`` offset is *num*."""
        next_url = 'https://movie.douban.com/top250?start=%d&filter=' % num
        return next_url

    @staticmethod
    def _parse_item(tag):
        """Extract the seven exported values from one ``class="item"`` tag.

        Returns a tuple in column order: index, name, hero name, picture URL,
        link, score, content.
        """
        # Second space-separated token of the first <p>'s text; fall back to
        # an empty string instead of raising IndexError when there is none.
        credit_parts = tag.find('p').get_text().strip().split(' ')
        hero_name = credit_parts[1] if len(credit_parts) > 1 else ''
        return (
            tag.find('em').get_text(),
            tag.find(attrs={"class": "title"}).get_text(),
            hero_name,
            tag.find('img').get('src'),
            tag.find('a').get("href"),
            tag.find(attrs={"class": "rating_num"}).get_text(),
            # NOTE(review): relies on the second-to-last <span> of the item;
            # confirm against the live page markup.
            tag.find_all('span')[-2].get_text(),
        )

    def run(self):
        """Fetch every listing page and collect the parsed values.

        Returns:
            A tuple of seven parallel lists: indexes, names, hero names,
            picture URLs, links, scores and contents.

        Raises:
            requests.RequestException: if a page cannot be fetched, times
                out, or returns an HTTP error status.
        """
        columns = tuple([] for _ in range(7))
        for page in range(self.PAGE_COUNT):
            next_url = self.get_url(page * self.PAGE_SIZE)
            response = requests.get(
                next_url,
                proxies=self.proxies,
                headers=self.headers,
                timeout=self.timeout,
            )
            # Fail loudly on an error response rather than parsing an error
            # page and silently producing no rows.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            for tag in soup.find_all(attrs={"class": "item"}):
                for column, value in zip(columns, self._parse_item(tag)):
                    column.append(value)

        return columns

    def write_to_excel(self):
        """Run the scrape and write one row per item to ``./spider.xls``."""
        columns = self.run()
        workbook = xlwt.Workbook(encoding='utf-8')
        worksheet = workbook.add_sheet('sheet1')

        # Write exactly as many rows as were scraped; the original assumed
        # 250 and raised IndexError when fewer items came back.
        for row_index, row in enumerate(zip(*columns)):
            for col_index, value in enumerate(row):
                worksheet.write(row_index, col_index, value)
        workbook.save('./spider.xls')


if __name__ == '__main__':
    sp = Spider()
    sp.write_to_excel()
运行结果如下图: