# Datawhale crawler task 01
# Use requests + regular expressions to crawl the Douban Movie Top 250 chart.
# Extracted fields: rank, title, country, director, score -> top250.csv.
import re
import csv
import time


class doubanTop250():
    """Scraper for the Douban Top 250 movie chart."""

    def __init__(self):
        # Per-instance result list. (The original used a shared *class*
        # attribute, which leaked accumulated results between instances.)
        self.film_list = []

    # 1. Send the HTTP request for one result page.
    def send_request(self, url):
        """GET `url` with a browser User-Agent and return the response.

        Douban rejects requests without a browser-like User-Agent header.
        """
        # Imported lazily so the module can be imported (and parse/save_data
        # used) in environments where `requests` is not installed.
        import requests
        headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
        response = requests.get(url=url, headers=headers)
        print(response.status_code)
        return response

    # 2. Extract the movie fields from one page of HTML.
    def parse(self, response):
        """Parse one page and append one dict per film to self.film_list.

        NOTE(review): the patterns are tied to Douban's current markup and
        silently yield nothing if the page layout changes — verify against
        a live page before relying on the output.
        """
        data = response.content.decode('utf-8')
        ranks = re.findall(r'<em class="">(\d+)</em>', data)
        # Non-greedy up to the closing quote: the original pattern
        # 'alt="(.*) src=' captured the trailing '"' of the alt attribute.
        names = re.findall(r'<img width="100" alt="(.*?)" src=', data)
        countries = re.findall(r' / (.*) / ', data)
        directors = re.findall(r'导演:(.*)', data)
        scores = re.findall(r'<span class="rating_num" property="v:average">(.*)</span>', data)
        # zip() stops at the shortest list, so a malformed page can no
        # longer raise IndexError the way positional indexing did.
        for rank, name, country, director, score in zip(ranks, names, countries, directors, scores):
            self.film_list.append({
                'rank': rank,
                'name': name,
                'country': country,
                'director': director,
                'score': score,
            })

    # 3. Write the collected films to top250.csv.
    def save_data(self):
        """Dump self.film_list to top250.csv (header row + one row per film)."""
        if not self.film_list:
            # Nothing scraped — avoid the IndexError on film_list[0].
            return
        # newline='' stops the csv module from emitting blank rows on
        # Windows; the with-block guarantees the file is closed.
        with open('top250.csv', 'w', encoding='utf-8', newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(self.film_list[0].keys())
            csv_writer.writerows(film.values() for film in self.film_list)

    # 4. Fetch all pages, parse them, then save the results.
    def run(self):
        base_url = "https://movie.douban.com/top250?start="
        # start = 0, 25, ..., 225 -> 10 pages x 25 films = 250 entries.
        # (The original range(0, 225, 25) stopped at start=200 and only
        # collected 225 films.)
        for start in range(0, 250, 25):
            response = self.send_request(base_url + str(start))
            self.parse(response)
            time.sleep(5)  # be polite to the server between page fetches
        self.save_data()


if __name__ == "__main__":
    # Guarded so importing this module does not trigger network requests.
    doubanTop250().run()