# Maoyan movie Top-100 scraper (猫眼电影爬取)
'''Scrape the Maoyan movie Top-100 board and save the results to CSV.'''
from urllib import request
import re
import csv
import time
import random
import os


class MaoyanSpider:
    """Crawl https://maoyan.com/board/4 and write (name, stars, release time) rows to ./maoyan.csv."""

    def __init__(self):
        # Board URL template; `offset` selects the page (10 films per page).
        self.url = 'https://maoyan.com/board/4?offset={}'
        # Pool of User-Agent strings; one is picked at random per request to
        # reduce the chance of being blocked.
        self.ua_list = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201'
        ]
        # Compiled once (was recompiled on every page): captures
        # (title, star line, release-time line) from each movie entry.
        self.pattern = re.compile(
            r'<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?class="releasetime">(.*?)</p>',
            re.S)

    # Fetch one page
    def get_page(self, url):
        """Download one board page with a random User-Agent and pass the HTML to the parser."""
        headers = {'User-Agent': random.choice(self.ua_list)}
        req = request.Request(url=url, headers=headers)
        # Context manager closes the HTTP response even if read/decode raises;
        # the original never closed it and leaked the connection.
        with request.urlopen(req) as res:
            html = res.read().decode('utf-8')
        self.parse_page(html)

    # Parse one page
    def parse_page(self, html):
        """Extract all (name, stars, release time) tuples from *html* and persist them."""
        r_list = self.pattern.findall(html)
        self.save_info(r_list)

    # Save records
    def save_info(self, r_list):
        """Append the scraped records to ./maoyan.csv in a single writerows call (less IO)."""
        # Strip surrounding whitespace from every captured field first.
        film_list = [(rt[0].strip(), rt[1].strip(), rt[2].strip()) for rt in r_list]
        with open('./maoyan.csv', 'a', encoding='utf-8', newline='') as f:
            csv.writer(f).writerows(film_list)

    # Entry point
    def main(self):
        """Reset the CSV file, write the header row, then crawl all ten board pages."""
        # Start from a clean file so reruns don't accumulate duplicate rows.
        if os.path.exists('./maoyan.csv'):
            os.remove('./maoyan.csv')
        with open('./maoyan.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['电影名称', '主演', '上映时间'])
        # Pages live at offset 0, 10, ..., 90.
        for page_no, offset in enumerate(range(0, 91, 10), start=1):
            self.get_page(self.url.format(offset))
            print('第{}页成功下载'.format(page_no))
            # time.sleep(random.randint(1, 4))  # optional politeness delay


if __name__ == '__main__':
    start = time.time()
    spider = MaoyanSpider()
    spider.main()
    end = time.time()
    print('程序执行时间为: %.2f' % (end - start))