【BOOK】【实例】【requests库+正则表达式】猫眼TOP100电影排名爬取
猫眼电影TOP100页面爬取
https://maoyan.com/board/4
"""Scrape the Maoyan TOP100 movie board (https://maoyan.com/board/4).

Fetches each ranking page, extracts the movie fields with a compiled
regular expression, and appends one JSON object per movie to film.txt.
"""
import re
import json
import time


def get_one_page(url):
    """Fetch one board page and return its HTML text, or None on failure."""
    # requests is imported lazily so the parsing/serialization helpers can
    # be used without the third-party dependency installed.
    import requests
    try:
        headers = {
            # Desktop browser UA: the site rejects the default requests UA.
            'user-agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/63.0.3239.132 Safari/537.36'),
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        # Only network/HTTP errors are expected here; a bare except would
        # also swallow KeyboardInterrupt and programming errors.
        print('爬取失败!')
        return None


def parse_one_page(response):
    """Yield one dict per movie parsed from the board page HTML.

    Each dict carries rank, poster URL, title, lead actors, release date
    and score; the Chinese label prefixes in the actors and release-time
    fields are stripped off.
    """
    # Raw string so \d is a regex escape, not a (deprecated) string escape.
    # re.S lets .*? span the newlines inside each <dd> block.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)"'
        r'.*?name.*?a.*?>(.*?)</a>.*?star.*?>(.*?)</p>'
        r'.*?releasetime.*?>(.*?)</p>.*?integer.*?>(.*?)</i>'
        r'.*?fraction.*?>(.*?)</i>.*?</dd>',
        re.S)
    for item in re.findall(pattern, response):
        yield {
            '排名': item[0],
            '海报': item[1],
            '片名': item[2].strip(),
            # Drop the 3-char "主演:" label; guard against short/empty text.
            '主演': item[3].strip()[3:] if len(item[3]) > 3 else '',
            # Drop the 5-char "上映时间:" label.
            '上映时间': item[4].strip()[5:] if len(item[4]) > 5 else '',
            # Integer and fractional score halves live in two separate <i> tags.
            '评分': item[5].strip() + item[6].strip(),
        }


def write_to_file(content):
    """Append one movie record to film.txt as a UTF-8 JSON line."""
    with open('film.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    """Scrape and persist one page of the board at the given offset."""
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    response = get_one_page(url)
    if response is None:
        # Fetch failed (get_one_page already reported it); skip this page
        # instead of crashing inside re.findall(pattern, None).
        return
    for item in parse_one_page(response):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    for i in range(10):
        main(offset=i * 10)
        # Throttle requests: hitting the site too fast triggers anti-scraping.
        time.sleep(1)