"""Scrape the Maoyan movie Top 100 board with requests + re + multiprocessing.

Each board page is fetched, parsed with a regular expression, and every
movie record is appended as one JSON line to ``maoyan.txt``.
"""
import json
import re
from multiprocessing import Pool

# Compiled once at module level: the raw string fixes the invalid-escape
# warning on \d, and re.S lets .*? span the newlines inside each <dd>.
MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?poster-default.*?src="(.*?)"'
    r'.*?name"><a.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)'
    r'</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>',
    re.S,
)

# Browser-like User-Agent: maoyan.com rejects the default requests UA.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
    )
}


def get_one_page(url):
    """Fetch one board page.

    :param url: full page URL
    :return: HTML text on HTTP 200, otherwise None
    """
    # Imported lazily so the module can be imported (and the pure parsing
    # helpers used/tested) without the third-party requests dependency.
    import requests
    from requests.exceptions import RequestException
    try:
        # timeout keeps a dead connection from hanging a worker forever
        response = requests.get(url, headers=HEADERS, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Best-effort scraper: treat any network error as "no page".
        return None


def parse_one_page(html):
    """Yield one dict per movie entry found in *html*.

    :param html: page HTML (may be empty)
    :return: generator of dicts with index/image/title/star/time/score
    """
    for item in MOVIE_PATTERN.findall(html):
        yield {
            "index": item[0],
            "image": item[1],
            "title": item[2],
            # strip()[3:] drops the "主演：" prefix from the cast field.
            "star": item[3].strip()[3:],
            # strip()[5:] drops the "上映时间：" prefix from the date field.
            "time": item[4].strip()[5:],
            # integer part ("9.") and fraction part ("5") are separate
            # elements in the markup; join them into one score string.
            "score": item[5] + item[6],
        }


def save_to_file(content):
    """Append *content* as one JSON line to maoyan.txt.

    :param content: JSON-serializable movie dict
    """
    with open("maoyan.txt", "a", encoding="utf-8") as f:
        # ensure_ascii=False keeps the Chinese titles human-readable.
        f.write(json.dumps(content, ensure_ascii=False) + "\n")


def main(offset):
    """Fetch, parse, and persist one board page at the given offset."""
    url = "https://maoyan.com/board/4?offset={}".format(offset)
    html = get_one_page(url)
    if html is None:
        # Fetch failed; skip this page instead of crashing the worker
        # (parse_one_page would raise TypeError on None).
        return
    for item in parse_one_page(html):
        save_to_file(item)


if __name__ == "__main__":
    # One task per page: map distributes the 10 requests across worker
    # processes so network waits overlap. The with-block guarantees the
    # pool is closed and joined even if a task raises.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])