Multiprocess Crawler

The script below scrapes the Maoyan Top 100 board (https://maoyan.com/board/4), parses each page with a single regular expression, writes the records to dianying.txt as one JSON object per line, and uses a multiprocessing Pool to fetch the ten pages in parallel.
import json
import re
from multiprocessing import Pool

import requests
from requests.exceptions import RequestException


def get_one_page(url):
    # Fetch one page of the board; return the HTML, or None on any failure.
    try:
        res = requests.get(url)
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    # One regex matches each <dd> block and captures the seven fields in order.
    pat = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pat, html)
    for item in items:
        yield {
            'rank': item[0],
            'image': item[1],
            'title': item[2],
            'actors': item[3].strip()[3:],        # strip() removes whitespace; [3:] drops the "主演:" label
            'release_time': item[4].strip()[5:],  # [5:] drops the "上映时间:" label
            'score': item[5] + item[6].strip()    # integer part + fractional part
        }


def write_to(content):
    # Append one JSON record per line; ensure_ascii=False keeps Chinese text readable.
    with open('dianying.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        return
    for item in parse_one_page(html):
        print(item)
        write_to(item)


if __name__ == '__main__':
    # Serial version, for comparison:
    # for i in range(10):
    #     main(i * 10)
    pool = Pool()
    pool.map(main, [i * 10 for i in range(10)])
    pool.close()
    pool.join()
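In practice Maoyan tends to serve a verification page to clients that do not look like a browser, so a plain requests.get may return something the regex cannot match. A minimal hardened fetcher is sketched below; the User-Agent string and the 10-second timeout are assumed values chosen for illustration, not part of the original script.

import requests
from requests.exceptions import RequestException

# Assumed browser-like header; any recent browser UA string would do.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
}

def get_one_page(url):
    # Same contract as before: return the HTML, or None on any failure.
    try:
        res = requests.get(url, headers=HEADERS, timeout=10)  # timeout is an assumption
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        return None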
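A note on the parallel part: Pool() with no argument starts one worker per CPU core (os.cpu_count()), and pool.map blocks until every offset has been processed, returning results in submission order. If you prefer to submit tasks one by one and wait explicitly, an equivalent sketch using apply_async (reusing the main function above) looks like this:

from multiprocessing import Pool

if __name__ == '__main__':
    pool = Pool()  # defaults to os.cpu_count() workers
    # Submit each offset without blocking, then wait for all tasks to finish.
    results = [pool.apply_async(main, (i * 10,)) for i in range(10)]
    pool.close()
    pool.join()

For this crawler the two forms behave the same; apply_async mainly becomes useful when tasks should start as soon as they are submitted or need individual error handling via result.get().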