Scraping the Maoyan TOP100 movies
import json
import re
import requests
from requests import RequestException
from multiprocessing import Pool  # process pool for crawling pages in parallel


def get_page(url):
    """Fetch the HTML of a page; return None on any failure."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/62.0.3202.75 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Extract the movie fields from one board page with a regular expression."""
    pattern = re.compile(
        r'<dd>.*?board-index.*?">(\d+?)</i>.*?title="(.*?)".*?<img.*?img.*?src='
        r'"(.*?)".*?>.*?"star">(.*?)</p>.*?"releasetime">(.*?)</p>.*?integer">(.*?)'
        r'</i>.*?fraction">(\d+?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'index': item[0],
            'title': item[1],
            'image': item[2],
            'actors': item[3].strip()[3:],        # drop the "主演:" prefix
            'release_time': item[4].strip()[5:],  # drop the "上映时间:" prefix
            'score': item[5] + item[6]            # integer part + fraction part
        }


def write_file(content):
    """Append one movie record to movie.txt as a JSON line, encoded in UTF-8."""
    with open('movie.txt', 'a', encoding='utf-8') as f:
        # ensure_ascii=False keeps Chinese characters instead of \u escapes
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(page):
    url = 'http://maoyan.com/board/4'
    html = get_page(url=url + '?offset=' + str(page * 10))
    if html:  # skip the page if the request failed
        for item in parse_one_page(html):
            write_file(item)


if __name__ == '__main__':
    # single process
    '''
    for i in range(10):
        main(i)
    '''
    # multiprocessing: crawl the 10 board pages in parallel
    pool = Pool()
    pool.map(main, [i for i in range(10)])
    pool.close()
    pool.join()
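Since write_file stores one JSON object per line, the results are easy to load back for further processing. Below is a minimal sketch of how that could look; the load_movies helper is hypothetical and only assumes the movie.txt format produced by the script above.

import json

def load_movies(path='movie.txt'):
    """Read the JSON-lines file written by write_file back into a list of dicts."""
    movies = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                movies.append(json.loads(line))
    return movies

if __name__ == '__main__':
    movies = load_movies()
    print(len(movies), 'movies loaded')
    for movie in movies[:3]:  # peek at the first few records
        print(movie)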