Multiprocessing Crawler

A small crawler for the Maoyan Top 100 board (board/4): it fetches the ten list pages in parallel with multiprocessing.Pool and appends each movie record to a text file as one JSON object per line.

import requests
from multiprocessing import Pool
import re
from requests.exceptions import RequestException
import json

def get_one_page(url):
    # A browser-like User-Agent is an assumption here: Maoyan may reject the
    # default requests UA, so we send a minimal browser string.
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        res = requests.get(url, headers=headers)
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        return None
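
A quick sanity check for the fetcher (offset 0 is the first page of the board):

html = get_one_page('https://maoyan.com/board/4?offset=0')
print(html is not None)   # True when the page downloaded successfully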
def parse_one_page(html):
    # Raw strings avoid invalid-escape warnings for \d; re.S lets .*? match across newlines.
    pat = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
                     r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
                     r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pat, html)
    for item in items:
        yield {
            'rank': item[0],
            'image': item[1],
            'title': item[2],
            'actors': item[3].strip()[3:],        # strip whitespace, then drop the "主演：" prefix
            'release_time': item[4].strip()[5:],  # drop the "上映时间：" prefix
            'score': item[5] + item[6].strip(),   # integer part + fraction part, e.g. "9." + "5"
        }
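
To see what the parser yields, feed it a minimal fragment shaped like the Maoyan board markup (the fragment below is illustrative, not real page data):

sample = '''<dd>
    <i class="board-index">1</i>
    <img data-src="http://example.com/poster.jpg">
    <p class="name"><a href="/films/1203">霸王别姬</a></p>
    <p class="star">主演：张国荣,张丰毅,巩俐</p>
    <p class="releasetime">上映时间：1993-01-01</p>
    <i class="integer">9.</i><i class="fraction">5</i>
</dd>'''
print(next(parse_one_page(sample)))
# {'rank': '1', 'image': 'http://example.com/poster.jpg', 'title': '霸王别姬',
#  'actors': '张国荣,张丰毅,巩俐', 'release_time': '1993-01-01', 'score': '9.5'}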

def write_to(content):
    # Append one record per line (JSON Lines); ensure_ascii=False keeps Chinese text readable.
    # The with block closes the file, so no explicit close() is needed.
    with open('dianying.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

def main(offset):
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:  # skip pages that failed to download
        return
    for item in parse_one_page(html):
        print(item)
        write_to(item)

if __name__ == '__main__':
    # Single-process version, for comparison:
    #     for i in range(10):
    #         main(i * 10)
    pool = Pool()  # defaults to one worker process per CPU core
    pool.map(main, [i * 10 for i in range(10)])  # offsets 0, 10, ..., 90
    pool.close()
    pool.join()
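
Each worker appends its records to dianying.txt, so the results can be loaded back afterwards. Note that because the processes write concurrently, the line order in the file does not follow the ranking:

results = []
with open('dianying.txt', encoding='utf-8') as f:
    for line in f:
        results.append(json.loads(line))
print(len(results))   # 100, if every one of the ten pages yielded ten entries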

 
