requests + re + multiprocessing 爬取猫眼电影 Top100

import re
import json
import requests
from multiprocessing import Pool
from requests.exceptions import RequestException


def get_one_page(url, timeout=10):
    """
    Download a single page and return its body as text.

    :param url: address of the page to fetch
    :param timeout: seconds to wait for the server before giving up; without
        a timeout a stalled connection would block the caller indefinitely
    :return: the response text when the status code is 200, otherwise None
        (also None when the request raises a RequestException, which
        includes timeouts)
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_one_page(html):
    """
    Extract the movie entries from the markup of one board page.

    :param html: page markup as a string; None or an empty string (for
        example after a failed download) produces no entries
    :return: generator of dicts with the keys "index", "image", "title",
        "star", "time" and "score"; every value is a string
    """
    if not html:
        # Nothing to parse; finishing here makes the generator empty instead
        # of letting the regex raise TypeError on None.
        return
    # Raw strings keep "\d" a regex escape rather than an (invalid) string
    # escape.  re.S lets "." cross the line breaks inside each <dd> element.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?poster-default.*?src="(.*?)"'
        r'.*?name"><a.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)'
        r'</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>',
        re.S,
    )
    for index, image, title, star, release, integer, fraction in pattern.findall(html):
        yield {
            "index": index,
            "image": image,
            "title": title,
            # Drops the first 3 characters of the trimmed text -- presumably
            # the "主演:" label in front of the cast list (verify on the page).
            "star": star.strip()[3:],
            # Drops the first 5 characters -- presumably the "上映时间:" label.
            "time": release.strip()[5:],
            # The score is split on the page into integer and fraction parts.
            "score": integer + fraction,
        }


def save_to_file(content):
    """
    Append one record to "maoyan.txt" as a single JSON line.

    :param content: JSON-serialisable object to store
    :return: None
    """
    with open("maoyan.txt", mode="a", encoding="utf-8") as out:
        # ensure_ascii=False writes non-ASCII characters as-is rather than
        # as \uXXXX escapes.
        serialized = json.dumps(content, ensure_ascii=False)
        out.write("{}\n".format(serialized))


def main(offset):
    """
    Download one board page and append each of its entries to the output file.

    :param offset: value placed in the "offset" query parameter of the URL
    :return: None
    """
    url = "https://maoyan.com/board/4?offset={}".format(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None when the download fails; skip this page
        # rather than handing None to the parser.
        return
    for item in parse_one_page(html):
        save_to_file(item)


if __name__ == "__main__":
    # Fetch the ten pages (offsets 0, 10, ..., 90) in worker processes so the
    # time spent waiting on the network overlaps.  The sequential equivalent
    # would be calling main(i * 10) for each i in range(10).
    #
    # Pool.map blocks until every task has finished, but it does not shut the
    # pool down; the with-block releases the worker processes on exit.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])
View Code

 

posted @ 2019-04-05 14:19  就俗人一个  阅读(142)  评论(0)  编辑  收藏  举报