requests库爬取猫眼电影“最受期待榜”榜单 --网络爬虫

目标站点:https://maoyan.com/board/6

# coding:utf8
import requests, re, json
from requests.exceptions import RequestException


# from multiprocessing import Pool

# Fetch a page
def get_one_page(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded response text when the server answers with HTTP 200,
        otherwise ``None`` (non-OK status, timeout, or any other
        ``RequestException``).
    """
    # Keep the try body minimal: only the network call can raise here.
    try:
        resp = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if resp.status_code == requests.codes.ok:
        return resp.text
    return None


# Parse a page
def parse_one_page(html):
    """Extract the movie entries from one board page.

    Args:
        html: Page markup as a string. ``None`` or an empty string (for
            example when the download failed) produces no entries instead
            of raising ``TypeError``.

    Yields:
        One dict per ``<dd>`` entry with the keys ``index``, ``img_url``,
        ``title``, ``stars`` and ``releasetime``; all values are strings
        taken verbatim from the markup.
    """
    if not html:
        return
    # re.S lets ``.`` span newlines, since one <dd> entry covers many lines.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)"'
        r'.*?name"><a.*?">(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?</dd>',
        re.S,
    )
    # findall returns a list of 5-tuples, one per matched entry.
    for index, img_url, title, stars, releasetime in pattern.findall(html):
        yield {
            'index': index,
            'img_url': img_url,
            'title': title,
            # Drops the first three characters of the cast text
            # (presumably a leading label such as "主演:" -- confirm against
            # the live markup).
            'stars': stars[3:],
            'releasetime': releasetime,
        }


# Append a scraped record to the output file
def write_file(content, path='content.txt'):
    """Append ``content`` to ``path`` as one JSON document per line.

    Args:
        content: JSON-serialisable object (the scraper passes a dict).
        path: Destination file; defaults to ``content.txt`` in the current
            working directory. The file is opened in append mode, so
            earlier records are kept.
    """
    # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX.
    line = json.dumps(content, ensure_ascii=False)
    # The with-statement closes the file; no explicit close() is needed.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line + '\n')


# Main routine
def main(offset):
    """Fetch one board page, then store and print every entry on it.

    Args:
        offset: Value placed in the ``offset`` query parameter of the
            board URL.
    """
    url = "https://maoyan.com/board/6/?offset=" + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None on any request failure; skip this page
        # rather than handing None to the parser.
        print('failed to fetch ' + url)
        return
    for item in parse_one_page(html):
        write_file(item)
        print(item)


if __name__ == "__main__":
    # Five requests, with offsets 0, 10, 20, 30 and 40.
    for page_offset in range(0, 50, 10):
        main(page_offset)

在这里插入图片描述

posted @ 2019-04-23 18:23  牛新龙的IT技术博客  阅读(417)  评论(0)  编辑  收藏  举报