利用requests和正则表达式爬取豆瓣Top250电影

import time
import os
import requests
from requests.exceptions import RequestException
import re
import json
from multiprocessing import Pool

# Fetch the HTML of a page
def get_one_page(url, timeout=10):
    """Download ``url`` and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller forever.

    Returns:
        The decoded response text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or any ``RequestException``,
        which includes timeouts).
    """
    headers = {
        # Adjacent literals are joined without extra characters. A backslash
        # continuation inside a single literal would embed the next line's
        # indentation into the header value.
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/75.0.3770.100 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code != 200:
        return None
    # Only sniff the encoding for responses that are actually used.
    response.encoding = response.apparent_encoding
    return response.text

# Parse the HTML and extract the rank, image and other fields
def parse_one_page(html):
    """Yield one dict per movie entry matched in a list page.

    Args:
        html: Page markup as a string. ``None`` or an empty string (for
            example when the download failed) yields nothing instead of
            raising ``TypeError`` inside the regex engine.

    Yields:
        dict with the string-valued keys ``index``, ``image``, ``title``,
        ``director``, ``score`` and ``score_number``.
    """
    if not html:
        return
    pattern = re.compile(
        r'<li>.*?class="pic".*?class="">(\d+)</em>.*?src="(.*?)".*?class="hd".*?'
        r'title">(.*?)</span>.*?class="bd".*?">(.*?)<br>.*?rating_num.*?average">(.*?)</span>'
        r'.*?content.*?<span>(.*?)</span>.*?</li>',
        re.S,  # let '.' span newlines, since each entry covers several lines
    )
    for item in pattern.findall(html):
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            # Take the text before the first three-space separator and drop
            # its 4-character label prefix.
            'director': item[3].strip().split('   ')[0][4:],
            'score': item[4],
            'score_number': item[5].strip()
        }

# Store one extracted record in the result file
def write_to_file(content):
    """Append ``content`` to ``result.txt`` as a single JSON line.

    Args:
        content: JSON-serializable object. It is written with
            ``ensure_ascii=False`` so non-ASCII text stays readable in the
            UTF-8 encoded file.
    """
    # The context manager closes the file on exit; no explicit close() needed.
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

# Entry point for one list page
def main(offset):
    """Fetch, parse and store the Top 250 list page starting at ``offset``.

    Each parsed record is printed and appended to the result file.

    Args:
        offset: Value of the ``start`` query parameter for the page.
    """
    url = 'https://movie.douban.com/top250?start=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None on a failed download; skip this page
        # rather than crash the worker inside the parser.
        print('failed to fetch ' + url)
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    # Start from a clean result file, because records are appended.
    try:
        os.remove('result.txt')
    except FileNotFoundError:
        pass
    # One task per page: offsets 0, 25, ..., 225. map() blocks until every
    # page is done, and leaving the with-block then shuts the workers down.
    with Pool() as pool:
        pool.map(main, [i * 25 for i in range(10)])

  1.在利用进程池爬取网页时,会出现文件的编码格式错误,目前还未找到原因,使用单进程不会出现这种情况。

posted @ 2020-03-26 12:05  飞蝎儿  阅读(265)  评论(0)  编辑  收藏  举报