[BOOK] [Example] [requests + regular expressions] Scraping the Maoyan TOP100 movie ranking

Scraping the Maoyan TOP100 movie board page:

https://maoyan.com/board/4
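
The board lists the 100 films across ten pages selected by an offset query parameter; main() in the script below walks through offset = 0, 10, ..., 90. As a minimal sketch of that pagination scheme (derived from the loop in the script, not from any extra API), the ten page URLs can be built like this:

## Minimal sketch: the ten paginated board URLs requested by the script below
base_url = 'https://maoyan.com/board/4?offset='
page_urls = [base_url + str(i * 10) for i in range(10)]
print(page_urls[0])    # https://maoyan.com/board/4?offset=0
print(page_urls[-1])   # https://maoyan.com/board/4?offset=90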

 

## Scrape the Maoyan TOP100 movie board
import requests
import re
import json
import time

## Fetch a single page
def get_one_page(url):
    try:
        ## A browser-like User-Agent header makes the request less likely to be rejected
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        print('Request failed!')
        return None

## Parse one page with a regular expression
def parse_one_page(response):
    ## One capture group per field: index, poster URL, title, stars, release time, score integer part, score fraction part
    pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name.*?a.*?>(.*?)</a>.*?star.*?>(.*?)</p>.*?releasetime.*?>(.*?)</p>.*?integer.*?>(.*?)</i>.*?fraction.*?>(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, response)
    ## Iterate over the matches and yield one dict per film
    for item in items:
        # yield turns this function into a generator
        yield {
            '排名': item[0],                                             # rank
            '海报': item[1],                                             # poster URL
            '片名': item[2].strip(),                                     # title
            '主演': item[3].strip()[3:] if len(item[3]) > 3 else '',     # stars, "主演:" prefix dropped
            '上映时间': item[4].strip()[5:] if len(item[4]) > 5 else '',  # release time, "上映时间:" prefix dropped
            '评分': item[5].strip() + item[6].strip(),                   # score = integer part + fraction part
        }
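
## For reference, the pattern above assumes each film entry in the page source looks
## roughly like the fragment sketched below (a simplified reconstruction inferred from
## the capture groups, not a verbatim copy of Maoyan's markup):
##
##   <dd>
##     <i class="board-index ...">1</i>
##     <img data-src="https://.../poster.jpg">
##     <p class="name"><a href="...">片名</a></p>
##     <p class="star">主演:演员A,演员B</p>
##     <p class="releasetime">上映时间:1993-01-01</p>
##     <p class="score"><i class="integer">9.</i><i class="fraction">5</i></p>
##   </dd>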

## Write one film record to a file
def write_to_file(content):
    ## Each record is appended as one JSON object per line;
    ## ensure_ascii=False keeps the Chinese text readable in the file
    with open('film.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

def main(offset):
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    response = get_one_page(url)
    if response is None:  # skip this page if the request failed
        return
    for item in parse_one_page(response):
        print(item)
        write_to_file(item)

if __name__ == '__main__':
    for i in range(10):
        main(offset=i*10)
        time.sleep(1)  ## requesting too fast may trigger anti-scraping measures, so add a short delay
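
After a full run, film.txt holds one JSON object per line, one per film. As a minimal follow-up sketch (assuming the file was produced by the script above), the results can be loaded back for further use like this:

## Minimal sketch: read film.txt back into a list of dicts
import json

films = []
with open('film.txt', encoding='utf-8') as f:
    for line in f:
        films.append(json.loads(line))

print(len(films))  # expected to be 100 after a full run
print(films[0])    # the top-ranked film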

  
