提取猫眼电影正在热映电影以及评分

目标:提取热映电影的名称和评分

首先获取响应内容

from fake_useragent import UserAgent
import requests


# Listing page for films that are currently showing
url = 'https://www.maoyan.com/films?showType=1'
# Send a Chrome User-Agent string (from fake_useragent) instead of the requests default
headers = {'User-Agent': UserAgent().chrome}
# Fetch the page; without a timeout requests can wait indefinitely on a stalled server
resp = requests.get(url, headers=headers, timeout=10)
# Stop on a 4xx/5xx response rather than saving an error page for the parsers to read
resp.raise_for_status()
# Save the response body to maoyan1.txt.
# NOTE: the file is written with the platform default encoding, so anything
# reading it back must open it the same way.
with open('maoyan1.txt', 'w') as f:
    f.write(resp.text)

运行代码

bs4的css选择器提取

from bs4 import BeautifulSoup


def bs_extract(path='maoyan1.txt', encoding=None):
    """Print "<name>: <score>" for each film, using BeautifulSoup CSS selectors.

    Args:
        path: File holding the saved page HTML. Defaults to ``maoyan1.txt``,
            the file the download step writes.
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default, which is how the download step writes the file.
    """
    # Read the saved response
    with open(path, 'r', encoding=encoding) as f:
        html = f.read()
    # Parse with the lxml backend
    soup = BeautifulSoup(html, 'lxml')
    # Film names: text of the <a> directly inside each title div
    names = [a.text for a in soup.select('div[class="channel-detail movie-item-title"]>a')]
    # Scores: full text of each score div
    scores = [div.text for div in soup.select('div[class="channel-detail channel-detail-orange"]')]
    # Pair by position; zip stops at the shorter of the two lists
    for name, score in zip(names, scores):
        print(f'{name}: {score}')


if __name__ == '__main__':
    bs_extract()

pyquery提取

from pyquery import PyQuery as pq


def pyquery_extract(path='maoyan1.txt', encoding=None):
    """Print "<name>: <score>" for each film, using PyQuery selectors.

    Args:
        path: File holding the saved page HTML. Defaults to ``maoyan1.txt``.
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default.
    """
    # Read the saved response
    with open(path, 'r', encoding=encoding) as f:
        html = f.read()
    # Build a PyQuery document from the HTML text
    doc = pq(html)
    # .items() yields one PyQuery wrapper per matched element, so no index
    # bookkeeping with .eq(i) is needed
    names = [div.text() for div in doc('div.channel-detail.movie-item-title').items()]
    scores = [div.text() for div in doc('div.channel-detail.channel-detail-orange').items()]
    # Pair by position; zip stops at the shorter of the two lists
    for name, score in zip(names, scores):
        print(f'{name}: {score}')


if __name__ == '__main__':
    pyquery_extract()

xpath提取

from lxml import etree


def xpath_extract(path='maoyan1.txt', encoding=None):
    """Print "<name>: <score>" for each film, using lxml XPath queries.

    Args:
        path: File holding the saved page HTML. Defaults to ``maoyan1.txt``,
            the file the download step writes.
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default, which is how the download step writes the file.
    """
    # Read the saved response
    with open(path, 'r', encoding=encoding) as f:
        html = f.read()
    # Parse into an element tree
    tree = etree.HTML(html)
    # Film names: text of the <a> inside the second <div> of every <dd>
    # NOTE(review): positional match, depends on the page layout - confirm
    names = [a.text for a in tree.xpath('//dd/div[2]/a')]
    # Scores: string(.) concatenates all text nested inside the score div
    scores = [
        div.xpath('string(.)')
        for div in tree.xpath('//div[@class="channel-detail channel-detail-orange"]')
    ]
    # Pair by position; zip stops at the shorter of the two lists
    for name, score in zip(names, scores):
        print(f'{name}: {score}')


if __name__ == '__main__':
    xpath_extract()

re提取

import re


def re_extract(path='maoyan1.txt', encoding=None):
    """Print "<name>: <score>" for each film, using regular expressions only.

    Args:
        path: File holding the saved page HTML. Defaults to ``maoyan1.txt``,
            the file the download step writes.
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default, which is how the download step writes the file.
    """
    with open(path, 'r', encoding=encoding) as f:
        html = f.read()
    # Film names come from the title attribute of each title div
    names = re.findall(r'<div class="channel-detail movie-item-title" title="(.+?)">', html)
    # Inner HTML of each score div: either <i> digit markup or a plain placeholder
    raw_scores = re.findall(r'<div class="channel-detail channel-detail-orange">(.+?)</div>', html)
    # The dot is escaped and both parts accept several digits, so "10.0" parses too
    digits = re.compile(r'<i class="integer">(\d+)\.</i><i class="fraction">(\d+)</i>')
    scores = []
    for raw in raw_scores:
        match = digits.search(raw)
        # Anything without digit markup (e.g. the "no score yet" placeholder)
        # is printed unchanged instead of raising IndexError
        scores.append('.'.join(match.groups()) if match else raw)
    # Pair by position; zip stops at the shorter of the two lists
    for name, score in zip(names, scores):
        print(f'{name}: {score}')


if __name__ == '__main__':
    re_extract()

运行结果

浴火之路: 9.1
只此青绿: 9.5
749局: 8.6
哈利·波特与魔法石: 暂无评分
志愿军:存亡之战: 9.7
出走的决心: 9.5
野孩子: 9.2
变形金刚:起源: 9.4
熊猫计划: 9.4
里斯本丸沉没: 9.6
名侦探柯南:百万美元的五棱星: 9.1
危机航线: 9.4
异形:夺命舰: 8.7
爆款好人: 9.0
荒野机器人: 9.5
姥姥的外孙: 9.4
哈利·波特与密室: 暂无评分
绑架游戏: 暂无评分
新大头儿子和小头爸爸6:迷你大冒险: 9.3
一雪前耻: 8.8

 

posted @ 2024-10-09 15:59  松鼠q  阅读(14)  评论(0)  编辑  收藏  举报