提取猫眼电影正在热映的电影名称以及评分
目标:提取热映电影的名称和评分
首先获取响应内容,并保存到本地文件 maoyan1.txt
from fake_useragent import UserAgent
import requests
# Target page URL.
url = 'https://www.maoyan.com/films?showType=1'
# Request headers: use a Chrome User-Agent string supplied by fake_useragent.
headers = {'User-Agent': UserAgent().chrome}
# Send the request. requests applies no timeout by default, so set one
# explicitly to keep the script from blocking forever on a stalled server.
resp = requests.get(url, headers=headers, timeout=10)
# Write the response body to maoyan1.txt.
with open('maoyan1.txt', 'w') as f:
    f.write(resp.text)
运行代码
bs4的css选择器提取
from bs4 import BeautifulSoup
def bs_extract():
    """Print film names and scores using BeautifulSoup CSS selectors.

    Reads the saved page from ``maoyan1.txt`` in the current working
    directory, parses it with the ``lxml`` parser, and prints one
    ``name: score`` line per film.
    """
    # Read maoyan1.txt: that is the file the fetch snippet in this file
    # writes; opening 'maoyan.txt' fails when only that snippet has been run.
    with open('maoyan1.txt', 'r') as f:
        resp = f.read()
    # Parse the saved response.
    soup = BeautifulSoup(resp, 'lxml')
    # Names: text of the <a> that is a direct child of each title div.
    names = [a.text for a in soup.select('div[class="channel-detail movie-item-title"]>a')]
    # Scores: text of each score div.
    scores = [div.text for div in soup.select('div[class="channel-detail channel-detail-orange"]')]
    # Pair names and scores by position; zip stops at the shorter list.
    for n, s in zip(names, scores):
        print(f'{n}: {s}')


if __name__ == '__main__':
    bs_extract()
pyquery提取
from pyquery import PyQuery as pq
def pyquery_extract():
    """Print film names and scores using PyQuery selectors.

    Loads the saved page from ``maoyan1.txt``, selects the title divs and
    the score divs by class, and prints one ``name: score`` line per
    positional pair.
    """
    # Load the saved response text.
    with open('maoyan1.txt', 'r') as saved_page:
        html = saved_page.read()

    # Build the PyQuery document once and query it for both columns.
    doc = pq(html)
    titles = [node.text() for node in doc('div.channel-detail.movie-item-title').items()]
    ratings = [node.text() for node in doc('div.channel-detail.channel-detail-orange').items()]

    # Print the paired results; zip stops at the shorter list.
    for title, rating in zip(titles, ratings):
        print(f'{title}: {rating}')


if __name__ == '__main__':
    pyquery_extract()
xpath提取
from lxml import etree
def xpath_extract():
    """Print film names and scores using lxml XPath queries.

    Reads the saved page from ``maoyan1.txt`` in the current working
    directory and prints one ``name: score`` line per film.
    """
    # Read maoyan1.txt: that is the file the fetch snippet in this file
    # writes; opening 'maoyan.txt' fails when only that snippet has been run.
    with open('maoyan1.txt', 'r') as f:
        resp = f.read()
    # Parse the response text into an element tree.
    e = etree.HTML(resp)
    # Names: the <a> elements under the second <div> child of each <dd>.
    names = e.xpath('//dd/div[2]/a')
    # Scores: string(.) yields the concatenated text of each score div,
    # including the text of its child elements.
    scores = [div.xpath('string(.)') for div in e.xpath('//div[@class="channel-detail channel-detail-orange"]')]
    # Pair names and scores by position; zip stops at the shorter list.
    for n, s in zip(names, scores):
        print(f'{n.text}: {s}')


if __name__ == '__main__':
    xpath_extract()
re提取
import re
def re_extract():
    """Print film names and scores using regular expressions only.

    Reads the saved page from ``maoyan1.txt`` in the current working
    directory. Names come from the ``title`` attribute of each title div.
    Scores come from the inner markup of each score div: either the
    literal text ``暂无评分`` or a pair of ``<i>`` tags holding the integer
    and fraction parts, which are joined into e.g. ``9.1``.
    """
    # Read maoyan1.txt: that is the file the fetch snippet in this file
    # writes; opening 'maoyan.txt' fails when only that snippet has been run.
    with open('maoyan1.txt', 'r') as f:
        resp = f.read()
    names = re.findall('<div class="channel-detail movie-item-title" title="(.+?)">', resp)
    scores = re.findall('<div class="channel-detail channel-detail-orange">(.+?)</div>', resp)
    # The dot after the integer part is escaped so it matches only a literal
    # '.', and \d+ accepts multi-digit parts such as the "10" in 10.0.
    score_pattern = re.compile(r'<i class="integer">(\d+)\.</i><i class="fraction">(\d+)</i>')
    for i, raw in enumerate(scores):
        if raw == '暂无评分':
            continue
        match = score_pattern.search(raw)
        # Leave unrecognised markup untouched instead of raising IndexError.
        if match:
            scores[i] = '.'.join(match.groups())
    # Pair names and scores by position; zip stops at the shorter list.
    for n, s in zip(names, scores):
        print(f'{n}: {s}')


if __name__ == '__main__':
    re_extract()
运行结果
浴火之路: 9.1
只此青绿: 9.5
749局: 8.6
哈利·波特与魔法石: 暂无评分
志愿军:存亡之战: 9.7
出走的决心: 9.5
野孩子: 9.2
变形金刚:起源: 9.4
熊猫计划: 9.4
里斯本丸沉没: 9.6
名侦探柯南:百万美元的五棱星: 9.1
危机航线: 9.4
异形:夺命舰: 8.7
爆款好人: 9.0
荒野机器人: 9.5
姥姥的外孙: 9.4
哈利·波特与密室: 暂无评分
绑架游戏: 暂无评分
新大头儿子和小头爸爸6:迷你大冒险: 9.3
一雪前耻: 8.8