爬取猫眼电影
起因
有一份工作需要我列出两家电影院每天的电影排期信息,我不想每次都去猫眼上复制粘贴,所以做了个爬虫。
爬虫1.0版本
功能
能够知道每天的电影排期信息
使用限制
只能在当天使用,不能在前一天晚上使用,后面我会再考虑修改。
代码
# *coding:UTF-8 *
import requests
import re
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
# A single UserAgent instance; its `random` attribute supplies the
# User-Agent string used for this run.
ua = UserAgent()

# HTTP request headers passed along with each page request.
headers = {
    'Content-Type': 'text/plain; charset=UTF-8',
    'uuid': 'A864343071E911EB963CF9FD0B38DF1428081ABA4F764AF692B0DE0AF1486195',
    'Origin': 'https://maoyan.com',
    'Referer': 'https://maoyan.com/board/4',
    'User-Agent': str(ua.random),
}
def get_one_page(url, headers, timeout=10):
    """Fetch a page and return its body text.

    Args:
        url: Address of the page to download.
        headers: Mapping of HTTP headers to send with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response text when the server answers with status 200, otherwise
        ``None`` (non-200 status, or any ``requests.RequestException`` such
        as a connection error or timeout).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None
    return response.text
class analyze_one_cinema:
    """Parse one cinema page and print that cinema's showtimes.

    Args:
        html: HTML source of the cinema page.
        name: Cinema identifier; selects the banner printed by ``result``.
        date: One two-digit integer per movie, in the order the movies
            appear on the page. The tens digit is the 1-based position of
            the day to report among that movie's schedule blocks; the ones
            digit is how many schedule blocks the movie has on the page.
            Example: ``12`` means "report the first of two days". A tens
            digit of 0 skips the movie. Per the original author's note, a
            day that is listed without any sessions does not count.
    """

    # Matches a line that begins with an HH:MM clock time.
    _TIME_PATTERN = re.compile(r'^[0-9][0-9]:[0-9][0-9]')

    # Banner title and ticket-price line printed for each known cinema name.
    _BANNERS = {
        'xx公司1': ("金逸电影场次", "票价:25/张"),
        'xx公司2': ("中影电影场次", "票价:22/张"),
    }

    def __init__(self, html, name, date):
        self.html = html
        self.name = name
        self.date = date
        self.bs = BeautifulSoup(html, "html.parser")
        self.all_movie_names = []
        self.all_movies_dates = {}
        # Date label printed under the banner: the last four characters of
        # the fourth line of the second `.show-date` element. Raises
        # IndexError when the page does not have that structure.
        self.today = self.bs.select('.show-date')[1].get_text().split('\n')[3][-4:]

    def get_movie_names(self):
        """Collect movie names and give each one an empty showtime list."""
        self.all_movie_names.extend(
            tag.get_text() for tag in self.bs.select('.movie-name')
        )
        for movie in self.all_movie_names:
            self.all_movies_dates[movie] = []

    def get_movie_dates(self, id):
        """Return the showtimes found in the ``.plist`` block at index ``id``.

        Lines of the block's text that start with HH:MM are taken in
        consecutive pairs and formatted as ``"<first> -> <second>"``.

        Returns:
            A list of formatted strings. The list is empty when the block
            does not exist or when the time lines cannot be paired up
            (previously this returned ``None``, which broke ``result``).
        """
        try:
            lines = self.bs.select('.plist')[id].get_text().split('\n')
        except IndexError:
            return []
        times = [line for line in lines if self._TIME_PATTERN.match(line)]
        if len(times) % 2:
            # An unpaired time means the block was not parsed as expected;
            # report nothing rather than misaligned pairs.
            return []
        # The second line of each pair carries two trailing characters after
        # the time (presumably a suffix label); they are dropped.
        return [
            f"{first} -> {second[:-2]}"
            for first, second in zip(times[::2], times[1::2])
        ]

    def add_movie_dates(self):
        """Fill ``all_movies_dates`` using the per-movie codes in ``date``.

        Raises:
            IndexError: If ``date`` has fewer entries than there are movies.
        """
        offset = 0  # index of the current movie's first schedule block
        for i, movie in enumerate(self.all_movie_names):
            pick, total = divmod(self.date[i], 10)
            if pick:
                self.all_movies_dates[movie] = self.get_movie_dates(offset + pick - 1)
            offset += total

    def result(self):
        """Print the banner and every movie's showtimes for this cinema.

        Nothing is printed when ``name`` is not a known cinema.
        """
        banner = self._BANNERS.get(self.name)
        if banner is None:
            return
        title, price = banner
        print(title)
        print(self.today)
        print(price)
        for movie, slots in self.all_movies_dates.items():
            print(f"《{movie}》")
            # Tolerate a missing schedule so one movie cannot abort the rest.
            for slot in slots or ():
                print(slot)

    def start(self):
        """Run the full pipeline: collect names, attach showtimes, print."""
        self.get_movie_names()
        self.add_movie_dates()
        self.result()
def main():
    """Fetch each configured cinema page and print its showtimes.

    A cinema whose page cannot be downloaded is reported on stderr and
    skipped, so the remaining cinemas are still processed.
    """
    import sys

    # (page URL, cinema name, per-movie date codes) for each cinema.
    cinemas = [
        ("https://maoyan.com/cinema/8300?poi=5056730",
         'xx公司1', [11, 11, 22, 22, 11, 11]),
        ("https://maoyan.com/cinema/16862?poi=150389803",
         'xx公司2', [11, 11, 11, 22, 11, 11, 11]),
    ]
    for url, name, date in cinemas:
        html = get_one_page(url, headers)
        if html is None:
            # get_one_page returns None on failure; passing that to the
            # parser would raise an unrelated-looking TypeError.
            print(f"Failed to fetch {url}; skipping {name}", file=sys.stderr)
            continue
        analyze_one_cinema(html, name, date).start()


if __name__ == '__main__':
    main()
部分效果图:
还不赖
搞CTF