爬取猫眼电影

起因

有一份工作需要我列出两个电影院的每天电影排期信息,我不想每次都要去猫眼上复制粘贴。所以做了个爬虫

爬虫1.0版本

功能
能够知道每天的电影排期信息

使用限制
只能在当天使用,不能在前一天晚上提前使用,后面我会再考虑修改。

代码

# *coding:UTF-8 *
import requests
import re
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

# User-Agent generator from fake_useragent; used below to pick a random
# browser User-Agent string.
ua = UserAgent()
# HTTP request headers that main() passes to get_one_page() for every fetch.
headers = {
    'Content-Type': 'text/plain; charset=UTF-8',
    # Hard-coded identifier; presumably copied from a browser session on
    # maoyan.com -- TODO confirm the site still accepts it.
    'uuid' : 'A864343071E911EB963CF9FD0B38DF1428081ABA4F764AF692B0DE0AF1486195',
    'Origin': 'https://maoyan.com',
    'Referer': 'https://maoyan.com/board/4',
    # ua.random is evaluated once, when this dict is built, so every request
    # made during a single run sends the same User-Agent value.
    'User-Agent': f'{ua.random}'
}

"""
爬虫,爬取猫眼页面
"""


def get_one_page(url, headers, timeout=10):
    """Download one page and return its HTML text.

    Args:
        url: Address of the page to fetch.
        headers: HTTP request headers to send with the GET request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled
            connection.

    Returns:
        The response body as text when the status code is 200, otherwise
        ``None``. ``None`` is also returned when the request raises a
        ``requests.RequestException`` (which includes timeouts).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None
    return response.text


"""
输入:
html是网页源代码
name是电影院名字
date是猫眼上面每部片子的日期,
        格式,[匹配的日期][总日期]
        例子:     12
                  1表示第一天,2表示总共有两天(今天、明天或者明天、后天)
        注意! 只有日期没有场次的日期不算
"""
class analyze_one_cinema(object):
    """Parse one Maoyan cinema page and print its showtime schedule.

    Args:
        html: HTML source of the cinema page.
        name: Cinema name; ``result`` uses it to choose the banner and the
            ticket price line that are printed.
        date: One integer per movie, in the order the movies appear on the
            page, encoded as ``[wanted day][total days]``. For example ``12``
            means "use the 1st day's showtimes; this movie has 2 day
            listings" (today/tomorrow, or tomorrow/the day after). A tens
            digit of 0 skips the movie. Days that show a date but have no
            showtimes must not be counted.
    """

    # Matches lines that begin with an HH:MM time.
    _TIME_PATTERN = re.compile(r'^[0-9][0-9]:[0-9][0-9]')

    # Cinema name -> (banner line, ticket price line) printed by result().
    _BANNERS = {
        'xx公司1': ("金逸电影场次", "票价:25/张"),
        'xx公司2': ("中影电影场次", "票价:22/张"),
    }

    def __init__(self, html, name, date):
        self.html = html
        self.name = name
        self.date = date
        self.bs = BeautifulSoup(html, "html.parser")
        self.all_movie_names = []
        self.all_movies_dates = {}
        # Date label printed in the result: the last four characters of the
        # fourth line of text inside the second '.show-date' element.
        self.today = self.bs.select('.show-date')[1].get_text().split('\n')[3][-4:]

    def get_movie_names(self):
        """Collect the movie titles and create an empty showtime list for each.

        Titles are read from every '.movie-name' element. Both containers are
        rebuilt from scratch, so calling this method again does not duplicate
        entries.
        """
        self.all_movie_names = [
            node.get_text() for node in self.bs.select('.movie-name')
        ]
        self.all_movies_dates = {title: [] for title in self.all_movie_names}

    def get_movie_dates(self, id):
        """Return the showtimes found in the ``id``-th '.plist' element.

        Args:
            id: Zero-based index into the page's '.plist' elements.

        Returns:
            A list of ``"start -> end"`` strings. The list is empty when the
            page has no '.plist' element at that index, so callers can always
            iterate the result.
        """
        try:
            lines = self.bs.select('.plist')[id].get_text().split('\n')
        except IndexError:
            return []
        times = [line for line in lines if self._TIME_PATTERN.match(line)]
        # Matching lines alternate start, end, start, end... The last two
        # characters of each end line are dropped (presumably a two-character
        # suffix after the time -- confirm against the live page). An
        # unpaired trailing start time is ignored instead of discarding the
        # whole list.
        return [
            f"{start} -> {end[:-2]}"
            for start, end in zip(times[::2], times[1::2])
        ]

    def add_movie_dates(self):
        """Fill each movie's showtime list according to ``self.date``.

        Walks the movies in page order while tracking how many '.plist'
        elements earlier movies occupied, so the wanted day of each movie
        maps to the correct '.plist' index.
        """
        # Index of the last '.plist' element consumed by earlier movies.
        offset = -1
        for position, title in enumerate(self.all_movie_names):
            wanted_day, total_days = divmod(self.date[position], 10)
            if wanted_day != 0:
                self.all_movies_dates[title] = self.get_movie_dates(offset + wanted_day)
            offset += total_days

    def result(self):
        """Print the final schedule text for this cinema.

        Nothing is printed when ``self.name`` is not one of the known cinema
        names.
        """
        banner = self._BANNERS.get(self.name)
        if banner is None:
            return
        title_line, price_line = banner
        print(title_line)
        print(self.today)
        print(price_line)
        for movie, showtimes in self.all_movies_dates.items():
            print(f"《{movie}》")
            for showtime in showtimes:
                print(showtime)

    def start(self):
        """Run the whole pipeline: collect titles, attach showtimes, print."""
        self.get_movie_names()
        self.add_movie_dates()
        self.result()


def main():
    """Fetch each configured cinema page and print its schedule.

    A cinema whose page cannot be downloaded is reported on stderr and
    skipped, so one failed request does not stop the remaining cinemas.
    """
    import sys

    # (page URL, cinema name, per-movie day codes) for every cinema to report.
    cinemas = [
        (
            "https://maoyan.com/cinema/8300?poi=5056730",
            'xx公司1',
            [11, 11, 22, 22, 11, 11],
        ),
        (
            "https://maoyan.com/cinema/16862?poi=150389803",
            'xx公司2',
            [11, 11, 11, 22, 11, 11, 11],
        ),
    ]
    for url, name, date in cinemas:
        html = get_one_page(url, headers)
        if html is None:
            # get_one_page returns None on a non-200 status or request error.
            print(f"Could not fetch {url}; skipping {name}", file=sys.stderr)
            continue
        analyze_one_cinema(html, name, date).start()

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

部分效果图:

还不赖

posted @ 2021-02-20 01:11  Throokie  阅读(89)  评论(0)  编辑  收藏  举报