爬取mtime网目标城市的热映电影

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

声明:仅学习参考

版本:verison_0

效果图:

 

 

说明:city_name需要手动输入,即当前城市的url需要手动构造

   version_1待实现的功能:(1)通过输入城市名称,即可对应于对应url,而无需手动构造

               (2)多线程(在循环电影数据的时候,可构造一个线程,通过for的方式,生成多线程,注意设置为守护线程,join)

流程:(1)输入城市名字(注意热门城市,是China_Beijing字样,非热门城市是China_Hubei_Province_Huanggang字样)

   (2)获取当前目录下的所有上映的热门电影(如果不需要提取图片等信息,可直接抓取“<script type="text/javascript">var hotplaySvList”下的信息,包括id,url和title

   (3)通过获取到的url和id构造详情页请求,获取主演,类型,描述,热门评论等信息

   (4)票房或评分等信息在ajax请求(Movie.api....)中,通过id可构造相应的请求

注意点:(1)在每次请求前带上referer

    (2)请求的频率不宜过快,且应该随机

    (3)输入city_name后注意首次发起的请求是否成功,如果不成功,说明url没有构造好,应该在网站上找出规律,如.../China_Hubei_Province_Huanggang字样

源码:

from urllib import parse,error
import requests
from lxml import etree
import re
import json
from pprint import pprint
import time,random


class MtimeSpider(object):
    def __init__(self):
        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"
    
    def parse_ajax_content(self,response):
        totalboxoffice = re.findall(r'"TotalBoxOffice":"(.*?)"',string=response,flags=re.S)
        totalboxoffice = totalboxoffice[0] if totalboxoffice else None
        totalboxofficeunit = re.findall(r'"TotalBoxOfficeUnit":"(.*?)"',string=response,flags=re.S) # 获取单位
        totalboxofficeunit = totalboxofficeunit[0] if totalboxofficeunit else None
        if totalboxoffice is not None and totalboxofficeunit is not None:
            totalboxoffice = totalboxoffice+totalboxofficeunit
        ratingfinall = re.findall(r'"RatingFinal":(.*?),',string=response,flags=re.S)
        ratingfinall = ratingfinall[0] if ratingfinall else None
        return totalboxoffice,ratingfinall

    def parse_movie_detail(self,response):
        html = etree.HTML(response)
        # 获取分类和上映日期
        cate = str()
        category = html.xpath("//div[contains(@class,'db_head')]/div[last()]/a/text()")
        for str_ in category[0:-1]:
            cate += "/"+str_
        showdata = category[-1]
        # 获取描述
        desc = html.xpath('//p[contains(@class,"mt6 lh18")]/text()')
        description = desc[0] if desc else None
        # 获取主演
        main_actor= html.xpath("//dl[contains(@class,'main_actor')]/dd/a/img/@alt")
        return cate,showdata,description,main_actor

    
    def parse_hotplay(self,hot_play_list,referer):
        items_list = list()
        for hot_play in hot_play_list:
            item = dict()
            item['title'] = hot_play['Title']
            item['href'] = hot_play['Url']
            # 获取到单个电影的分类,描述,主演,总票房和评分
            ## 获取电影详情页的响应
            ### 获取分类,上映时间,描述,主演
            movie_html_str = self.get_request(url=hot_play['Url'],referer=referer)
            category,showdata,description,main_actor = self.parse_movie_detail(movie_html_str)
            item["category"] = category
            item['showdata'] = showdata
            item['description'] = description
            item['main_actor'] = main_actor
            items_list.append(item)
            # 获取总票房和评分
            ajax_url = "http://service.library.mtime.com/Movie.api?Ajax_CallBack=true&Ajax_CallBackType=Mtime.Library.Services&Ajax_CallBackMethod=GetMovieOverviewRating&Ajax_CallBackArgument0={}".format(hot_play['Id'])
            ajax_content = self.get_request(url=ajax_url,referer=hot_play['Url'])
            totalboxoffice,ratingfinall = self.parse_ajax_content(response=ajax_content)
            item['totalboxoffice'] = totalboxoffice
            item['ratingfinall'] = ratingfinall
            time.sleep(random.random()) 
        return items_list
            
    def parse_city_request(self,response):
        useful_info = re.findall(pattern=r'hotplaySvList\s+=\s+(.*?);',string=response,flags=re.S)
        useful_info = useful_info[0] if useful_info else None
        return json.loads(s=useful_info,encoding='utf-8')

    def get_request(self,url,referer=None):
        headers = {
            "User-Agent":self.user_agent,
            "Referer":referer,
        }
        try:
            response = requests.get(url=url,headers=headers)
            if response.status_code == 200 and response.text is not None:
                return response.content.decode()
        except error.HTTPError as e:
            print(e)
        
    def run(self,city_name):
        # 获取当前城市正在热映的电影
        url = "http://theater.mtime.com/China_{}/".format(city_name)
        city_html_str = self.get_request(url,referer='http://www.mtime.com/')
        hot_play_list = self.parse_city_request(city_html_str)   # 拿到当前城市的所有热映电影,返回列表
        # 进入电影详情页,获取电影标题,分类,描述,主演,总票房
        items_list = self.parse_hotplay(hot_play_list,referer=url)
        pprint(items_list)

    
if __name__=="__main__":
    # city_name = input("请输入城市(拼音形式,首字母大写):")
    city_name = "Huangshi"
    obj = MtimeSpider()
    obj.run(city_name=city_name)

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

posted @ 2020-06-14 13:25  Norni  阅读(230)  评论(0编辑  收藏  举报