# -*- coding:utf-8 -*-
# 字体文件处理网站: https://font.qqe2.com/index-en.html
"""
任务:
    先处理字体文件
        从他的请求当中获取到当前请求附带的字体文件

    爬虫每次运行,获取的数据都是最新的--获取最新的请求--获取最新的字体文件

    固定的请求参数会过期
        1、时间戳
        2、index怎么来的
        3、signkey
            md5加密后的结果

    网页数据每次变动(请求了一下)
        1、每次都会出现一个新的请求,请求当中会生产一个signkey
        2、如果刚刚的signKey: f是生成signkey的代码:
            在这个位置打上断点
            浏览器在下一次请求的时候会执行signKey: f
                当执行到这段代码

    signkey后面的值就是 f
    f又是什么?

"""
import io
import re
import json
import time
import random
import hashlib
# pip install ddddocr
import ddddocr
import requests
# pip install pillow==9.4.0
from PIL import Image, ImageDraw, ImageFont
from urllib.parse import urlencode
# pip install fonttools
from fontTools.ttLib import TTFont
class Movie_Data(object):
    """Scrape the Maoyan box-office dashboard and decode its obfuscated digits.

    The dashboard response carries numbers as ``&#x....;`` entities that are
    only readable through a web font referenced by the response's
    ``fontStyle`` field.  The workflow implemented here is:

    1. request the dashboard with a freshly computed ``signKey``;
    2. download the font the response points at;
    3. render every glyph to an image and OCR it;
    4. substitute the recognised characters for the entities and print the
       movie name / box-office pairs.
    """

    # Value sent as the ``User-Agent`` query parameter and hashed into signKey.
    # NOTE(review): this appears to be the Base64 form of the User-Agent header
    # used below -- confirm before changing either of them.
    _UA_TOKEN = "TW96aWxsYS81LjAgKFdpbmRvd3MgTlQgMTAuMDsgV2luNjQ7IHg2NCkgQXBwbGVXZWJLaXQvNTM3LjM2IChLSFRNTCwgbGlrZSBHZWNrbykgQ2hyb21lLzEyNC4wLjAuMCBTYWZhcmkvNTM3LjM2IEVkZy8xMjQuMC4wLjA="
    # Local path the downloaded font is written to and later rendered from.
    _FONT_FILE = "movie.woff"
    # Extracts the font URL from the CSS text held in ``fontStyle``.
    _FONT_URL_RE = re.compile(r'opentype"\),url\("(.*?)"\);}')
    # Matches one hexadecimal character entity such as ``&#xe137;``.
    _ENTITY_RE = re.compile(r"&#x([0-9a-fA-F]+);")
    # Seconds to wait on each HTTP request before giving up.
    _TIMEOUT = 10

    def __init__(self):
        """Create the OCR engine and compute the signed request parameters."""
        self.ocr = ddddocr.DdddOcr()
        self.url = "https://piaofang.maoyan.com/dashboard-ajax?"
        # Millisecond timestamp and random index both feed the signature.
        self.timestamp = int(time.time() * 1000)
        self.index = int(1000 * random.random() + 1)
        self.content = (
            f"method=GET&timeStamp={self.timestamp}"
            f"&User-Agent={self._UA_TOKEN}"
            f"&index={self.index}"
            "&channelId=40009&sVersion=2"
            "&key=A013F70DB97834C0A5492378BD76C53A"
        )
        # signKey is the MD5 hex digest of the content string above.
        self.signkey = hashlib.md5(self.content.encode()).hexdigest()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0"
        }

    def parse_data_index(self):
        """Request the dashboard and return the response body as text.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        params = {
            "orderType": "0",
            "uuid": "18affa452e4c8-057e2dc1cfbe0c-78505771-384000-18affa452e55",
            "timeStamp": self.timestamp,
            "User-Agent": self._UA_TOKEN,
            "index": self.index,
            "channelId": "40009",
            "sVersion": "2",
            "signKey": self.signkey,
        }

        url = self.url + urlencode(params)
        response = requests.get(url, headers=self.headers, timeout=self._TIMEOUT)
        response.raise_for_status()
        return response.text

    def get_font_index(self, response):
        """Download the font named in *response* and return its glyph names.

        The font is saved to ``movie.woff`` in the working directory because
        ``parse_font_index`` renders from that file.

        Args:
            response: dashboard response text (JSON with a ``fontStyle`` key).

        Returns:
            The font's glyph order with the first two entries dropped.

        Raises:
            ValueError: if no font URL can be found in ``fontStyle``.
            requests.HTTPError: if the font download fails.
        """
        font_css = json.loads(response)["fontStyle"]
        match = self._FONT_URL_RE.search(font_css)
        if match is None:
            raise ValueError("font URL not found in the 'fontStyle' field")
        font_url = "https:" + match.group(1)

        resp = requests.get(font_url, timeout=self._TIMEOUT)
        # Fail here rather than writing an error page to disk as a "font".
        resp.raise_for_status()
        with open(self._FONT_FILE, "wb") as file:
            file.write(resp.content)

        tfont = TTFont(self._FONT_FILE)
        try:
            # presumably the first two glyphs are non-digit placeholders -- verify
            return tfont.getGlyphOrder()[2:]
        finally:
            # Release the file handle held by the font reader.
            tfont.close()

    def parse_font_index(self, font_list):
        """Render each glyph of ``movie.woff`` and OCR it.

        Args:
            font_list: glyph names of the form ``uniXXXX`` (``XXXX`` being the
                hexadecimal code point), as returned by ``get_font_index``.

        Returns:
            A list of OCR results, one per entry of *font_list*, same order.
        """
        font = ImageFont.truetype(self._FONT_FILE, 40)
        charlist = []
        for glyph_name in font_list:
            # Strip the "uni" prefix and turn the hex code point into a char.
            glyph_char = chr(int(glyph_name[3:], 16))
            im = Image.new(mode="RGB", size=(42, 40), color="white")
            draw = ImageDraw.Draw(im=im)
            draw.text(xy=(0, 0), text=glyph_char, fill=0, font=font)
            img_byte = io.BytesIO()
            im.save(img_byte, format="JPEG")
            charlist.append(self.ocr.classification(img_byte.getvalue()))
        return charlist

    def font_replace(self, response, old_font_list, new_font_list):
        """Decode the obfuscated box-office figures and print them.

        The mapping is keyed by glyph code (not by OCR result), so two glyphs
        recognised as the same character are both decoded.  Only the
        ``&#x....;`` entities are rewritten; other text, including semicolons
        in movie names, is left untouched.  Entities with no known glyph are
        kept verbatim.

        Args:
            response: dashboard response text (JSON).
            old_font_list: glyph names (``uniXXXX``) from ``get_font_index``.
            new_font_list: recognised characters from ``parse_font_index``,
                aligned with *old_font_list*.

        Returns:
            A list of ``(movie_name, box_office)`` tuples, in response order.
        """
        code_to_char = {
            glyph_name[3:].lower(): str(char)
            for glyph_name, char in zip(old_font_list, new_font_list)
        }

        def decode(text):
            return self._ENTITY_RE.sub(
                lambda m: code_to_char.get(m.group(1).lower(), m.group(0)),
                text,
            )

        # Parse first, decode afterwards: the JSON can no longer be corrupted
        # by whatever characters the OCR step happened to produce.
        data_list = json.loads(response)["movieList"]["data"]["list"]
        rows = []
        for data in data_list:
            title = data["movieInfo"]["movieName"]
            price = decode(str(data["boxSplitUnit"]["num"]))
            print(f"电影名称---{title}, 电影票房---{price}")
            rows.append((title, price))
        return rows

    def main(self):
        """Run one full fetch / font download / OCR / print cycle."""
        response = self.parse_data_index()
        old_font_list = self.get_font_index(response)
        new_font_list = self.parse_font_index(old_font_list)
        self.font_replace(
            response=response,
            old_font_list=old_font_list,
            new_font_list=new_font_list,
        )

if __name__ == '__main__':
    # Script entry point: build the scraper and run a single cycle.
    Movie_Data().main()




# posted on 2024-06-07 16:44  下雨天的眼睛  阅读(19)  评论(0)  编辑  收藏  举报