# -*- coding:utf-8 -*-
# Online font-file inspection tool: https://font.qqe2.com/index-en.html
"""
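Task:
Handle the font file first.
Extract the font file that accompanies the current request from the site's response.
Every crawler run must fetch the latest data: issue a fresh request, then download the font file that comes with it.
Fixed request parameters expire, so the following must be generated for each request:
1. timeStamp
2. index (where does it come from?)
3. signKey
   the result of MD5 hashing
The page data changes on every refresh (each refresh sends a request):
1. Every refresh issues a new request, and a signKey is produced for that request.
2. If `signKey: f` is the code that generates the signKey:
   set a breakpoint at that location.
   On its next request the browser will execute `signKey: f`.
   When execution reaches that code,
   the value after signKey is f.
   So what is f?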
"""
import io
import re
import json
import time
import random
import hashlib
# pip install ddddocr
import ddddocr
import requests
# pip install pillow==9.4.0
from PIL import Image, ImageDraw, ImageFont
from urllib.parse import urlencode
# pip install fonttools
from fontTools.ttLib import TTFont
class Movie_Data(object):
    """Fetch the Maoyan box-office dashboard and decode its obfuscated digits.

    The dashboard response carries box-office numbers as ``&#xXXXX;``
    entities that only render correctly with a web font referenced in the
    same response.  This class downloads that font, renders every glyph to an
    image, recognises it with OCR, and substitutes the recognised characters
    back into the numbers.
    """

    # Base64 text sent as the "User-Agent" query parameter and embedded in the
    # signed string; it must be identical in both places.
    _UA_B64 = "TW96aWxsYS81LjAgKFdpbmRvd3MgTlQgMTAuMDsgV2luNjQ7IHg2NCkgQXBwbGVXZWJLaXQvNTM3LjM2IChLSFRNTCwgbGlrZSBHZWNrbykgQ2hyb21lLzEyNC4wLjAuMCBTYWZhcmkvNTM3LjM2IEVkZy8xMjQuMC4wLjA="
    # Fixed ``key=`` value appended to the string that is MD5-hashed.
    _SIGN_KEY = "A013F70DB97834C0A5492378BD76C53A"
    # Local path the downloaded web font is written to and rendered from.
    _FONT_FILE = "movie.woff"
    # Seconds to wait on each HTTP request before giving up.
    _REQUEST_TIMEOUT = 10
    # Captures the protocol-relative font URL inside the ``fontStyle`` CSS.
    _FONT_URL_RE = re.compile(r'opentype"\),url\("(.*?)"\);}')
    # Matches one hexadecimal character entity such as ``&#xe137;``.
    _ENTITY_RE = re.compile(r"&#x([0-9a-fA-F]+);")

    def __init__(self):
        """Create the OCR engine and prepare the first set of signed parameters."""
        self.ocr = ddddocr.DdddOcr()
        self.url = "https://piaofang.maoyan.com/dashboard-ajax?"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0"
        }
        self._refresh_signature()

    def _refresh_signature(self):
        """Regenerate ``timestamp``, ``index``, ``content`` and ``signkey``."""
        self.timestamp = int(time.time() * 1000)
        self.index = int(1000 * random.random() + 1)
        self.content = (
            f"method=GET&timeStamp={self.timestamp}"
            f"&User-Agent={self._UA_B64}"
            f"&index={self.index}&channelId=40009&sVersion=2"
            f"&key={self._SIGN_KEY}"
        )
        self.signkey = hashlib.md5(self.content.encode()).hexdigest()

    def parse_data_index(self):
        """Request the dashboard endpoint and return the response body as text.

        The signed parameters are regenerated immediately before the request
        so that an instance created earlier does not send a stale timestamp.

        Raises:
            requests.RequestException: on network failure, timeout, or a
                non-2xx status code.
        """
        self._refresh_signature()
        params = {
            "orderType": "0",
            "uuid": "18affa452e4c8-057e2dc1cfbe0c-78505771-384000-18affa452e55",
            "timeStamp": self.timestamp,
            "User-Agent": self._UA_B64,
            "index": self.index,
            "channelId": "40009",
            "sVersion": "2",
            "signKey": self.signkey,
        }
        url = self.url + urlencode(params)
        response = requests.get(
            url, headers=self.headers, timeout=self._REQUEST_TIMEOUT
        )
        response.raise_for_status()
        return response.text

    def get_font_index(self, response):
        """Download the font referenced by *response* and list its glyph names.

        The font is saved to ``movie.woff`` in the working directory.

        Args:
            response: JSON text returned by :meth:`parse_data_index`.

        Returns:
            The font's glyph order with the first two entries dropped.

        Raises:
            ValueError: if no font URL can be found in ``fontStyle``.
            requests.RequestException: if the font download fails.
        """
        font_style = json.loads(response)["fontStyle"]
        match = self._FONT_URL_RE.search(font_style)
        if match is None:
            raise ValueError("font URL not found in the response's fontStyle")
        # The URL in the CSS is protocol-relative ("//host/path").
        font_url = "https:" + match.group(1)
        resp = requests.get(font_url, timeout=self._REQUEST_TIMEOUT)
        resp.raise_for_status()
        with open(self._FONT_FILE, "wb") as file:
            file.write(resp.content)
        tfont = TTFont(self._FONT_FILE)
        try:
            return tfont.getGlyphOrder()[2:]
        finally:
            # Release the file handle so the font can be overwritten next run.
            tfont.close()

    def parse_font_index(self, font_list):
        """Render each glyph with the downloaded font and OCR it.

        Args:
            font_list: glyph names whose text after the first three
                characters is a hexadecimal code point (e.g. ``uniE137``).

        Returns:
            A list with the OCR result for each glyph, in the same order as
            *font_list*.
        """
        font = ImageFont.truetype(self._FONT_FILE, 40)
        charlist = []
        for glyph_name in font_list:
            # Drop the 3-character prefix and turn the hex digits into a char.
            glyph_char = chr(int(glyph_name[3:], 16))
            im = Image.new(mode="RGB", size=(42, 40), color="white")
            draw = ImageDraw.Draw(im=im)
            draw.text(xy=(0, 0), text=glyph_char, fill=0, font=font)
            img_byte = io.BytesIO()
            im.save(img_byte, format="JPEG")
            charlist.append(self.ocr.classification(img_byte.getvalue()))
        return charlist

    def font_replace(self, response, old_font_list, new_font_list):
        """Decode the obfuscated box-office numbers and print one line per movie.

        The mapping is keyed by glyph code point, so two glyphs that OCR to
        the same character are both decoded.  Only the ``num`` field is
        rewritten; the rest of the response (including movie names) is left
        untouched.  Entities with no known glyph are kept as they are.

        Args:
            response: JSON text returned by :meth:`parse_data_index`.
            old_font_list: glyph names such as ``uniE137``.
            new_font_list: recognised characters, parallel to *old_font_list*.

        Returns:
            A list of ``(movie_name, decoded_number)`` tuples in response order.
        """
        code_to_char = {}
        for glyph_name, recognised in zip(old_font_list, new_font_list):
            name = glyph_name.lower()
            if name.startswith("uni"):
                code_to_char[name[3:]] = str(recognised)

        def decode(match):
            return code_to_char.get(match.group(1).lower(), match.group(0))

        rows = []
        for data in json.loads(response)["movieList"]["data"]["list"]:
            title = data["movieInfo"]["movieName"]
            price = data["boxSplitUnit"]["num"]
            if isinstance(price, str):
                price = self._ENTITY_RE.sub(decode, price)
            print(f"电影名称---{title}, 电影票房---{price}")
            rows.append((title, price))
        return rows

    def main(self):
        """Run the full pipeline: fetch, download font, OCR glyphs, print results.

        Returns:
            The list produced by :meth:`font_replace`.
        """
        response = self.parse_data_index()
        old_font_list = self.get_font_index(response)
        new_font_list = self.parse_font_index(old_font_list)
        return self.font_replace(
            response=response,
            old_font_list=old_font_list,
            new_font_list=new_font_list,
        )
# Script entry point: build the scraper and run the whole pipeline once.
if __name__ == "__main__":
    Movie_Data().main()