爬虫(三)-猫眼电影,动态字体库
猫眼电影用了动态字体库
猫眼电影榜单国内票房榜,地址:https://maoyan.com/board/1
分析字体文件
刷新几次发现字体有变化
找到关系自动替换
原理
字体库用表结构存储文字,例如cmap表记录unicode索引和字形对应关系。glyf是字形表,记录文字笔画等数据,
有loca表记录glyf表里的字形位置,使用字体库的文字通过loca表找。
可以匹配glyf的字形来找unicode和文字的关系
参考: https://www.cnblogs.com/shenyiyangle/p/10700156.html
思路图:
1.下载一个字体做基准,提取基准字体unicode和文字映射。
2.刷新后的新字体记为网站字体2,用网站字体1和网站字体2的字形找到两种unicode的关系。
3.相同的unicode找到字体库和unicode的关系,最后unicode替换成文字。
代码
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"}
r=requests.get("https://maoyan.com/board/1",headers=headers)
font1_url="http:"+re.findall("url\(\'(\/\/.*?woff)\'\)",r.text,re.M)[0]
#创建font目录保存基准字体
if not os.path.exists("font"):
font1=requests.get(font1_url,headers=headers)
os.mkdir("font")
with open("./font/base.woff","wb")as f:
f.write(font1.content)
基准字体保存到font目录
base_font = TTFont('./font/base.woff')
base_dict=[]
for i in range(len(baseFont.getGlyphOrder()[2:])):
print(f"对应的数字{i+1}:")
w=input()
base_dict.append({"code":baseFont.getGlyphOrder()[2:][i],"num":w})
用FontCreator看字体信息
基准字体关系只需要一次
new_font_url="http:"+re.findall("url\(\'(\/\/.*?woff)\'\)",r.text,re.M)[0]
font=requests.get(new_font_url,headers=headers)
with open("new_font.woff","wb")as f:
f.write(font.content)
new_font = TTFont('new_font.woff')
new_font_code_list=new_font.getGlyphOrder()[2:]
刷新下载新字体,获取unicode
replace_dic=[]
for i in range(10):
news = new_font['glyf'][new_font_code_list[i]]
for j in range(10):
bases = base_font['glyf'][base_dict[j]["code"]]
if news == bases:
unicode=new_font_code_list[i].lower().replace("uni","&#x")+";"
num= base_dict[j]["num"]
replace_dic.append({"code":unicode,"num":num})
新unicode和文字的关系
org_data=r.text
for i in range(len(replace_dic)):
new_data=new_data.replace(replace_dic[i]["code"],replace_dic[i]["num"])
全局替换
tree=etree.HTML(org_data)
dds=tree.xpath('//dl[@class="board-wrapper"]/dd')
info=[]
for dd in dds:
title=dd.xpath('.//p[@class="name"]/a/@title')[0]
star=dd.xpath('.//p[@class="star"]/text()')[0].replace("主演:","")
time=dd.xpath('.//p[@class="releasetime"]/text()')[0].replace("上映时间:","")
realticket=dd.xpath('.//p[@class="realtime"]//text()')[1]+dd.xpath('.//p[@class="realtime"]//text()')[2].strip()
totalticket=dd.xpath('.//p[@class="total-boxoffice"]//text()')[1]+dd.xpath('.//p[@class="total-boxoffice"]//text()')[2].strip()
info.append({"标题":title,"主演":star,"上映时间":time,"实时票房":realticket,"总票房":totalticket})
import csv
csv_file = open("1325.csv", 'w', newline='')
keys = []
writer = csv.writer(csv_file)
keys = info[1].keys()
writer.writerow(keys)
for dic in info:
for key in keys:
if key not in dic:
dic[key ] = ''
writer.writerow(dic.values())
csv_file.close()