汽车之家字体反爬

#!/usr/bin/env python  
# encoding: utf-8  
from requests_html import HTMLSession
import re
import os
from fontTools.ttLib import TTFont

class QiCheZhiJia():
    """Fetch one Autohome forum thread and undo its custom-font obfuscation.

    ``run`` downloads the thread page, extracts the address of the font the
    page embeds, caches that font under ``./fonts`` and then replaces every
    character named by the font's ``uniXXXX`` glyphs with the entry of
    ``self.hanzi`` found at the same glyph position.
    """

    # Matches a ``url('...')`` entry that directly follows ``'),`` in the page
    # source and captures the address inside it.  Raw string: the pattern
    # contains ``\(`` / ``\)``, which are invalid escapes in a normal literal.
    _FONT_URL_RE = re.compile(r"'\),url\('(.*?)'\)")

    # ``uni`` followed by four hex digits, the glyph-name form this scraper decodes.
    _UNI_NAME_RE = re.compile(r"uni([0-9A-Fa-f]{4})")

    def __init__(self):
        # Thread page to scrape.
        self.url="https://club.autohome.com.cn/bbs/thread/bb8c36ced93ce182/74203500-1.html"
        # Replacement text per glyph position (glyph order minus its first entry).
        # NOTE(review): every entry is an empty string here, so the substitution
        # in ``run`` deletes the obfuscated characters instead of restoring
        # them -- presumably the real characters were lost; confirm.
        self.hanzi=['','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','']
        self.session=HTMLSession()
        # Maps ``&#x....`` entity prefixes (lower-cased glyph names) to replacements.
        self.f_dict={}
        # Full glyph order of the most recently loaded font.
        self.uniWordList=[]
        # Text denoted by each glyph name, first glyph skipped.
        self.utf8WordList=[]

    @staticmethod
    def _glyph_name_to_text(glyph_name):
        """Turn every ``uniXXXX`` run inside *glyph_name* into its character.

        Names that do not contain ``uni`` plus four hex digits are returned
        unchanged instead of raising, so one odd glyph cannot abort the run.
        """
        return QiCheZhiJia._UNI_NAME_RE.sub(
            lambda match: chr(int(match.group(1), 16)), glyph_name
        )

    def create_font(self, font_url):
        """Load the font at *font_url* and build the substitution tables.

        The font is cached as ``./fonts/<last path segment of font_url>`` and
        only downloaded when that file does not exist yet.  Fills
        ``self.f_dict``, ``self.uniWordList`` and ``self.utf8WordList`` and
        prints the latter.

        Args:
            font_url: Absolute URL of the font file.

        Raises:
            requests.HTTPError: If the font download answers with an error
                status (nothing is written to the cache in that case).
        """
        font_file = font_url.split('/')[-1]
        font_path = './fonts/' + font_file

        os.makedirs("./fonts", exist_ok=True)

        if not os.path.exists(font_path):
            # Not cached yet: download it.
            print('不在字体库中, 下载:', font_file)
            response = self.session.get(font_url)
            # Refuse to cache an error page as if it were a font; it would be
            # picked up from the cache on every later run.
            response.raise_for_status()
            with open(font_path, 'wb') as f:
                f.write(response.content)

        font = TTFont(font_path)
        glyph_order = font.getGlyphOrder()
        # The first glyph is skipped (normally ``.notdef`` -- TODO confirm).
        gly_list = glyph_order[1:]

        # zip() stops at the shorter sequence, so a font with more glyphs than
        # ``self.hanzi`` has entries no longer raises IndexError.
        for gly, replacement in zip(gly_list, self.hanzi):
            # setdefault keeps an existing mapping if the key was seen before.
            self.f_dict.setdefault(gly.lower().replace('uni', '&#x'), replacement)

        # Same list the original reached through font['cmap'].tables[0].ttFont.
        self.uniWordList = glyph_order
        self.utf8WordList = [self._glyph_name_to_text(name) for name in gly_list]
        print(self.utf8WordList)

    def run(self):
        """Download the thread, decode its obfuscated text and print it.

        Prints the raw text nodes first and the decoded, concatenated text
        afterwards.  Returns nothing.
        """
        req = self.session.get(self.url)
        font_url = self.parse(req.text)
        self.create_font(font_url)
        info = req.html.xpath("//div[@class='tz-paragraph' and string-length(text())>1]//text()")
        print(info)
        elem = "".join(info)
        # Replace the font-specific characters with their standard counterparts;
        # glyphs beyond the end of ``self.hanzi`` are left untouched.
        for obfuscated, replacement in zip(self.utf8WordList, self.hanzi):
            elem = elem.replace(obfuscated, replacement)
        print(elem)

    def parse(self, source):
        """Return the absolute URL of the font embedded in *source*.

        Args:
            source: HTML/CSS text of the thread page.

        Returns:
            ``"http:"`` followed by the first address captured by
            ``_FONT_URL_RE``.

        Raises:
            IndexError: If *source* contains no matching font declaration
                (same exception type the bare ``findall(...)[0]`` produced).
        """
        match = self._FONT_URL_RE.search(source)
        if match is None:
            raise IndexError("no embedded font URL found in page source")
        return "http:" + match.group(1)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    scraper = QiCheZhiJia()
    scraper.run()

 

posted @ 2023-10-26 22:22  冰底熊  阅读(52)  评论(0)  编辑  收藏  举报