# 汽车之家字体反爬 (Autohome: working around font-based anti-scraping)
#!/usr/bin/env python
# encoding: utf-8
"""Fetch an Autohome forum thread and undo its custom-font text obfuscation.

The page references a downloadable font file; characters drawn with that
font are replaced by entries from a fixed table of Chinese characters,
paired with the font's glyphs by position.
"""
import os
import re

from fontTools.ttLib import TTFont
from requests_html import HTMLSession


class QiCheZhiJia():
    """Scraper that downloads a thread page and de-obfuscates its text.

    Attributes are populated by ``create_font``:

    * ``f_dict`` maps ``&#x....``-style entity prefixes to characters.
    * ``uniWordList`` is the font's full glyph order.
    * ``utf8WordList`` holds the glyph names (minus the first glyph)
      converted from ``uniXXXX`` form to the characters they name.
    """

    # Directory used as an on-disk cache for downloaded font files.
    FONT_DIR = "./fonts"

    # Captures the URL inside the second ``url('...')`` of a font source
    # list, i.e. the text following ``'),url('`` up to the next ``')``.
    _FONT_URL_RE = re.compile(r"'\),url\('(.*?)'\)")

    def __init__(self):
        self.url = "https://club.autohome.com.cn/bbs/thread/bb8c36ced93ce182/74203500-1.html"
        # Replacement characters, paired by index with the font's glyphs
        # (skipping the first glyph).
        # NOTE(review): this assumes the glyph order of every served font
        # matches this table -- confirm against the live site.
        self.hanzi = ['不', '了', '呢', '更', '是', '四', '小', '七', '三', '多',
                      '得', '一', '着', '下', '十', '少', '长', '二', '六', '远',
                      '左', '地', '短', '九', '五', '上', '坏', '很', '右', '低',
                      '高', '矮', '八', '近', '大', '好', '的', '和']
        self.session = HTMLSession()
        self.f_dict = {}
        self.uniWordList = []
        self.utf8WordList = []

    def create_font(self, font_url):
        """Load the font at *font_url* and build the glyph lookup tables.

        The font is cached under ``FONT_DIR`` keyed by the last path
        segment of the URL and is only downloaded when not already there.

        Args:
            font_url: Absolute URL of the font file.

        Raises:
            requests.HTTPError: If the download returns an error status.
                Raised before anything is written, so a failed response
                never ends up in the cache.
        """
        font_file = font_url.split('/')[-1]
        os.makedirs(self.FONT_DIR, exist_ok=True)
        font_path = os.path.join(self.FONT_DIR, font_file)

        if not os.path.exists(font_path):
            # Not cached yet: download the font into the cache directory.
            print('不在字体库中, 下载:', font_file)
            response = self.session.get(font_url)
            response.raise_for_status()
            with open(font_path, 'wb') as f:
                f.write(response.content)

        font = TTFont(font_path)
        glyph_order = font.getGlyphOrder()
        # The first glyph is skipped before pairing with the table.
        mapped_glyphs = glyph_order[1:]

        # zip() stops at the shorter sequence, so a font with more glyphs
        # than the table no longer raises IndexError.
        for gly, hanzi in zip(mapped_glyphs, self.hanzi):
            self.f_dict.setdefault(gly.lower().replace('uni', '&#x'), hanzi)

        self.uniWordList = glyph_order
        # Turn a glyph name such as "uniXXXX" into the character U+XXXX by
        # rewriting it as a "\uXXXX" escape and decoding that escape.
        self.utf8WordList = [
            uniWord.replace("uni", r"\u").encode('utf-8').decode("unicode-escape")
            for uniWord in mapped_glyphs
        ]
        print(self.utf8WordList)

    def run(self):
        """Fetch the thread, decode the obfuscated text, and print it."""
        req = self.session.get(self.url)
        source = req.text
        font_url = self.parse(source)
        self.create_font(font_url)

        info = req.html.xpath(
            "//div[@class='tz-paragraph' and string-length(text())>1]//text()")
        print(info)

        elem = "".join(info)
        # Replace each custom-font character with its table entry.
        for glyph_char, hanzi in zip(self.utf8WordList, self.hanzi):
            elem = elem.replace(glyph_char, hanzi)
        print(elem)

    def parse(self, source):
        """Extract the font URL from the page source.

        Args:
            source: Text of the thread page.

        Returns:
            The first captured URL with ``"http:"`` prepended.

        Raises:
            IndexError: If the page contains no matching font reference.
        """
        matches = self._FONT_URL_RE.findall(source)
        if not matches:
            raise IndexError("no font URL found in page source")
        return "http:" + matches[0]


if __name__ == '__main__':
    QiCheZhiJia().run()
# 有疑问可以加微信(wx:18179641802)进行探讨。