爬虫起点css破解
#均为个人原创,转载请注明出处:https://www.cnblogs.com/HugJun/p/11506270.html
import pprint
import re
import time
from io import BytesIO

# NOTE: the third-party packages (``requests`` and ``fontTools``) are imported
# inside the functions that need them, so the pure parsing code in this module
# can be imported and exercised without network or font dependencies.

# Seconds to wait for the server before giving up on an HTTP request.
REQUEST_TIMEOUT = 10

# The page displays large counts in units of "万" (ten thousand).
_WAN = 10000

# Glyph names used by the anti-scraping font -> the character they draw.
_GLYPH_TO_CHAR = {
    'zero': '0',
    'one': '1',
    'two': '2',
    'three': '3',
    'four': '4',
    'five': '5',
    'six': '6',
    'seven': '7',
    'eight': '8',
    'nine': '9',
    'period': '.',
}

# First inline <style> block of the page (it carries the @font-face rule).
_STYLE_RE = re.compile(r'<style>(.*?)\s*</style>', re.S)
# Quoted URL that sits between the "woff" and "truetype" entries of the rule.
_FONT_URL_RE = re.compile(r"woff.*?url.*?'(.+?)'.*?truetype")


def get_font(url):
    """Download the obfuscation font and return its character map.

    :param url: address of the font file referenced by the page.
    :return: the mapping produced by ``TTFont.getBestCmap()``
        (code point -> glyph name).
    :raises requests.HTTPError: if the server answers with an error status.
    """
    import requests
    from fontTools.ttLib import TTFont

    resp = requests.get(url, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    font = TTFont(BytesIO(resp.content))
    try:
        return font.getBestCmap()
    finally:
        # Release the font even when reading the cmap fails.
        font.close()


def get_html_info(url):
    """Fetch a book page and locate the font that obfuscates its numbers.

    :param url: address of the book's info page.
    :return: ``(font_url, page_source)``.
    :raises requests.HTTPError: if the server answers with an error status.
    :raises ValueError: if the page has no ``<style>`` block or the block
        does not contain a font URL in the expected form.
    """
    import requests

    headers = {
        # The value must not repeat the header name itself.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    html_data = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    html_data.raise_for_status()
    page = html_data.text

    style_match = _STYLE_RE.search(page)
    if style_match is None:
        raise ValueError('no <style> block found in page: ' + url)

    font_match = _FONT_URL_RE.search(style_match.group(1))
    if font_match is None:
        raise ValueError('no font url found in <style> block of page: ' + url)

    url_ttf = font_match.group(1)
    print(url_ttf)
    return url_ttf, page


def _extract_count(data, label):
    """Return the integer count shown in front of *label* in *data*.

    The number is multiplied by ten thousand only when the label is actually
    preceded by the "万" unit in the page.
    """
    pattern = (r'<span class="[^"]+">([^<]+)</span></em><cite>(万?)'
               + re.escape(label))
    match = re.search(pattern, data)
    if match is None:
        raise ValueError('count for "{}" not found in page'.format(label))
    number, unit = match.groups()
    scale = _WAN if unit else 1
    # round() instead of int(): float error must not drop the last digit.
    return int(round(float(number) * scale))


def get_encode_font(data, web_font_relation):
    """Decode the font-obfuscated numbers of a book page and report them.

    Every ``&#<code point>;`` entity whose glyph is a digit or a period is
    replaced by that character; glyphs outside that set are ignored. The
    three counters are then parsed, printed and returned.

    :param data: page source.
    :param web_font_relation: mapping of code point -> glyph name.
    :return: ``(total_num, total_recommend, week_recommend)``.
    :raises ValueError: if a counter is missing or is not a number.
    """
    for code_point, glyph in web_font_relation.items():
        char = _GLYPH_TO_CHAR.get(glyph)
        if char is None:
            # Not a digit glyph (e.g. ".notdef"); nothing to decode.
            continue
        entity = '&#{};'.format(code_point)
        if entity in data:
            print(code_point, glyph)
            data = data.replace(entity, char)

    total_num = _extract_count(data, '字')  # total word count
    total_recommend = _extract_count(data, '总推荐')  # total recommendations
    week_recommend = _extract_count(data, '周推荐')  # weekly recommendations
    print(total_num, total_recommend, week_recommend)
    return total_num, total_recommend, week_recommend


def main(url):
    """Scrape one book page and print its decoded counters.

    :param url: address of the book's info page.
    :return: the tuple returned by :func:`get_encode_font`.
    """
    font_url, data = get_html_info(url)
    web_font_relation = get_font(font_url)
    return get_encode_font(data, web_font_relation)


# Script entry point.
if __name__ == '__main__':
    url = 'https://book.qidian.com/info/1115277'  # the novel to inspect
    main(url)
本文章仅供学习参考,如有版权侵犯,请联系作者修改,转载请注明出处!