实战-快手H5字体反爬

实战-快手H5字体反爬

前言#

快手H5端的粉丝数是字体反爬,抓到的html文本是乱码 <SPAN STYLE='FONT-FAMILY: kwaiFont;'>&#xebe9;&#xea80;&#xf6d0;&#xe7c7;&#xed42;&#xeb5e;</SPAN> 可以看到对应的字体格式为 kwaiFont

经过一番分析,发现每次返回的 ttf 文件内容都不太一样,无法自己做一份固定的映射模板,那么就不做模板了。可以通过 OCR 或者 KNN 进行内容识别。本人采用 OCR 方式进行识别。这里推荐一个很好用的 OCR 库:ddddocr。

流程分析#

  1. 找到对应ttf文件
  2. 分析ttf文件,将每个字体转换成图片
  3. 图片识别成文本
  4. 乱码映射

直接上代码#

import re
import ddddocr
import requests
from lxml import etree
from io import BytesIO
from fontTools.ttLib import TTFont
from fontTools.pens.basePen import BasePen
from reportlab.graphics.shapes import Path
from reportlab.lib import colors
from reportlab.graphics import renderPM
from reportlab.graphics.shapes import Group, Drawing


class ReportLabPen(BasePen):
    """
    Pen that forwards fontTools drawing callbacks to a ReportLab ``Path``.

    After a glyph has been drawn with this pen, ``self.path`` holds the
    glyph outline.
    """

    def __init__(self, glyph_set, path=None):
        super().__init__(glyph_set)
        # Draw into the caller-supplied path when given, else a fresh one.
        self.path = Path() if path is None else path

    def _moveTo(self, p):
        px, py = p
        self.path.moveTo(px, py)

    def _lineTo(self, p):
        px, py = p
        self.path.lineTo(px, py)

    def _curveToOne(self, p1, p2, p3):
        # p1 and p2 are the control points, p3 the end point of the segment.
        c1x, c1y = p1
        c2x, c2y = p2
        end_x, end_y = p3
        self.path.curveTo(c1x, c1y, c2x, c2y, end_x, end_y)

    def _closePath(self):
        self.path.closePath()


class KuaiShouSpider(object):
    """
    Scraper for a Kuaishou H5 user profile page.

    The fan / following counters on the page are rendered with a custom web
    font, so the HTML only contains private-use code points. The font file
    is downloaded, each glyph is rendered to an image and read with OCR, and
    the resulting mapping is used to decode the counters.
    """

    # Matches decimal (``&#60393;``) and hexadecimal (``&#xebe9;``) numeric
    # character references; group 1 holds hex digits, group 2 decimal digits.
    _CHAR_REF_RE = re.compile(r'&#(?:[xX]([0-9a-fA-F]+)|(\d+));')

    # Side length of the square canvas each glyph is drawn on.
    _CANVAS_SIZE = 800

    # Seconds to wait on each HTTP request before giving up.
    _REQUEST_TIMEOUT = 10

    def __init__(self):
        # OCR engine used to read the rendered glyph images.
        self.ocr = ddddocr.DdddOcr()

    def _glyph_2_word(self, glyph_set, glyph_name, fmt):
        """Render a single glyph to an image and return the text OCR reads."""
        pen = ReportLabPen(glyph_set, Path(fillColor=colors.black, strokeWidth=5))
        glyph_set[glyph_name].draw(pen)

        drawing = Drawing(self._CANVAS_SIZE, self._CANVAS_SIZE)
        drawing.add(Group(pen.path))
        img = renderPM.drawToString(drawing, fmt)

        word = self.ocr.classification(img)
        # Normalise OCR outputs that are presumably misreads of '+' and '.'.
        if word == '十':
            return '+'
        if word in (',', '。'):
            return '.'
        return word

    def ttf_2_word_map(self, ttf_content, fmt="png"):
        """
        Build a code point -> text mapping for a TTF font using OCR.

        The font's Unicode cmap is walked directly, so glyphs that have no
        code point are ignored and every code point that shares a glyph
        gets an entry.

        :param ttf_content: raw bytes of the TTF file.
        :param fmt: image format handed to ``renderPM.drawToString``.
        :return: dict mapping ``hex(code_point)`` (e.g. ``'0xebe9'``) to the
            text recognised for that glyph.
        """
        font = TTFont(BytesIO(ttf_content))
        glyph_set = font.getGlyphSet()
        # getBestCmap() returns None when the font has no Unicode subtable.
        uni_map = font.getBestCmap() or {}

        # glyph name -> recognised text, so a shared glyph is OCR'd only once.
        glyph_words = dict()
        data_dict = dict()
        for code_point, glyph_name in uni_map.items():
            # Skip '.notdef', '.null'.
            if glyph_name.startswith('.'):
                continue
            if glyph_name not in glyph_words:
                glyph_words[glyph_name] = self._glyph_2_word(glyph_set, glyph_name, fmt)
            data_dict[hex(code_point)] = glyph_words[glyph_name]

        return data_dict

    @staticmethod
    def uni_code_2_word(uni_code, word_map):
        """
        Replace numeric character references in a string with mapped text.

        Both decimal (``&#60393;``) and hexadecimal (``&#xebe9;``) references
        are understood. Text outside of references is returned unchanged.

        :param uni_code: text containing numeric character references.
        :param word_map: dict keyed by ``hex(code_point)``, as produced by
            ``ttf_2_word_map``.
        :return: the decoded text.
        :raises KeyError: if a referenced code point is not in ``word_map``.
        """

        def _sub(match):
            hex_digits, dec_digits = match.groups()
            code_point = int(hex_digits, 16) if hex_digits else int(dec_digits)
            return word_map[hex(code_point)]

        return KuaiShouSpider._CHAR_REF_RE.sub(_sub, uni_code)

    def _decode_counter(self, html, label, word_map):
        """
        Decode the <span> that directly precedes the <span> containing label.

        :return: the decoded text, or None when the page does not have the
            expected structure.
        """
        nodes = html.xpath(
            '//span[contains(text(),"%s")]/preceding-sibling::span[1]' % label
        )
        if not nodes:
            return None
        # etree.tostring defaults to ASCII output, so the obfuscated
        # characters come back as numeric character references.
        markup = etree.tostring(nodes[0]).decode('utf-8')
        inner = re.findall('>(.+?)<', markup)
        if not inner:
            return None
        return self.uni_code_2_word(inner[0], word_map)

    def get_user_info(self):
        """
        Fetch the profile page and print the decoded fan and following counts.

        Prints an error message and returns early when the font URL or the
        counter nodes cannot be found in the page.

        :return: None
        """

        # NOTE(review): the share/captcha tokens in this URL and the Cookie
        # below look session-bound and may need refreshing - confirm.
        url = 'https://c.kuaishou.com/fw/user/ounixiong?fid=0&cc=share_copylink&followRefer=151&shareMethod=TOKEN&kpn=KUAISHOU&subBiz=PROFILE&shareId=16509009682073&shareToken=X-7IIolIHVVgN2bx&shareResourceType=PROFILE_OTHER&shareMode=APP&originShareId=16509009682073&appType=21&shareObjectId=136457866&shareUrlOpened=0&timestamp=1633759010452&captchaToken=HEADCgp6dC5jYXB0Y2hhEscCX569ztU1Y9XCAVp1Q5Rsm1H8fPYfPZBHvTyg5mwPyIQrJSR_j2mphorguzP9cB2sNWhg61OwW_LQEBvnHRS47j0GpmjIBOeqJ9j9kIbNTsXgNSQYZxkdToAm25EKa4ZLXOmE9ez5Bl-UMzRs4P2_g6SzI3fBs1yFvI7_eLd_yFogwimBE5eyopG9qDDm5lFPfSPm0GI6IhqLKpA1VBZd9cjZxsxq4jGlld1vYRxOFyfJis4oFSVM8fpDArN32KQ2pqejgjV8kK42jW-kpg4fl-1g5iWmqSczszEvEdB9s4l3QmQBfztuDSPbGf0yfY-whf93nOynaRmSeLH49sHSaPr_nwcGvjNjqeFdZoTpf2VBLV7mWvkVdthG0yV5Y6BqDPWSr57Js-dvLIcYlyq3gLbNxQOsulNch6o-HQ7dw2CZY006z-_eGhLniyxQb2WiE0ZVkCv0UGAb2gsoBTACTAIL'

        # NOTE(review): advertising 'br' presumably requires a brotli package
        # to be installed for the body to be decoded - confirm.
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Cookie': 'did=web_232e842d3bcd4eceb358abfcf31ec030; didv=1634614098000; sid=e7921611a1cbb9669d28ce19; Hm_lvt_86a27b7db2c5c0ae37fee4a8a35033ee=1634614100; Hm_lpvt_86a27b7db2c5c0ae37fee4a8a35033ee=1634614104',
            'DNT': '1',
            'Host': 'c.kuaishou.com',
            'Pragma': 'no-cache',
            'Referer': 'https://c.kuaishou.com/fw/user/ounixiong?fid=0&cc=share_copylink&followRefer=151&shareMethod=TOKEN&kpn=KUAISHOU&subBiz=PROFILE&shareId=16509009682073&shareToken=X-7IIolIHVVgN2bx&shareResourceType=PROFILE_OTHER&shareMode=APP&originShareId=16509009682073&appType=21&shareObjectId=136457866&shareUrlOpened=0&timestamp=1633759010452',
            'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
            'sec-ch-ua-mobile': '?1',
            'sec-ch-ua-platform': '"Android"',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Mobile Safari/537.36',
        }

        # Without a timeout requests may wait on the server indefinitely.
        response = requests.get(url, headers=headers, timeout=self._REQUEST_TIMEOUT)

        # Locate the TTF file referenced by the page's CSS.
        try:
            ttf_file = re.findall(r'url\((https:.+?\.ttf)\)', response.text)[0]
        except IndexError:
            print('网页访问异常')
            return

        ttf_data = requests.get(ttf_file, timeout=self._REQUEST_TIMEOUT)

        ttf_word = self.ttf_2_word_map(ttf_data.content)

        # Parse the page and decode both counters.
        html = etree.HTML(response.text)

        fans = self._decode_counter(html, "粉丝", ttf_word)
        focus = self._decode_counter(html, "关注", ttf_word)
        if fans is None or focus is None:
            print('网页解析异常')
            return

        print(fans)
        print(focus)


if __name__ == '__main__':
    # Script entry point: build the spider and fetch the user info once.
    KuaiShouSpider().get_user_info()

后记#

可以考虑一下用 KNN的方式根据字体特征进行分类,准备好一些样本,进行训练.

posted @   小伟哥哥~  阅读(1168)  评论(1)  编辑  收藏  举报
编辑推荐:
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
阅读排行:
· 单线程的Redis速度为什么快?
· 展开说说关于C#中ORM框架的用法!
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· Pantheons:用 TypeScript 打造主流大模型对话的一站式集成库
· SQL Server 2025 AI相关能力初探
点击右上角即可分享
微信分享提示
主题色彩