公告

总体思路

破解字体反爬：

　　1. 获取字体文件，

　　2.Unicode解码字体文件，保存为ttf文件，

　　3.打开ttf文件，查看映射关系创建映射字典（字形与实际字体之间的关系）（或者可以设定自动识别）

　　4.通过code与name的关系，寻找name与字形之间的关系，得到code（需要做hex）与字形关系之间的关系

　　5.第二次获取响应时，直接通过code与字形之间的关系，得到实际的字体，并将code替换成实际字体

破解过程

可以明显的看到显示正常但html中的是乱码。

1.获取字体文件

如上所示，字体文件隐藏在css中，点击右上角的style。

上图中那一段密文就是我们需要的ttf文件（这一段密文是可变的，但其中的映射关系不变）

获取那段代码然后保存为ttf文件，查看字形与实际字体的对应关系

进行base64解码（为什么是bs64呢，因为font-face中标明了是base64）并保存为ttf文件，使用Fontcreator打开，可以看到字形与实际字体的对应关系，也就是映射

因为是自定义的字形，且个数不多，手动建立映射表

这里我手动建：

    font_dict = {
        'glyph1': '0',
        'glyph2': '1',
        'glyph3': '2',
        'glyph4': '3',
        'glyph5': '4',
        'glyph6': '5',
        'glyph7': '6',
        'glyph8': '7',
        'glyph9': '8',
        'glyph10': '9',
    }

2.将ttf文件保存为xml文件，通过xml文件分析code，name，字形之间的关系

import re
import requests
import base64
from fontTools.ttLib import TTFont
import io

def get_ttf():
    url = 'https://cd.58.com/chuzu/'
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
    }
    page_text = requests.get(url,headers).text
    font_ttf = re.search(r"charset=utf-8;base64,(.*?)\'",page_text).group(1)
    font = TTFont(io.BytesIO(base64.b64decode(font_ttf))) # 放入到内存中，以字节形式存在
    font.saveXML("wuba.xml")


def main():
    get_ttf()


if __name__ == '__main__':
    main()

分析xml文件

回顾一下上文中提到的三个关系：code ——>name,name——>字形，字形——>实际字体（值）

前面说过：code与name是变化的，但字形与实际值是不变的，然后我们分析一下name与字形之间的关系

name：glyph0001

字形：glyph1

很明显中间多了几个零而已，处理一下，直接让code与字形对应起来，下一次请求过来时，直接通过code找到字形，最终找到值做code替换。

处理转换一下数据：

梳理一下：当获取到响应的html时，我们会先拿到ttf文件，然后通过fonttools模块处理code，转换name为字形，通过映射字典自动转换为值，这样code与值就直接对应上了

到这里我们还需要处理一下code，因为现在得到的code与html中的文本是有一点小差别的

可以看到0变成了&#后面多了"；"

解析来，code直接映射值，然后替换html中的文本

# -*- coding: utf-8 -*-
# __author__ = "maple"
import re
import requests
import base64
from fontTools.ttLib import TTFont
import io

def get_ttf():
    url = 'https://cd.58.com/chuzu/'
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
    }
    font_dict = {
        'glyph1': '0',
        'glyph2': '1',
        'glyph3': '2',
        'glyph4': '3',
        'glyph5': '4',
        'glyph6': '5',
        'glyph7': '6',
        'glyph8': '7',
        'glyph9': '8',
        'glyph10': '9',
    }
    page_text = requests.get(url,headers).text
    font_ttf = re.search(r"charset=utf-8;base64,(.*?)\'",page_text).group(1)
    font = TTFont(io.BytesIO(base64.b64decode(font_ttf))) # 放入到内存中，以字节形式存在
    code_name = font.getBestCmap()
    for code,name in code_name.items():
        # 对code做哈希，将数字转换为对应的码
        code = hex(code).replace("0","&#") +";"
        # 自动将name转换为字形
        name = name.split("000")[0]+str(int(name.split("000")[-1]))
        code_value = font_dict[name]  # 通过name找到值 
        page_text = page_text.replace(code,code_value) # 执行替换操作，将code替换为值
        


def main():
    get_ttf()


if __name__ == '__main__':
    main()

3.数据解析

最后写入数据库或这redis即可

代码：

# -*- coding: utf-8 -*-
# __author__ = "maple"
import re
import requests
import base64
from fontTools.ttLib import TTFont
from lxml import etree
import io

def get_ttf():
    url = 'https://cd.58.com/chuzu/'
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
    }
    font_dict = {
        'glyph1': '0',
        'glyph2': '1',
        'glyph3': '2',
        'glyph4': '3',
        'glyph5': '4',
        'glyph6': '5',
        'glyph7': '6',
        'glyph8': '7',
        'glyph9': '8',
        'glyph10': '9',
    }
    page_text = requests.get(url,headers).text
    font_ttf = re.search(r"charset=utf-8;base64,(.*?)\'",page_text).group(1)
    font = TTFont(io.BytesIO(base64.b64decode(font_ttf))) # 放入到内存中，以字节形式存在
    code_name = font.getBestCmap()
    for code,name in code_name.items():
        # 对code做哈希，将数字转换为对应的码
        code = hex(code).replace("0","&#") +";"
        # 自动将name转换为字形
        name = name.split("000")[0]+str(int(name.split("000")[-1]))
        code_value = font_dict[name]  # 通过name找到值
        page_text = page_text.replace(code,code_value) # 执行替换操作，将code替换为值
    tree = etree.HTML(page_text)
    for li in tree.xpath("/html/body/div[6]/div[2]/ul/li"):
        try:
            # 标题
            title = li.xpath("./div[@class='des']/h2/a/text()")[0].replace(" ", "").strip('\n')
            # 房屋类型及面积
            room = li.xpath("./div[@class='des']/p[@class='room']/text()")[0].replace(" ", "").split(
                "\xa0")  # \xa0是不间断空白符 &nbsp;
            # 核验
            check_status = li.xpath("./div[@class='des']/p[@class='room']/i/text()")[0]
            # 地点
            infor = "".join(li.xpath("./div[@class='des']/p[@class='infor']//text()")).replace("\xa0", "").replace("\n",
                                                                                                                   "").replace(
                " ", "").replace(":", "\:")
            # 来源
            gongyu = "".join(li.xpath("./div[@class='des']/p[3]//text()|./div[@class='des']/div//text()")).replace(" ",
                                                                                                                   "").replace(
                "\n", " ")
            # 价格
            price = "".join(li.xpath('./div[@class="list-li-right"]/div[@class="money"]//text()')).replace(" ",
                                                                                                           "").replace(
                "\n", " ")
            # 发布时间
            up_date = "".join(li.xpath('./div[@class="list-li-right"]/div[@class="send-time"]//text()')).replace(" ",
                                                                                                                 "").replace(
                "\n", " ")
            # 来源
            # print(title)

            type_room = room[0]  # 房子类型：几室几厅
            area_room = room[-1].strip("\n")  # 面积m**2
            print(title,check_status,infor,gongyu,price,up_date,type_room,area_room)
        except Exception:
            pass
def main():
    get_ttf()


if __name__ == '__main__':
    main()

使用多任务异步协程

# -*- coding: utf-8 -*-
# __author__ = "maple"

# -*- coding: utf-8 -*-
# __author__ = "maple"
import io
import re
import requests
import base64
import pymysql
import json
import asyncio
import aiohttp
from fontTools.ttLib import TTFont
from lxml import etree
# 数据库连接
conn = pymysql.connect(host='localhost', user='root', password='123456', db='wuba', charset='utf8')


async def get_ttf(url):
    """
    获取ttf文件,ttf文件中字形与实际字体对应关系不变
    :return:返回字体文件和响应文本
    """

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36",
    }
    async with aiohttp.ClientSession() as sess:
        async with await sess.get(url=url, headers=headers) as response:
            page_text = await response.text()
            font_sert =  re.search(r"charset=utf-8;base64,(.*?)\'", page_text).group(1)
            font = base64.b64decode(font_sert)
            return font, page_text


def page_source(task):
    """
    回调函数，处理文本，数据解析
    :param font:
    :param page_text:
    :return:
    """
    # 字体形状 --> 实际的字体
    font_dict = {
        'glyph1': '0',
        'glyph2': '1',
        'glyph3': '2',
        'glyph4': '3',
        'glyph5': '4',
        'glyph6': '5',
        'glyph7': '6',
        'glyph8': '7',
        'glyph9': '8',
        'glyph10': '9',
    }
    # 获取结果
    font,page_text = task.result()
    font = TTFont(io.BytesIO(font))  # 不保存，从内存中读取
    for code, name in font['cmap'].getBestCmap().items():
        # 38006 glyph00002 code做哈希变为0x993c
        # print(code,name)
        code = str(hex(code)).replace("0", "&#") + ";"
        name = name.split("000")[0] + str(int(name.split("000")[-1]))  # 将glyph00008变成字体形状glyph8
        # &#x993c; glyph8
        real_text = str(font_dict[name])
        page_text = page_text.replace(code, real_text)
    tree = etree.HTML(page_text)
    for li in tree.xpath("/html/body/div[6]/div[2]/ul/li"):
        try:
            # 标题
            title = li.xpath("./div[@class='des']/h2/a/text()")[0].replace(" ", "").strip('\n')
            # 房屋类型及面积
            room = li.xpath("./div[@class='des']/p[@class='room']/text()")[0].replace(" ", "").split(
                "\xa0")  # \xa0是不间断空白符 &nbsp;
            # 核验
            check_status = li.xpath("./div[@class='des']/p[@class='room']/i/text()")[0]
            # 地点
            infor = "".join(li.xpath("./div[@class='des']/p[@class='infor']//text()")).replace("\xa0", "").replace("\n",
                                                                                                                   "").replace(
                " ", "").replace(":", "\:")
            # 来源
            gongyu = "".join(li.xpath("./div[@class='des']/p[3]//text()|./div[@class='des']/div//text()")).replace(" ",
                                                                                                                   "").replace(
                "\n", " ")
            # 价格
            price = "".join(li.xpath('./div[@class="list-li-right"]/div[@class="money"]//text()')).replace(" ",
                                                                                                           "").replace(
                "\n", " ")
            # 发布时间
            up_date = "".join(li.xpath('./div[@class="list-li-right"]/div[@class="send-time"]//text()')).replace(" ",
                                                                                                                 "").replace(
                "\n", " ")
            # 来源
            # print(title)

            type_room = room[0]  # 房子类型：几室几厅
            area_room = room[-1].strip("\n")  # 面积m**2
            items = {
                "title": title,
                "check_status": check_status,
                "infor": infor,
                "gongyu": gongyu,
                "price": price,
                "up_date": up_date,
                "type_room": type_room,
                "area_room": area_room,
            }
            json.dumps(items)
            # 持久化存储调用
            storage(items, conn)
        except Exception:
            pass

def storage(items, conn):
    """
    持久化存储
    :param items:
    :param conn:
    :return:
    """
    ex = conn.cursor()
    title = items['title']
    type_room = items['type_room']
    area_room = items['area_room']
    infor = items['infor']
    gongyu = items['gongyu']
    price = items['price']
    check_status = items['check_status']
    up_date = items['up_date']

    sql = r"""
    insert into chuzu(title,type_room,area_room,infor,gongyu,price,check_status,up_date) values('%s','%s','%s','%s','%s','%s','%s','%s')
    """ % (title, type_room, area_room, infor, gongyu, price, check_status, up_date)
    try:
        ex.execute(sql)
        conn.commit()
    except Exception as e:
        print(e)
        conn.rollback()



def main():
    task_list = []
    for num in range(1, 71):
        url = f'https://cd.58.com/chuzu/pn{num}/'
        # 协程对象
        c = get_ttf(url)
        # 任务对象
        task = asyncio.ensure_future(c)
        # 绑定回调
        task.add_done_callback(page_source)
        # 将任务对象放到列表中，后续提交给任务循环对象
        task_list.append(task)
    # 创建事件循环对象
    loop = asyncio.get_event_loop()
    # 提交并执行
    loop.run_until_complete(asyncio.wait(task_list))


if __name__ == '__main__':
    main()

成果：

posted on 2020-12-01 11:22 kindvampire 阅读(727) 评论(0) 编辑收藏举报

刷新页面返回顶部