小学生学习汉字，汉字抓取

核心思路：用正则表达式从百度汉语的页面中提取每个汉字的部首、拼音、组词和笔顺动图地址。

# Sample text; every Chinese character in it is looked up individually.
words = (
    "泉眼无声惜细流，树荫照水弄轻柔。"
    "小荷才露尖尖角，早有蜻蜓立上头。"
)

# Collected results keyed by character; populated by getWords().
words_info = dict()
# Alternative query URL kept for reference:
# url_format = 'https://hanyu.baidu.com/s?wd={0}&from=zici'


def getWords(words):
    """Fetch dictionary data and a stroke-order GIF for each Chinese character.

    Characters outside U+4E00..U+9FFF, and characters already present in the
    module-level ``words_info`` dict, are skipped. For every other character
    the Baidu Hanyu page is downloaded, parsed by ``char_info`` and stored in
    ``words_info``. When a GIF URL was found and the download succeeds, the
    image is saved as ``gif/<char>.gif``.

    Args:
        words: Text to process, iterated character by character.

    Returns:
        The module-level ``words_info`` dict mapping each character to its
        info dict.
    """
    # Local imports keep the function usable on its own.
    import os
    import time

    import requests

    url_format = 'https://hanyu.baidu.com/s?wd={0}&ptype=zici'
    gif_dir = "gif"
    timeout = 10  # seconds; without it a stalled connection blocks forever

    for char in words:
        # Only CJK unified ideographs are looked up.
        if not ('\u4e00' <= char <= '\u9fff'):
            continue
        if char in words_info:
            print("\t 已经采集过，跳过")
            continue

        res = requests.get(url_format.format(char), timeout=timeout)
        html = res.text
        print(f"@{time.time()}")
        print(f"开始采集加载分析汉字：[{char}]")
        info = char_info(char, html)
        info["name"] = char
        words_info[char] = info

        # char_info stores "" when no GIF URL was found; requesting an empty
        # URL would raise, so skip the download instead.
        gif_url = info.get("tupian")
        if not gif_url:
            print(f"{char} 没有找到笔顺动图，跳过下载")
            continue

        res = requests.get(gif_url, timeout=timeout)
        if not res.ok:
            # Do not store an HTTP error page under a .gif name.
            print(f"{char} 笔顺动图下载失败：HTTP {res.status_code}")
            continue

        # os.path.join gives a portable path (the old "gif\..." literal only
        # worked on Windows); the directory is created on first use.
        os.makedirs(gif_dir, exist_ok=True)
        fs_name = os.path.join(gif_dir, "{0}.gif".format(char))
        with open(fs_name, "wb") as fs:
            fs.write(res.content)
            print(f"{char} write sussful！采集完成")
        # NOTE: add time.sleep(2) here if the site starts blocking requests
        # for arriving too quickly.
    return words_info


def _section(html: str, marker: str, closing_tag: str) -> str:
    """Return the part of *html* from *marker* up to the next *closing_tag*."""
    start = html.find(marker)
    if start == -1:
        raise ValueError(f"marker not found in page: {marker!r}")
    end = html.find(closing_tag, start)
    # With no closing tag, read to the end of the page; slicing with -1 would
    # silently drop the last character.
    return html[start:] if end == -1 else html[start:end]


def char_info(name: str, html: str) -> dict:
    """Extract radical, pinyin, related words and stroke GIF URL from a page.

    Args:
        name: The character the page describes; stored under ``"name"``.
        html: HTML text of the character's dictionary page.

    Returns:
        A dict with the keys ``"name"``, ``"bushou"`` (radical), ``"pinyin"``,
        ``"zhuci"`` (list of related words) and ``"tupian"`` (stroke-order GIF
        URL). A string value is ``""`` when its section is present but the
        pattern does not match.

    Raises:
        ValueError: If one of the expected section markers is absent from
            *html*. An explicit raise is used because ``assert`` is removed
            when Python runs with ``-O``.
    """
    import re

    info = {"name": name}

    # Radical: a single character inside <span>.
    match = re.search(r"<span>(.{1})</span>",
                      _section(html, '<li id="radical">', "</li>"))
    info["bushou"] = match.group(1) if match else ""
    print("部首：", info["bushou"])

    # Pinyin: non-greedy so that only the first <b> element is captured even
    # when several sit on the same line.
    match = re.search(r"<b>(.*?)</b>",
                      _section(html, '<li id="tone_py">', "</li>"))
    info["pinyin"] = match.group(1) if match else ""
    print("拼音：", info["pinyin"])

    # Related words: the text of every link in the block after the heading.
    related = _section(
        html,
        '<h1><b class="title" id="related_term">相关组词</b></h1>',
        "</div>")
    info["zhuci"] = re.findall(r"<a [^>]*>([^<]*)</a>", related, re.IGNORECASE)
    print("组词", info["zhuci"])

    # Stroke-order GIF: dots are escaped so they match only a literal ".".
    match = re.search(
        r'https://hanyu-word-gif\.cdn\.bcebos\.com/\w{30,}\.gif',
        _section(html, '<img id="word_bishun" class="bishun"', "</div>"),
        re.IGNORECASE)
    info["tupian"] = match.group(0) if match else ""
    print("笔画顺序：", info["tupian"])

    return info


if __name__ == '__main__':
    import json
    import os

    # Resume from a previous run: merge the characters already collected so
    # that getWords() skips them instead of fetching them again.
    if os.path.exists('words.json'):
        with open('words.json', 'r', encoding="utf-8") as fs:
            try:
                saved = json.load(fs)
            except json.JSONDecodeError:
                # Empty or corrupt file: start from scratch, but say so.
                print("words.json 无法解析，忽略已有数据")
                saved = None
        if isinstance(saved, dict):
            words_info.update(saved)

    try:
        getWords(words)
    finally:
        # Save whatever was collected even when a request or parse step fails
        # part-way, so the next run can continue from this point.
        with open('words.json', 'w', encoding="utf-8") as fs:
            json.dump(words_info, fs, ensure_ascii=False)
posted @ 2022-04-24 11:22 BigRain 阅读(61) 评论(0) 编辑收藏举报
刷新页面返回顶部
BigRain

小学生学习汉字，汉字抓取

公告