小学生学习汉字,汉字抓取
实现的关键主要在于正则表达式。
# Scrape per-character data for Chinese characters from Baidu Hanyu.
#
# For every Chinese character in ``words`` the script downloads the matching
# Baidu Hanyu page, extracts the radical, pinyin, related terms and the
# stroke-order GIF URL with regular expressions, saves the GIF under ``gif/``
# and finally stores all parsed fields in ``words.json``.

import json
import os
import re
import time

words = "泉眼无声惜细流,树荫照水弄轻柔。小荷才露尖尖角,早有蜻蜓立上头。"
# Cache of already-scraped characters: {character: info dict from char_info}.
words_info = {}

_PAGE_URL = 'https://hanyu.baidu.com/s?wd={0}&ptype=zici'
_GIF_DIR = "gif"
# Seconds to wait for each HTTP response; without it a stalled connection
# would block the whole run forever.
_TIMEOUT = 10

# Dots are escaped so that only the real CDN host and a ".gif" suffix match.
_GIF_URL_RE = re.compile(
    r'https://hanyu-word-gif\.cdn\.bcebos\.com/\w{30,}\.gif', re.IGNORECASE)
_RELATED_TERM_RE = re.compile(r"<a [^>]*>([^<]*)</a>", re.IGNORECASE)


def _section(html, marker, end_tag):
    """Return the part of *html* from *marker* up to the next *end_tag*.

    Raises:
        ValueError: if *marker* does not occur in *html*.
    """
    start = html.find(marker)
    if start < 0:
        # An explicit exception instead of ``assert``: asserts are stripped
        # under ``python -O`` and find()'s -1 would then slice garbage.
        raise ValueError(f"marker not found in page: {marker!r}")
    end = html.find(end_tag, start)
    if end < 0:
        # No closing tag: fall back to the rest of the document.
        end = len(html)
    return html[start:end]


def getWords(words, delay=0.0):
    """Scrape every Chinese character in *words* into ``words_info``.

    Characters outside U+4E00..U+9FFF and characters already present in
    ``words_info`` are skipped. For each remaining character the page is
    fetched and parsed with ``char_info``, and the stroke-order GIF (when a
    URL was found) is written to ``gif/<char>.gif``.

    Args:
        words: any iterable of single characters (typically a string).
        delay: seconds to sleep after each scraped character; 0 disables
            the pause.

    Returns:
        The module-level ``words_info`` dict, updated in place.
    """
    # Third-party dependency, imported here so that char_info() stays usable
    # (and importable) without it.
    import requests

    for char in words:
        # Only handle Chinese characters.
        if not ('\u4e00' <= char <= '\u9fff'):
            continue
        if char in words_info:
            print("\t 已经采集过,跳过")
            continue

        res = requests.get(_PAGE_URL.format(char), timeout=_TIMEOUT)
        html = res.text
        print(f"@{time.time()}")
        print(f"开始采集加载分析汉字:[{char}]")
        info = char_info(char, html)

        gif_url = info["tupian"]
        if gif_url:
            gif_res = requests.get(gif_url, timeout=_TIMEOUT)
            # Do not store an HTTP error page as if it were the GIF.
            gif_res.raise_for_status()
            os.makedirs(_GIF_DIR, exist_ok=True)
            # os.path.join keeps the path portable; the former "gif\..."
            # literal only worked on Windows.
            with open(os.path.join(_GIF_DIR, f"{char}.gif"), "wb") as fs:
                fs.write(gif_res.content)
        else:
            print(f"{char}: no stroke-order GIF URL found, download skipped")

        # Record the character only after its GIF step finished, so a failed
        # download is retried on the next run instead of being skipped.
        words_info[char] = info
        print(f"{char} write sussful!采集完成")

        # Optional pause to avoid being blocked for scraping too fast.
        if delay > 0:
            time.sleep(delay)
    return words_info


def char_info(name: str, html: str) -> dict:
    """Extract the fields of one character from its page HTML.

    Args:
        name: the character the page belongs to.
        html: the page source.

    Returns:
        A dict with the keys ``name``, ``bushou`` (radical), ``pinyin``,
        ``zhuci`` (list of related terms) and ``tupian`` (stroke-order GIF
        URL). A field whose pattern does not match is ``""`` (``[]`` for
        ``zhuci``).

    Raises:
        ValueError: if one of the expected section markers is missing.
    """
    info = {"name": name}

    # Radical
    match = re.search(r"<span>(.{1})</span>",
                      _section(html, '<li id="radical">', "</li>"))
    info["bushou"] = match.group(1) if match else ""
    print("部首:", info["bushou"])

    # Pinyin
    match = re.search(r"<b>(.*)</b>",
                      _section(html, '<li id="tone_py">', "</li>"))
    info["pinyin"] = match.group(1) if match else ""
    print("拼音:", info["pinyin"])

    # Related terms: the text of every link in the "related_term" section.
    info["zhuci"] = _RELATED_TERM_RE.findall(_section(
        html,
        '<h1><b class="title" id="related_term">相关组词</b></h1>',
        "</div>"))
    print("组词", info["zhuci"])

    # Stroke-order GIF
    match = _GIF_URL_RE.search(
        _section(html, '<img id="word_bishun" class="bishun"', "</div>"))
    info["tupian"] = match.group(0) if match else ""
    print("笔画顺序:", info["tupian"])
    return info


if __name__ == '__main__':
    # Resume from a previous run when a JSON object was saved.
    if os.path.exists('words.json'):
        with open('words.json', 'r', encoding="utf-8") as fs:
            json_text = fs.read()
        if json_text.startswith("{"):
            words_info.update(json.loads(json_text))
    try:
        getWords(words)
    finally:
        # Save whatever was collected, even when the run aborted part-way.
        json_text = json.dumps(words_info, ensure_ascii=False)
        with open('words.json', 'w', encoding="utf-8") as fs:
            fs.write(json_text)