根据指定汉语关键字获取语料数据
"""Scrape corpus sentences for Chinese keywords from corpus.zhonghuayuwen.org.

Keywords are read one per line from ``生僻字++.txt``. Every result page of each
keyword is appended to ``corpus_data_01.txt``; keywords that yield no results
are appended to ``invalid_data.txt``.
"""
import time

from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

option = ChromeOptions()
option.add_argument(
    'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36"'
)
browser = webdriver.Chrome(options=option)
browser.maximize_window()  # maximize the browser window


def get_content(keyword):
    """Search the corpus site for *keyword* and save every page of results.

    Each result page is parsed from ``browser.page_source``; the texts of the
    matched ``<span>`` elements are joined in groups of three and passed to
    ``save_keyword_info``. Paging continues by clicking the "下一页" link until
    the link is missing, a wait times out, or a page has no result spans.
    If the very first page has no result spans, the keyword is recorded via
    ``invalid_keyword``.

    Args:
        keyword: The search term typed into the site's search box.

    Raises:
        selenium.common.exceptions.WebDriverException: If the search page
            cannot be loaded or driven (including ``TimeoutException`` when
            the results panel does not appear after the initial search).
        OSError: If an output file cannot be written.
    """
    url = 'http://corpus.zhonghuayuwen.org/ACindex.aspx'
    browser.get(url)
    # find_element(By..., ...) replaces the find_element_by_* helpers, which
    # were removed in Selenium 4.3; this form also works on Selenium 3.
    input_tag = browser.find_element(By.ID, 'TextBoxACkeywords')  # search box
    input_tag.send_keys(keyword)  # type the keyword
    browser.find_element(By.ID, 'RadioButtonLIKE').click()  # pick the radio option
    input_tag.send_keys(Keys.ENTER)  # submit the search
    # Explicit wait for the results panel.
    WebDriverWait(browser, 8).until(
        EC.presence_of_element_located((By.ID, 'PanellSResults'))
    )

    current_page = 0
    step = 3  # number of consecutive spans joined into one saved line
    while True:
        try:
            html = etree.HTML(browser.page_source)
            spans = html.xpath('//*[@id="PanellSResults"]/div/span[position()>3]')
            if not spans:
                # Only flag the keyword as invalid when nothing was found at
                # all; an empty later page just means paging is finished.
                if current_page == 0:
                    print('检索不到:《{}》关键字的语料信息。\n'.format(keyword))
                    invalid_keyword(keyword)
                break

            current_page += 1
            print('\n------------------------------当前关键字:《{}》,当前页码:{}------------------------------。'.format(keyword, current_page))
            texts = [''.join(span.xpath('.//text()')) for span in spans]
            for start in range(0, len(texts), step):
                save_keyword_info(''.join(texts[start:start + step]))

            time.sleep(2)
            # Click through to the next page.
            browser.find_element(By.LINK_TEXT, '下一页').click()
            # NOTE(review): this wait may return immediately if the previous
            # page's panel is still in the DOM — confirm the new page has
            # actually loaded before it is parsed.
            WebDriverWait(browser, 8).until(
                EC.presence_of_element_located((By.ID, 'PanellSResults'))
            )
        except NoSuchElementException:
            # No "下一页" link: treat as the last page.
            break
        except TimeoutException:
            print('等待《{}》下一页结果超时,停止翻页。'.format(keyword))
            break
        except WebDriverException as exc:
            # Keep the run going for the remaining keywords, but do not hide
            # the failure the way a bare ``except`` would.
            print('抓取《{}》时浏览器出错,停止翻页:{}'.format(keyword, exc))
            break


def invalid_keyword(keyword):
    """Append *keyword* to ``invalid_data.txt`` (keywords with no results)."""
    # Explicit UTF-8: the platform default encoding may be unable to encode
    # rare Chinese characters.
    with open('invalid_data.txt', 'a', encoding='utf-8') as f:
        f.write(keyword + '\n')


def save_keyword_info(text_info):
    """Append one line of corpus text to ``corpus_data_01.txt`` and print it."""
    with open('corpus_data_01.txt', 'a', encoding='utf-8') as f:
        f.write(text_info + '\n')
    print(text_info)


def read_text():
    """Read keywords from ``生僻字++.txt`` and scrape each one in turn."""
    with open('生僻字++.txt', 'r', encoding='utf-8') as f:
        data_lists = f.readlines()
    for line in data_lists:
        keyword = line.strip()
        if not keyword:
            # Skip blank lines instead of submitting an empty search.
            continue
        print('\n开始抓取关键字:《{}》。'.format(keyword))
        get_content(keyword)
        time.sleep(6)  # pause between keywords


if __name__ == '__main__':
    try:
        read_text()
    finally:
        # Always shut down the browser and driver, even after an error.
        browser.quit()
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 分享一个免费、快速、无限量使用的满血 DeepSeek R1 模型,支持深度思考和联网搜索!
· 基于 Docker 搭建 FRP 内网穿透开源项目(很简单哒)
· 25岁的心里话
· ollama系列01:轻松3步本地部署deepseek,普通电脑可用
· 按钮权限的设计及实现