根据指定汉语关键字获取语料数据

import time

from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait



# Module-level Chrome session used by get_content().
option = ChromeOptions()
# Pass a desktop Chrome user-agent string to the browser as a launch argument.
option.add_argument(
    'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36"'
)
# NOTE: the driver is created at import time, so importing this module
# starts a browser as a side effect.
browser = webdriver.Chrome(options=option)
browser.maximize_window()  # maximize the browser window


def get_content(keyword):
    """Search the corpus site for *keyword* and save every page of results.

    Loads the search page in the module-level ``browser``, types *keyword*
    into the query box, selects the ``RadioButtonLIKE`` option and submits
    with ENTER. It then walks the result pages by clicking the "下一页"
    (next page) link. On each page the text of the result spans is collected,
    every three consecutive spans are joined into one record, and each record
    is passed to ``save_keyword_info``. If the first result page contains no
    matching spans, the keyword is recorded through ``invalid_keyword``.

    Args:
        keyword: Search term to type into the query box.

    Raises:
        selenium.common.exceptions.WebDriverException: If the search form
            cannot be located, or the result panel does not appear within
            8 seconds of submitting the search. Failures while paging are
            reported and end the paging loop instead of propagating.
    """
    url = 'http://corpus.zhonghuayuwen.org/ACindex.aspx'
    browser.get(url)
    # find_element(By.ID, ...) is used because the find_element_by_* helpers
    # were removed from Selenium 4.3 onwards.
    input_tag = browser.find_element(By.ID, 'TextBoxACkeywords')  # search box
    input_tag.send_keys(keyword)
    browser.find_element(By.ID, 'RadioButtonLIKE').click()  # search-mode option
    input_tag.send_keys(Keys.ENTER)
    # Explicit wait for the result panel to be present in the DOM.
    WebDriverWait(browser, 8).until(EC.presence_of_element_located((By.ID, 'PanellSResults')))

    step = 3  # number of consecutive spans joined into one saved record
    current_page = 0
    while True:
        try:
            tree = etree.HTML(browser.page_source)
            text_lists = tree.xpath('//*[@id="PanellSResults"]/div/span[position()>3]')
            if not text_lists:
                # Only a keyword with no results at all is "invalid"; an empty
                # page reached after earlier pages were saved just ends paging.
                if current_page == 0:
                    print('检索不到:《{}》关键字的语料信息。\n'.format(keyword))
                    invalid_keyword(keyword)
                break

            current_page += 1
            print('\n------------------------------当前关键字:《{}》,当前页码:{}------------------------------。'.format(keyword,current_page))
            lists = [''.join(span.xpath('.//text()')) for span in text_lists]
            for k in range(0, len(lists), step):
                save_keyword_info(''.join(lists[k:k + step]))

            time.sleep(2)
            # A missing "next page" link raises NoSuchElementException, which
            # is the normal end of pagination.
            browser.find_element(By.LINK_TEXT, '下一页').click()
            WebDriverWait(browser, 8).until(EC.presence_of_element_located((By.ID, 'PanellSResults')))
        except NoSuchElementException:
            break
        except WebDriverException as exc:
            # Any other browser failure (timeout, stale element, ...) stops
            # paging for this keyword, but is reported rather than hidden.
            print('抓取关键字:《{}》第{}页后翻页出错,已停止翻页:{}'.format(keyword, current_page, exc))
            break


def invalid_keyword(keyword):
    """Append a keyword that produced no search results to ``invalid_data.txt``.

    The file is written as UTF-8 explicitly, matching ``corpus_data_01.txt``.
    Keywords are Chinese characters, and a platform default encoding that
    cannot represent them would raise ``UnicodeEncodeError``.

    Args:
        keyword: The search term to record, written on its own line.
    """
    with open('invalid_data.txt', 'a', encoding='utf-8') as f:
        f.write(keyword + '\n')


def save_keyword_info(text_info):
    """Append one corpus record to ``corpus_data_01.txt`` and echo it to stdout.

    Args:
        text_info: The record text; it is written followed by a newline.
    """
    with open('corpus_data_01.txt', mode='a+', encoding='utf-8') as corpus_file:
        corpus_file.writelines([text_info, '\n'])
    print(text_info)

def read_text():
    """Read keywords from ``生僻字++.txt`` and scrape the corpus for each one.

    The file is expected to hold one keyword per line. Every line is stripped
    of surrounding whitespace and blank lines are skipped, so an empty search
    is never submitted. All keywords are read before scraping starts, so the
    file is not held open during the run. After each keyword the function
    sleeps for 6 seconds before moving on to the next one.
    """
    # 'utf-8-sig' reads plain UTF-8 unchanged and also drops a leading BOM,
    # which would otherwise become part of the first keyword.
    with open('生僻字++.txt', 'r', encoding='utf-8-sig') as f:
        keywords = [line.strip() for line in f]

    for keyword in keywords:
        if not keyword:
            continue
        print('\n开始抓取关键字:《{}》。'.format(keyword))
        get_content(keyword)
        time.sleep(6)





if __name__ == '__main__':
    try:
        read_text()
    finally:
        # Always close the browser and end the driver session, even when the
        # run fails or is interrupted part-way through.
        browser.quit()

 

posted @ 2023-03-17 17:09  lvye001  阅读(21)  评论(0)  编辑  收藏  举报