1109-朝代年号标准化|诗词时间提取

朝代年号标准化

朝代年号来源

 

 代码

import requests
from bs4 import BeautifulSoup
from lxml import etree
import re

# Browser-like User-Agent header sent with the page request.
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}


url = 'http://www.360doc.com/content/15/0610/22/8378385_477251847.shtml'
# Bound the wait and fail on HTTP errors instead of parsing an error page.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()
content = r.content.decode('utf-8')
soup = BeautifulSoup(content, 'html.parser')
table = soup.find('table', class_='MsoNormalTable')
if table is None:
    raise RuntimeError('table.MsoNormalTable not found in ' + url)
# If the markup carries no explicit <tbody>, search the table itself.
tr_list = (table.tbody or table).find_all('tr')

# One entry per table row; each entry is the list of cell texts of that row.
desty_all = []
# The first two rows are skipped (presumably title/header rows -- confirm
# against the page).
for tr in tr_list[2:]:
    row = []
    for cell in tr.find_all('p', class_='MsoNormal'):
        text = str(cell.text)
        # Cells containing a digit get non-breaking spaces and the '公元'
        # prefix removed; all other cells are kept verbatim.
        # NOTE(review): the original chain ended with `.replace('', '')`,
        # a no-op as written; if a specific character was meant to be
        # stripped, add it here.
        if re.search(r'\d', text):
            text = text.replace('\xa0', '').replace('公元', '')
        row.append(text)
    desty_all.append(row)
# print(desty_all)
#
import xlwt

xl = xlwt.Workbook()
# cell_overwrite_ok=True lets a cell be written more than once without error.
sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)

# Header row.
for col, title in enumerate(("emperor", 'year_hao', 'time')):
    sheet1.write(0, col, title)

# One sheet row per scraped row, placed below the header. Only the first
# three cells are written; a scraped row with fewer than three cells fills
# the columns it has instead of raising IndexError.
for row_idx, row in enumerate(desty_all, start=1):
    for col, value in enumerate(row[:3]):
        sheet1.write(row_idx, col, value)

# NOTE(review): xlwt emits the legacy .xls format, so this file has .xls
# content under an .xlsx name -- confirm downstream readers accept that
# before renaming.
xl.save("qing_desty.xlsx")

结果

 

 

 

 诗词时间提取

设计提取手段

 

从这两个字段里提取出对应的时间关键词:优先从背景(background)中提取,背景中提取不到时,再从 appear 中提取

采用 pyhanlp 里的 CRF 实体识别,找出文本中包含的“数量词”和“时间词”;提取后,还需要从中找到包含数字的时间词,作为本诗的作诗时间

代码 

import re
from pyhanlp import *
import pandas as pd
#人名“nr”
#地名“ns”
#机构名“nt”

def demo_CRF_lexical_analyzer(text):
    """Return the time words and numerals found in ``text``.

    The text is segmented with HanLP's "crf" segmenter and every term whose
    nature tag is ``t`` or ``m`` is collected, in order of appearance. The
    result is also printed.

    Args:
        text: The string to analyse. An empty value yields an empty list
            without running the segmenter.

    Returns:
        A list of the matching words as ``str``.
    """
    if not text:
        ans = []
        print(ans)
        return ans

    # Build the segmenter once and reuse it on later calls; creating a new
    # one for every input repeats the same setup work each time.
    segmenter = getattr(demo_CRF_lexical_analyzer, "_segmenter", None)
    if segmenter is None:
        segmenter = HanLP.newSegment("crf")
        demo_CRF_lexical_analyzer._segmenter = segmenter

    ans = [
        str(term.word)
        for term in segmenter.seg(text)
        if str(term.nature) in ('t', 'm')
    ]
    print(ans)
    return ans

from xlrd import open_workbook
from xlutils.copy import copy
#将分类结果重新写入原excel中
def write_to(data,file):
    """Write ``data`` into column index 9 of the first sheet of ``file``.

    The workbook is opened, copied into a writable workbook, given a "data"
    header in row 0 and one value per following row, then saved back to the
    same path. The number of values is printed first.

    Args:
        data: Sequence of cell values; item ``k`` goes to row ``k + 1``.
        file: Path of the workbook to read and overwrite.
    """
    print(len(data))
    writable = copy(open_workbook(file))
    sheet = writable.get_sheet(0)

    sheet.write(0, 9, "data")
    for row, value in enumerate(data, start=1):
        sheet.write(row, 9, value)

    writable.save(file)

# Upper bound on the number of rows processed (the original fixed limit).
MAX_ROWS = 6000


def _first_with_digit(words):
    """Return the first word in ``words`` containing a digit, else None."""
    return next((word for word in words if re.search(r'\d', word)), None)


# One extracted time expression per processed row ("" when none is found).
pome_time = []

file = './data/tang.xlsx'
data = pd.read_excel(file).fillna("")
appear = data.appear
back = data.background
# Stop at the end of the sheet so a workbook with fewer than MAX_ROWS rows
# no longer fails on an out-of-range index.
for i in range(min(MAX_ROWS, len(data))):
    app_time = demo_CRF_lexical_analyzer(appear[i])
    bk_time = demo_CRF_lexical_analyzer(back[i])

    # The background field takes priority; `appear` is only the fallback.
    found = _first_with_digit(bk_time)
    if found is None:
        found = _first_with_digit(app_time)
    if found is None:
        found = ""
    print(found)
    pome_time.append(found)
write_to(pome_time, file)

结果

 

 

 

posted @ 2021-11-09 22:31  清风紫雪  阅读(119)  评论(0)  编辑  收藏  举报