1103-词牌名，合称，诗词形式

词牌名收集

原网页形式

数据收集

import requests
from bs4 import BeautifulSoup
from lxml import etree

# Browser-like request headers sent with every page fetch.
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
# Cipai names collected from all listing pages, in page order.
cipai = []

# The listing is fetched from pages p1 .. p6.
for page in range(1, 7):
    url = 'https://www.xungushici.com/cipais/p' + str(page)
    # A timeout keeps one stalled connection from hanging the whole run.
    r = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an HTTP error instead of parsing an error page.
    r.raise_for_status()
    content = r.content.decode('utf-8')
    soup = BeautifulSoup(content, 'html.parser')

    hed = soup.find('ul', class_='list-unstyled d-flex flex-row flex-wrap align-items-center w-100')
    # Named `badges` rather than `list` so the builtin is not shadowed.
    badges = hed.find_all('li', class_="m-1 badge badge-light")

    # A badge without a link has no name to read, so it is skipped.
    cipai.extend(badge.a.text for badge in badges if badge.a is not None)

import xlwt

# Dump the collected cipai names into a single-sheet workbook.
book = xlwt.Workbook()
# Call the workbook's add_sheet method to create the only sheet.
sheet = book.add_sheet('sheet1', cell_overwrite_ok=True)

# Row 0 is the header; the names follow from row 1 downwards in column 0.
sheet.write(0, 0, "title")
for row, name in enumerate(cipai, start=1):
    sheet.write(row, 0, name)

# NOTE(review): xlwt emits the legacy binary .xls format, so the saved file is
# presumably not a real .xlsx despite its name — confirm downstream readers cope.
book.save("cipai_name.xlsx")

存储形式

诗人合称

原数据网页

数据收集

import requests
from bs4 import BeautifulSoup
from lxml import etree

# Browser-like request headers sent with every page fetch.
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}

# Group names, in the order they appear on the page.
hc = []
# Maps each group name to its authors joined with ",".
dic = {}

url = 'https://www.xungushici.com/authors'
# Site root, prepended to the relative links found on the page.
orign_href = 'https://www.xungushici.com'

# A timeout keeps a stalled connection from hanging the run, and
# raise_for_status stops us from parsing an HTTP error page.
r = requests.get(url, headers=headers, timeout=10)
r.raise_for_status()
content = r.content.decode('utf-8')
soup = BeautifulSoup(content, 'html.parser')

hecheng = soup.find('div', id='divHeCheng')
# Named `badges` rather than `list` so the builtin is not shadowed.
badges = hecheng.find_all('li', class_="m-1 badge badge-light")

# The first badge is deliberately skipped, exactly as the index loop did.
# NOTE(review): presumably it is a heading rather than a real entry — confirm.
for badge in badges[1:]:
    href = orign_href + badge.a['href']
    group_name = badge.a.text
    hc.append(group_name)

    # Fetch the group's own page to read its member authors.
    r2 = requests.get(href, headers=headers, timeout=10)
    r2.raise_for_status()
    soup2 = BeautifulSoup(r2.content.decode('utf-8'), 'html.parser')

    pomdiv = soup2.find('div', class_='col col-sm-12 col-lg-9')
    cards = pomdiv.find_all('div', class_='card mt-3')

    # The second link in each card title is taken as the author name; the
    # first link was read into an unused variable before and is now ignored.
    author_list = [
        card.find('h4', class_='card-title').find_all('a')[1].text
        for card in cards
    ]
    dic[group_name] = ",".join(author_list)

import xlwt

# Dump the group -> authors mapping into a single-sheet workbook.
book = xlwt.Workbook()
# Call the workbook's add_sheet method to create the only sheet.
sheet = book.add_sheet('sheet1', cell_overwrite_ok=True)

# Header row: group name in column 0, joined author names in column 1.
sheet.write(0, 0, "hc")
sheet.write(0, 1, 'author')
for row, group_name in enumerate(hc, start=1):
    sheet.write(row, 0, group_name)
    sheet.write(row, 1, dic[group_name])

# NOTE(review): xlwt emits the legacy binary .xls format, so the saved file is
# presumably not a real .xlsx despite its name — confirm downstream readers cope.
book.save("common_name.xlsx")


# Echo every group with its authors for a quick visual check.
for group_name in hc:
    print(f"{group_name}: {dic[group_name]}")

存储形式

之后将读取该表，对应到诗人表中添加一列合称属性

诗词形式

形式分类

按照每句诗的字数分为：五言（每句五字），七言（每句七字）

按照每首诗的句数：每首四句为绝句，每首八句为律诗。绝句分为：五言绝句和七言绝句；律诗分为：五言律诗和七言律诗

数据处理

新学到一个表格追加使用技巧：

from xlrd import open_workbook
from xlutils.copy import copy
# Write the classification results back into the original Excel file.
def write_to(data,file):
    """Store *data* in column 8 of the first sheet of *file*, saving in place.

    The workbook is opened with xlrd, turned into a writable copy with
    xlutils, and saved back to the same path. Row 0 of column 8 receives the
    header "formal"; data[k] is written to row k + 1.
    """
    print(len(data))
    writable = copy(open_workbook(file))
    sheet = writable.get_sheet(0)

    sheet.write(0, 8, "formal")
    for row, value in enumerate(data, start=1):
        sheet.write(row, 8, value)

    writable.save(file)

数据处理源码

import xlwt
import pandas as pd

# Read the source data and fetch the poem texts.
def read_excel(file):
    """Load the spreadsheet at *file* with pandas and return its ``content`` column."""
    return pd.read_excel(file).content

# Determine the verse form of each poem.
import re

# Punctuation (full-width and ASCII) and whitespace that end one line of verse.
_LINE_SEPARATORS = re.compile(r"[，,。.？?！!；;\s]+")
# Characters per line -> metre label.
_METRE_NAMES = {5: "五言", 7: "七言"}
# Lines per poem -> form label; any other line count gets the bare metre label.
_FORM_NAMES = {4: "绝句", 8: "律诗"}


def _classify_poem(text):
    """Return the form label for one poem, or "无" when it fits no form."""
    lines = [seg for seg in _LINE_SEPARATORS.split(str(text)) if seg]
    if not lines:
        return "无"
    # Every line must have the same length; mixed lengths are not 五言/七言.
    widths = {len(seg) for seg in lines}
    if len(widths) != 1:
        return "无"
    metre = _METRE_NAMES.get(widths.pop())
    if metre is None:
        return "无"
    return metre + _FORM_NAMES.get(len(lines), "")


def formal(content):
    """Classify every poem in *content* by metre and line count.

    A poem is split into lines at commas, full stops, question marks,
    exclamation marks, semicolons (full-width or ASCII) and whitespace.
    Previously "。"-terminated sentences were counted instead; each such
    sentence was treated as two lines by the length check, so a four-line
    poem was never labelled 绝句 and an eight-line poem was labelled 绝句.

    Args:
        content: iterable of poem texts; each item is passed through ``str``.

    Returns:
        A list with one label per poem, in input order:
        "五言绝句" / "七言绝句" for four lines of five / seven characters,
        "五言律诗" / "七言律诗" for eight lines of five / seven characters,
        "五言" / "七言" for any other number of such lines,
        and "无" when the lines are not uniformly five or seven characters.
    """
    return [_classify_poem(poem) for poem in content]

from xlrd import open_workbook
from xlutils.copy import copy
# Write the classification results back into the original Excel file.
def write_to(data,file):
    """Store *data* in column 8 of the first sheet of *file*, saving in place.

    The workbook is opened with xlrd, turned into a writable copy with
    xlutils, and saved back to the same path. Row 0 of column 8 receives the
    header "formal"; data[k] is written to row k + 1.
    """
    print(len(data))
    writable = copy(open_workbook(file))
    sheet = writable.get_sheet(0)

    sheet.write(0, 8, "formal")
    for row, value in enumerate(data, start=1):
        sheet.write(row, 8, value)

    writable.save(file)

# Collect the Excel files found under a given folder.
import os
def get_filename(path,filetype):  # takes a folder path and an extension such as '.xlsx'
    """Return the files under *path* whose extension equals *filetype*.

    The folder is walked recursively. Each result is expressed relative to
    *path*, so joining it back onto *path* always yields an existing file.
    A file directly inside *path* is returned as its bare name, as before;
    a nested file now carries its subfolder prefix instead of a bare name
    that could not be resolved by the caller.

    Args:
        path: folder to search.
        filetype: extension including the leading dot, compared exactly.

    Returns:
        A list of relative file paths, in the order ``os.walk`` yields them.
    """
    name = []
    for root, _dirs, files in os.walk(path):
        for entry in files:
            if os.path.splitext(entry)[1] == filetype:
                name.append(os.path.relpath(os.path.join(root, entry), path))
    return name            # list of matching file paths, extension included


if __name__ == '__main__':
    # Folder holding the source spreadsheets.
    file = 'data/'
    for workbook_name in get_filename(file, '.xlsx'):
        newfile = file + workbook_name
        # Read the poem texts, classify each one, then write the labels
        # back into the same workbook.
        poems = read_excel(newfile)
        labels = formal(poems)
        write_to(labels, newfile)

结果展示

明天任务

1.曲牌名筛选出

2.飞花令爬取

3.找出诗句对应的“飞花令”

4.中文分词，试图将诗人个人经历，逐个分段，梳理出这几类关键信息：人物，时间，事件，地点。将文本抽取为规则化的数据格式

posted @ 2021-11-03 22:18 清风紫雪阅读(525) 评论(0) 收藏举报

刷新页面返回顶部

清风紫雪

1103-词牌名，合称，诗词形式

词牌名收集

原网页形式

数据收集

存储形式

诗人合称

原数据网页

数据收集

存储形式

诗词形式

形式分类

数据处理

结果展示

明天任务

公告