数据清洗之从项目简介中提取关键字

具体代码

from jieba import analyse
import pandas as pd
import numpy as np
# Keyword-extraction pipeline: read a CSV, extract keywords from each row's
# project summary with jieba's TF-IDF extractor, store them in the keyword
# column of the same row, and write the result to a new CSV.

INPUT_CSV = "D:/12140/Desktops/111/222/test002.csv"
OUTPUT_CSV = "D:/12140/Desktops/111/222/test003.csv"
SUMMARY_COLUMN = "成果简介_y"
KEYWORD_COLUMN = "关键词"
# Delimiter placed between keywords so they stay individually recoverable.
KEYWORD_SEPARATOR = ","


def extract_keywords(text, extractor=None):
    """Return the list of keywords extracted from one summary cell.

    Args:
        text: The cell value. Missing values (None / NaN) yield an empty list;
            anything else is converted to ``str`` before extraction.
        extractor: Callable taking a string and returning an iterable of
            keywords. Defaults to ``jieba.analyse.extract_tags``.

    Returns:
        A list of keywords in the order the extractor produced them.
    """
    if extractor is None:
        # Resolved lazily so callers that inject an extractor do not need jieba.
        extractor = analyse.extract_tags
    if pd.isna(text):
        return []
    # Some extractors may return None for "nothing found"; normalise to [].
    return list(extractor(str(text)) or [])


def join_keywords(tags, separator=KEYWORD_SEPARATOR):
    """Join every keyword in *tags* (including the first) with *separator*."""
    return separator.join(str(tag) for tag in (tags or ()))


def add_keyword_column(
    frame,
    extractor=None,
    summary_column=SUMMARY_COLUMN,
    keyword_column=KEYWORD_COLUMN,
    separator=KEYWORD_SEPARATOR,
):
    """Return a copy of *frame* whose keyword column is filled from summaries.

    Every row is processed (the first and last rows are not skipped), and the
    values are assigned positionally as a plain list, so row ``k`` always
    receives the keywords extracted from row ``k``'s own summary regardless of
    the frame's index. Rows with a missing summary or no keywords get an empty
    string. The keyword column is created if it does not exist yet.

    Raises:
        KeyError: If *summary_column* is not a column of *frame*.
    """
    result = frame.copy()
    result[keyword_column] = [
        join_keywords(extract_keywords(text, extractor), separator)
        for text in result[summary_column]
    ]
    return result


def main():
    """Read the input CSV, fill the keyword column, and write the output CSV."""
    # Encoding defaults to UTF-8; change it if the file shows garbled text.
    frame = pd.read_csv(INPUT_CSV, encoding="utf-8")
    frame = add_keyword_column(frame)

    print(len(frame))
    print(frame[KEYWORD_COLUMN])

    frame.to_csv(OUTPUT_CSV, index=False)


if __name__ == "__main__":
    main()

效果展示

posted @ 2024-03-06 17:19  yesyes1  阅读(52)  评论(0)  编辑  收藏  举报