数据清洗之从项目简介中提取关键词


 

具体代码

from jieba import analyse
import pandas as pd
import numpy as np
# Paths and column names used by the keyword-extraction script.
INPUT_CSV = "D:/12140/Desktops/111/222/test002.csv"  # read as UTF-8; change the encoding in main() if the text is garbled
OUTPUT_CSV = "D:/12140/Desktops/111/222/test003.csv"
SUMMARY_COLUMN = "成果简介_y"  # column holding the free-text project summary
KEYWORD_COLUMN = "关键词"  # column that receives the extracted keywords
KEYWORD_SEPARATOR = ";"  # keeps individual keywords distinguishable in the output cell


def extract_keywords(text, extractor=None, separator=KEYWORD_SEPARATOR):
    """Return the keywords extracted from *text*, joined into one string.

    Args:
        text: The summary cell value. Missing values (``None``/NaN) produce
            an empty string; any other value is converted with ``str()``
            before extraction.
        extractor: Callable taking a string and returning an iterable of
            keywords. Defaults to ``jieba.analyse.extract_tags``.
        separator: String placed between consecutive keywords.

    Returns:
        All keywords returned by the extractor, in the order the extractor
        produced them, joined with *separator*.
    """
    if extractor is None:
        # Resolved at call time so the default follows the file-level jieba import.
        extractor = analyse.extract_tags
    if pd.isna(text):
        # Nothing to analyse for an empty cell; skip the extractor entirely.
        return ""
    return separator.join(str(word) for word in extractor(str(text)))


def main():
    """Read the input CSV, fill the keyword column row by row, and save a copy."""
    df = pd.read_csv(INPUT_CSV, encoding="utf-8")

    # Build the column from the summaries in row order so that every row
    # (including the first and the last) gets the keywords of its own summary.
    df[KEYWORD_COLUMN] = [extract_keywords(text) for text in df[SUMMARY_COLUMN]]

    print(len(df))
    print(df[KEYWORD_COLUMN])

    df.to_csv(OUTPUT_CSV, index=False)


if __name__ == "__main__":
    main()
posted @ yblll  阅读(3)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 单元测试从入门到精通
· 上周热点回顾(3.3-3.9)
· winform 绘制太阳,地球,月球 运作规律
点击右上角即可分享
微信分享提示