机器学习预处理——文献数据

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import random

 

# Plot configuration: use the Microsoft YaHei font for sans-serif text and
# draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams.update({
    'font.sans-serif': 'Microsoft YaHei',
    'axes.unicode_minus': False,
})

# Load the 1990 records. low_memory=False makes pandas read the file in one
# pass instead of inferring column dtypes chunk by chunk.
data = pd.read_csv(
    "D:/01研/03研二/细胞生物学的原文和引文  1990-2010/yuanyin/1990.csv",
    encoding="utf8",
    low_memory=False,
)

# Notebook-style inspection: first rows, column summary, number of records.
data.head()
data.info()
len(data["Article Title"])

填补缺失值

# Text-like columns: replace missing values with the literal string "nan" so
# that later string operations (split / upper / find) can run on every row.
for column in (
    "Author Full Names",
    "Author Keywords",
    "Abstract",
    "ORCIDs",
    "Funding Orgs",
    "Funding Text",
    "Addresses",
    "Publication Date",
    "DOI",
):
    data[column] = data[column].fillna("nan")

# Status-like columns: replace missing values with the string "0".
for column in (
    "Open Access Designations",
    "Highly Cited Status",
    "Hot Paper Status",
):
    data[column] = data[column].fillna("0")

# Row count, reused by the processing steps further down the script.
n_of_rows = len(data["Article Title"])
print(n_of_rows)

根据分号、空格等分隔符提取个数

def _count_parts(column, separator):
    """Return, for each value in *column*, how many pieces *separator* splits it into.

    A missing value -- a real NaN or the "nan" placeholder string used for
    missing text in this script -- counts as 0 pieces rather than 1, so a
    record without keywords is not reported as having one keyword.

    Args:
        column: pandas Series of delimited strings.
        separator: delimiter passed to ``str.split``.

    Returns:
        A Series aligned with *column* holding the piece counts.
    """
    def count(value):
        if pd.isna(value) or value == "nan":
            return 0
        return len(str(value).split(separator))

    return column.map(count)


# Each count is computed for the whole column in one pass instead of writing
# one cell at a time through data.loc, which is slow and only works when the
# index is exactly 0..n-1.

# Number of authors, from the ";"-separated author field
data['Number of authors'] = _count_parts(data["Author Full Names"], ";")
# Number of keywords
data['Number of keywords'] = _count_parts(data["Author Keywords"], ";")
# Length of abstract, in space-separated tokens
data['Length of abstract'] = _count_parts(data["Abstract"], " ")
# Number of WoS categories (this column is never filled beforehand, so real
# NaN values can still be present here; they count as 0)
data['Number of WoS categories'] = _count_parts(data["WoS Categories"], ";")
# Number of addresses
data['Number of Addresses'] = _count_parts(data["Addresses"], ";")

无序类别

# Binary indicator encoding, computed column-wise rather than one cell at a
# time through data.loc.

# Funding: 0 when "Funding Orgs" holds the "nan" missing-value placeholder,
# 1 for any other value.
data['IF Funding'] = (data["Funding Orgs"] != "nan").astype(int)

# Open access: 0 when "Open Access Designations" holds the "0" placeholder,
# 1 for any other value.
data['IF Open'] = (data["Open Access Designations"] != "0").astype(int)

# Notebook-style peek at one raw publication date (row label 1), upper-cased
# the same way the quarter extraction does.
data["Publication Date"][1].upper()

'17-DEC'

# Publication Date -> publication quarter

# Month abbreviations in calendar order, each paired with its quarter.
_MONTH_QUARTERS = (
    ("JAN", 1), ("FEB", 1), ("MAR", 1),
    ("APR", 2), ("MAY", 2), ("JUN", 2),
    ("JUL", 3), ("AUG", 3), ("SEP", 3),
    ("OCT", 4), ("NOV", 4), ("DEC", 4),
)


def _publication_quarter(date_text):
    """Return the quarter (1-4) for a publication-date string, or 0 if no month is found.

    The text is upper-cased once and searched for each three-letter month
    abbreviation in calendar order (JAN first, DEC last); the first
    abbreviation that occurs anywhere in the text decides the quarter. A text
    naming several months therefore resolves to the earliest of them in the
    calendar, regardless of where it appears in the string.

    Args:
        date_text: raw "Publication Date" value; converted with ``str()`` so a
            non-string value yields 0 instead of raising.

    Returns:
        1, 2, 3 or 4 for the matched quarter, 0 when no month is present.
    """
    upper_text = str(date_text).upper()
    for month, quarter in _MONTH_QUARTERS:
        if month in upper_text:
            return quarter
    return 0


data['Publication Quarter'] = data["Publication Date"].map(_publication_quarter)
        

提取被引频次

# Citation counts -> years from 1990 until the first citation

# Per-year citation counts, selected by position (columns 88 through 117).
# NOTE(review): the arithmetic below assumes each of these column labels is a
# year that int() can parse (e.g. "1990") -- confirm against the CSV header.
yearly_citations = data.iloc[:, list(range(88, 118))]
was_cited = yearly_citations > 0

# Label of the first column with a positive count in each row.
# https://blog.csdn.net/weixin_39678451/article/details/110777970
first_cited_year = was_cited.idxmax(axis=1)

# idxmax returns the first column label for a row that has no positive count
# at all, which would make a never-cited record look like one cited in the
# first year. Those rows are left as NaN instead.
data['First citation speed'] = (
    first_cited_year.astype(int) - 1990
).where(was_cited.any(axis=1))
# Save the features
# NOTE(review): `data.iloc[:, []]` selects zero columns, so the CSV written
# below contains only the row index and none of the data columns. The list
# of feature column positions appears to have been left out -- TODO fill it
# in before relying on data_1990.csv.
data_1990 = data.iloc[:,[]]
data_1990.to_csv("data_1990.csv")
posted on 2022-08-22 11:28  cookie的笔记簿  阅读(52)  评论(0)  编辑  收藏  举报