# Positive / negative sample split
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
# -----------------------------------------------------------------------------
# Load the dataset and prepare it for modelling.
# -----------------------------------------------------------------------------
# Basic statistics: distribution of 'Cited Reference Count' across all rows.
# (The original script re-read the same CSV a second time below; one read is
# enough — the file and options were identical.)
data = pd.read_csv("1990-2010_11_bofengexp0.csv", encoding="utf8", low_memory=False)
data['Cited Reference Count'].describe()
# Fill missing values:
#  - peak_interval: 9999 acts as a sentinel, presumably "no peak observed"
#    — TODO confirm against the upstream feature-engineering step.
#  - high_quality_*: a missing flag is treated as "not high quality" -> 0.
data["peak_interval"] = data["peak_interval"].fillna(9999)
data["high_quality_1"] = data["high_quality_1"].fillna(0)
data["high_quality_5"] = data["high_quality_5"].fillna(0)
data["high_quality_10"] = data["high_quality_10"].fillna(0)
data.info()
# Class balance of the target — is the dataset imbalanced?
data['high_quality'].value_counts() / len(data)
# How many papers have 0, 1, 2, ... citation peaks.
data['num_peak'].value_counts()
df = data
# Select the feature matrix and the classification target.
# Naming the column list removes the duplicated 300-character inline list and
# the dead commented-out variant the original carried.
FEATURE_COLS = [
    "Number_of_authors", "Number_of_Keywords", "Length_of_Abstract",
    "Number of Pages", "Cited Reference Count", "first_citation_interval",
    "Fluctuation_of_annual_citation", "num_peak", "peak_interval",
    "first_peak", "highest_peak",
]
features = df[FEATURE_COLS].values
targets = df["high_quality"].values

# Variance filtering (2021-10-03): drop quasi-constant columns whose variance
# is below p*(1-p) with p = 0.8 (the Bernoulli-variance heuristic from the
# sklearn user guide).
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
features = sel.fit_transform(features)
features.shape

# Univariate feature selection: keep features whose ANOVA F-score (f_classif)
# falls in the top 90 % — roughly "retain 90 % of the information".
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import chi2, f_classif
sp = SelectPercentile(f_classif, percentile=90)
X_new = sp.fit_transform(features, targets)
X_new.shape
sp.get_support()  # boolean mask of the retained columns
# Pairwise Pearson correlations between feature columns; pairs with |r| > 0.9
# are candidates for removal.
# NOTE(review): `features` has already passed VarianceThreshold, so it may
# hold fewer than the 11 original columns; the label assignment below assumes
# every column survived — confirm, or label from sel.get_support() instead.
corrcoef = np.corrcoef(features.astype('float'), rowvar=False)  # variables are columns
corrcoef.shape
# Label BOTH axes and save the index.  The original wrote index=False and then
# re-read with index_col=0, which silently promoted the first feature column
# to the index and shifted every row/column label by one in the heatmap.
feature_names = [
    "Number_of_authors", "Number_of_Keywords", "Length_of_Abstract",
    "Number of Pages", "Cited Reference Count", "first_citation_interval",
    "Fluctuation_of_annual_citation", "num_peak", "peak_interval",
    "first_peak", "highest_peak",
]
corrcoef = pd.DataFrame(corrcoef, index=feature_names, columns=feature_names)
corrcoef.to_csv('corrcoef.csv', header=True, index=True)
# Reload for visualisation; index_col=0 now restores the row labels exactly.
corrcoef = pd.read_csv("corrcoef.csv", encoding="utf8", low_memory=False, index_col=0)
corrcoef
# Visualise the correlation matrix as an annotated heatmap.
plt.rcParams['font.sans-serif'] = ['SimHei']  # allow Chinese glyphs in labels
plt.rcParams['axes.unicode_minus'] = False    # keep the minus sign renderable
plt.subplots(figsize=(20, 16))                # enlarge the canvas
sns.heatmap(
    corrcoef,
    vmax=1,
    square=True,
    annot=True,
    cmap="YlGnBu",
    xticklabels=True,
    yticklabels=True,
)
# Enlarge the tick labels so feature names stay readable at this figure size.
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
# Train / test split
# Test set: papers published 2008-2010 (the three most recent years).
# A per-year comprehension replaces the three hand-written slices; concat in
# year order preserves the original grouped-by-year row ordering of the file.
test_200810 = pd.concat(
    [data[data["Publication Year"] == year] for year in (2008, 2009, 2010)]
)
# Rebuild a clean 0..n-1 index before writing out.
test_200810.reset_index(drop=True, inplace=True)
test_200810.to_csv("test_200810.csv")
# Training set: papers published 1990-2007.
# range(1990, 2008) replaces eighteen hand-written per-year slices; the
# chronological concat preserves the original grouped-by-year row order.
train_19902007 = pd.concat(
    [data[data["Publication Year"] == year] for year in range(1990, 2008)]
)
# Rebuild a clean 0..n-1 index before writing out.
train_19902007.reset_index(drop=True, inplace=True)
train_19902007.to_csv("train_19902007.csv")