# 文献指标统计 (Bibliometric indicator statistics)
import xlrd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# The three document types ("sanlei") that the detailed statistics focus on.
SANLEI_TYPES = ("Article", "Proceedings Paper", "Review")


def _select_document_types(frame, doc_types=SANLEI_TYPES):
    """Return the rows of *frame* whose ``Document_Type_new`` is in *doc_types*.

    Rows are stacked type by type, in the order given by *doc_types*, which
    matches the row order the former chained ``DataFrame.append`` calls
    produced. ``DataFrame.append`` was removed in pandas 2.0, so
    ``pd.concat`` is used instead.
    """
    return pd.concat(
        [frame[frame["Document_Type_new"] == doc_type] for doc_type in doc_types]
    )


# Load the records; the first row of the file holds the column names.
# Alternative input used previously (Information Science and Library Science):
# data = pd.read_excel("D:/01研/03研二/20220321help/历史 图情 经济 哲学三类文献原文及引文数据 1997-2016/历史 图情 经济 哲学三类文献原文及引文数据/Information science and Library Science/原文/ISLS.xlsx")
data = pd.read_csv(
    "D:/01研/03研二/20220321help/历史 图情 经济 哲学三类文献原文及引文数据 1997-2016/历史 图情 经济 哲学三类文献原文及引文数据/Economics/Economic2007_2016.csv",
    low_memory=False,
)

# Number of authors per record: the author names are split on ";".
# NOTE(review): missing author fields are filled with the placeholder "nan",
# so such records are counted as having exactly one author.
# Language, Number of Pages and Cited Reference Count keep their missing values.
data["Author Full Names"] = data["Author Full Names"].fillna("nan")
data.loc[:, "Number_of_authors"] = data["Author Full Names"].apply(
    lambda names: len(names.split(";"))
)
data.info()

# A record is treated as uncited when its "sum5" column equals 0.
# NOTE(review): "sum5" is presumably a citation count over a fixed window;
# the earlier variant of this script used "Times Cited, All Databases" here.
uncited_mask = data["sum5"] == 0

# Document types, over all records and over uncited records only.
print("文献类型:")
df1 = data["Document_Type_new"].value_counts()
print(df1)

print("未被引文献类型:")
df2 = data[uncited_mask]["Document_Type_new"].value_counts()
print(df2)

# Publication years, over all records and over uncited records only.
print("年度分布:")
df3 = data["Publication Year"].value_counts(sort=False)
print(df3)

print("未被引文献年度分布:")
df4 = data[uncited_mask]["Publication Year"].value_counts(sort=False)
print(df4)

# -------------------------------------------#
# Uncited records (exact match on zero), then the same restricted to the
# three document types.
data0 = data[uncited_mask]
data0_sanlei = _select_document_types(data0)

# All records of the three document types.
data_sanlei = _select_document_types(data)

# -------------------------------------------#
# Publication-year distribution of the three document types.
print("三类文献年度分布:")
df5 = data_sanlei["Publication Year"].value_counts(sort=False)
print(df5)

# Uncited subset of the three document types. The mask is built from
# data_sanlei itself: indexing data_sanlei with a mask computed on `data`
# relies on implicit index realignment and triggers a pandas UserWarning.
# The selected rows and their order are unchanged.
sanlei_uncited = data_sanlei[data_sanlei["sum5"] == 0]

# Publication-year distribution of uncited records in the three types.
print("三类文献零被引年度分布:")
df6 = sanlei_uncited["Publication Year"].value_counts(sort=False)
print(df6)

# Language distribution of the three types.
print("三类文献中语言分布:")
df7 = data_sanlei["Language"].value_counts()
print(df7)

# Language distribution of uncited records in the three types.
print("三类文献中零被引语言分布")
df8 = sanlei_uncited["Language"].value_counts()
print(df8)

# -------------------------------------------#
# Page-count distribution of the three types.
# NOTE(review): pd.cut intervals are right-closed and left-open by default,
# so a value of 0 (or a missing value) falls in no bin and is left out of
# the frequency tables below.
PAGE_BINS = [0, 5, 10, 15, 20, 25, 30, 35, 40, 10000]
PAGE_LABELS = [
    "(0,5]", "(5,10]", "(10,15]", "(15,20]", "(20,25]",
    "(25,30]", "(30,35]", "(35,40]", "(40,10000]",
]

data_sanlei_pages = data_sanlei["Number of Pages"]
print("三类文献页数")
p = len(data_sanlei_pages)
print(p)
a = pd.cut(data_sanlei_pages, PAGE_BINS, labels=PAGE_LABELS)
print("频数分布:")
b = a.value_counts().sort_index()
print(b)

# Same page-count bins, uncited records only.
data0_sanlei_pages = data0_sanlei["Number of Pages"]
a1 = pd.cut(data0_sanlei_pages, PAGE_BINS, labels=PAGE_LABELS)
print("三类文献中零被引文献页数")
b1 = a1.value_counts().sort_index()
print(b1)

# -------------------------------------------#
# Author-count distribution of the three types.
AUTHOR_BINS = [0, 1, 2, 3, 4, 5, 6, 7, 10000]
AUTHOR_LABELS = [
    "(0,1]", "(1,2]", "(2,3]", "(3,4]",
    "(4,5]", "(5,6]", "(6,7]", "(7,10000]",
]

print("三类文献作者")
data_sanlei_authors = data_sanlei["Number_of_authors"]
a2 = pd.cut(data_sanlei_authors, AUTHOR_BINS, labels=AUTHOR_LABELS)
b2 = a2.value_counts().sort_index()
print(b2)

# Same author-count bins, uncited records only.
print("三类文献中零被引文献作者")
data0_sanlei_authors = data0_sanlei["Number_of_authors"]
a3 = pd.cut(data0_sanlei_authors, AUTHOR_BINS, labels=AUTHOR_LABELS)
b3 = a3.value_counts().sort_index()
print(b3)

# -------------------------------------------#
# Cited-reference counts of the three types, for all and for uncited records.
data_sanlei_references = data_sanlei["Cited Reference Count"]
data0_sanlei_references = data0_sanlei["Cited Reference Count"]
# Bin edges 0, 5, ..., 70 plus a catch-all upper edge of 10000, shared by
# both reference-count histograms below.
REFERENCE_BINS = list(range(0, 75, 5)) + [10000]
REFERENCE_LABELS = [
    "(0,5]", "(5,10]", "(10,15]", "(15,20]", "(20,25]",
    "(25,30]", "(30,35]", "(35,40]", "(40,45]", "(45,50]",
    "(50,55]", "(55,60]", "(60,65]", "(65,70]", "(70,]",
]

# Reference-count distribution of the three document types.
# NOTE(review): the printed totals are len() of the binned series, so they
# also count records that fall in no bin (a value of 0 or a missing value);
# the frequency table itself leaves those records out.
print("三类文献参考文献数量分布")
a4 = pd.cut(
    data_sanlei_references,
    bins=REFERENCE_BINS,
    labels=REFERENCE_LABELS,
)
b4 = a4.value_counts().sort_index()
print(b4)
print("total")
d = len(a4)
print(d)

# Reference-count distribution of the uncited records in the three types.
print("三类文献中零被引文献参考文献")
data0_sanlei_references = data0_sanlei["Cited Reference Count"]
a5 = pd.cut(
    data0_sanlei_references,
    bins=REFERENCE_BINS,
    labels=REFERENCE_LABELS,
)
b5 = a5.value_counts().sort_index()
print(b5)
l = len(a5)
print(l)