根据某字段分类后,计算excel某几列之和
'''
一个学科
一个语言为一类
一年为单位进行统计
统计发表前五年被引情况
'''
import xlrd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
#data.iloc[10:,111:]
# 设定行名称
# 表字段在第11行
data = pd.read_excel("history/Chinese 1.xls",header=10)
# 多个表,合并一下
data0 = pd.read_excel("history/English 1998 1.xls",header=10)
data1 = pd.read_excel("history/English 1999 1.xls",header=10)
data = data0.append(data1)
print(len(data))
# 预览
# 行,列
# 提取的列在第112列
data[data["Publication Year"]==1994].iloc[0:,111:]
# 行,列
data[data["Publication Year"]==1994].iloc[0:,111+4:111+4+5]
# 年为单位,前五年被引之和 # 循环
sum5 = []
for i in range(0,19):
sum5.append(np.array(data[data["Publication Year"]==1990+i].iloc[0:,111+i:111+i+5].sum(axis=1)).tolist())
print("OK!")
每年为单位,输出该年满足条件的文献的被引频次;
[[],[],[]…]
[[],[],[],[],[]…]
[[],[],[],[],[],[],[]…]
for i in range(len(sum5)):
for j in range(len(sum5[i])):
if sum5[i][j] != 0:
print(sum5[i][j])
names = locals()
sum5 = []
a = []
for i in range(0,19):
names['sum5_' + str(1990+i) ] = np.array(data[data["Publication Year"]==1990+i].iloc[0:,111+i:111+i+5].sum(axis=1)).tolist()
sum5.append(names.get('sum5_' + str(1990+i)))
for j in range(len(names.get('sum5_' + str(1990+i)))):
if names.get('sum5_' + str(1990+i))[j] != 0:
# 输出前五年被引频次之和不为零的年份,及其对应的被引频次数值
print('sum5_' + str(1990+i), names.get('sum5_' + str(1990+i))[j])
# 每年里面有多少篇文献
a.append(names.get('sum5_' + str(1990+i))[j])
print(len(a))
# 新建一个dataframe
df = pd.DataFrame()