In [1]: import pandas as pd
In [2]: import numpy as np
In [4]: df = pd.DataFrame([[np.nan,3],[3.5,np.nan],[4,5],[np.nan,5.5]],
index=list('abcd'),columns=['one','two'])
In [5]: df
Out[5]:
one two
a NaN 3.0
b 3.5 NaN
c 4.0 5.0
d NaN 5.5
#默认axis=0,沿行方向(对每一列)统计
In [6]: df.sum()
Out[6]:
one 7.5
two 13.5
dtype: float64
In [7]: df.sum(axis=1)
Out[7]:
a 3.0
b 3.5
c 9.0
d 5.5
dtype: float64
#skipna=False表示不跳过np.nan值:只要该行含有NaN,结果即为NaN(默认skipna=True才会跳过NaN)
In [8]: df.mean(axis=1, skipna=False)
Out[8]:
a NaN
b NaN
c 4.5
d NaN
dtype: float64
#idxmax()返回最大值的索引,默认axis=0
In [9]: df.idxmax()
Out[9]:
one c
two d
dtype: object
In [10]: df.idxmax(axis=1)
Out[10]:
a two
b one
c two
d two
dtype: object
#累加,默认axis=0
In [11]: df.cumsum()
Out[11]:
one two
a NaN 3.0
b 3.5 NaN
c 7.5 8.0
d NaN 13.5
In [12]: df.cumsum(axis=1)
Out[12]:
one two
a NaN 3.0
b 3.5 NaN
c 4.0 9.0
d NaN 5.5
#统计描述
In [13]: df.describe()
Out[13]:
one two
count 2.000000 3.000000
mean 3.750000 4.500000
std 0.353553 1.322876
min 3.500000 3.000000
25% 3.625000 4.000000
50% 3.750000 5.000000
75% 3.875000 5.250000
max 4.000000 5.500000
In [16]: obj = pd.Series(['a','a','b','c']*4)
In [17]: obj
Out[17]:
0 a
1 a
2 b
3 c
4 a
5 a
6 b
7 c
8 a
9 a
10 b
11 c
12 a
13 a
14 b
15 c
dtype: object
In [18]: obj.describe()
Out[18]:
count 16
unique 3
top a
freq 8
dtype: object
In [37]: obj = pd.Series(['c','a','d','a','a','b','b','c','c'])
In [38]: obj
Out[38]:
0 c
1 a
2 d
3 a
4 a
5 b
6 b
7 c
8 c
dtype: object
#获取obj中的唯一值(按首次出现的顺序返回)
In [39]: uniques = obj.unique()
In [40]: uniques
Out[40]: array(['c', 'a', 'd', 'b'], dtype=object)
#对obj进行计数
In [41]: obj.value_counts()
Out[41]:
a 3
c 3
b 2
d 1
dtype: int64
In [42]: pd.value_counts(obj.values,sort=False)
Out[42]:
b 2
c 3
a 3
d 1
dtype: int64
In [43]: obj
Out[43]:
0 c
1 a
2 d
3 a
4 a
5 b
6 b
7 c
8 c
dtype: object
#判断obj中的每个元素是否为b或c,返回布尔型Series
In [44]: mask = obj.isin(['b','c'])
In [45]: mask
Out[45]:
0 True
1 False
2 False
3 False
4 False
5 True
6 True
7 True
8 True
dtype: bool
In [46]: obj[mask]
Out[46]:
0 c
5 b
6 b
7 c
8 c
dtype: object
#pd.Index(unique_vals)将unique_vals转换为索引对象,get_indexer(to_match)返回to_match中每个值在unique_vals中的位置
In [47]: to_match = pd.Series(['c','a','b','b','c','a'])
In [48]: unique_vals = pd.Series(['c','b','a'])
In [49]: pd.Index(unique_vals).get_indexer(to_match)
Out[49]: array([0, 2, 1, 1, 0, 2], dtype=int64)
In [50]: data = pd.DataFrame({'Q1':[1,3,4,3,4],'Q2':[2,3,1,2,3],'Q3':[1,5,2,4,4]})
#对每一行或列的元素进行计数,返回唯一元素当做索引,统计数量作为值的表格
In [51]: data
Out[51]:
Q1 Q2 Q3
0 1 2 1
1 3 3 5
2 4 1 2
3 3 2 4
4 4 3 4
In [52]: data.apply(pd.value_counts)
Out[52]:
Q1 Q2 Q3
1 1.0 1.0 1.0
2 NaN 2.0 1.0
3 2.0 2.0 NaN
4 2.0 NaN 2.0
5 NaN NaN 1.0
In [53]: data.apply(pd.value_counts).fillna(0)
Out[53]:
Q1 Q2 Q3
1 1.0 1.0 1.0
2 0.0 2.0 1.0
3 2.0 2.0 0.0
4 2.0 0.0 2.0
5 0.0 0.0 1.0
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 无需6万激活码!GitHub神秘组织3小时极速复刻Manus,手把手教你使用OpenManus搭建本
· Manus爆火,是硬核还是营销?
· 终于写完轮子一部分:tcp代理 了,记录一下
· 别再用vector<bool>了!Google高级工程师:这可能是STL最大的设计失误
· 单元测试从入门到精通