Pandas统计计算

基本的统计方法

Method Description
count Number of non-NA values
describe Compute set of summary statistics for Series or each DataFrame column
min,max Compute minimum and maximum values
argmin,argmax Compute index locations (integers) at which minimum or maximum value obtained, respectively
idxmin,idxmax Compute index labels at which minimum or maximum value obtained, respectively
quantile Compute sample quantile ranging from 0 to 1
sum Sum of values
mean Mean of values
median Arithmetic median (50% quantile) of values
mad Mean absolute deviation from mean value
prod Product of all values
var Sample variance of values
std Sample standard deviation of values
skew Sample skewness (third moment) of values
kurt Sample kurtosis (fourth moment) of values
cumsum Cumulative sum of values
cummin,cummax Cumulative minimum or maximum of values, respectively
cumprod Cumulative product of values
diff Compute First arithmetic diference (useful for time series)
pct_change Compute percent changes
参数 描述
axis 默认axis=0沿行统计,设置axis=1沿列统计
skipna 默认skipna=True忽略NaN值,skipna=False时不忽视
level 用于多层索引
#统计相加
sum(axis=None, skipna=None, level=None, numeric_only=None, **kwargs)
#平均值
mean(axis=None, skipna=None, level=None, numeric_only=None, **kwargs)
#返回索引值
idxmax(axis=0, skipna=True)
#累加
cumsum(axis=None, skipna=True, *args, **kwargs)
#统计描述,默认percentiles=[.25, .5, .75]三个分位数
describe(percentiles=None, include=None, exclude=None)
In [1]: import pandas as pd
In [2]: import numpy as np

In [4]: df = pd.DataFrame([[np.nan,3],[3.5,np.nan],[4,5],[np.nan,5.5]],
                          index=list('abcd'),columns=['one','two'])

In [5]: df
Out[5]:
   one  two
a  NaN  3.0
b  3.5  NaN
c  4.0  5.0
d  NaN  5.5

#默认axis=0,沿行统计
In [6]: df.sum()
Out[6]:
one     7.5
two    13.5
dtype: float64

In [7]: df.sum(axis=1)
Out[7]:
a    3.0
b    3.5
c    9.0
d    5.5
dtype: float64

#skipna=False表示有np.nan值时跳过不统计
In [8]: df.mean(axis=1, skipna=False)
Out[8]:
a    NaN
b    NaN
c    4.5
d    NaN
dtype: float64

#idxmax()返回最大值的索引,默认axis=0
In [9]: df.idxmax()
Out[9]:
one    c
two    d
dtype: object

In [10]: df.idxmax(axis=1)
Out[10]:
a    two
b    one
c    two
d    two
dtype: object

#累加,默认axis=0
In [11]: df.cumsum()
Out[11]:
   one   two
a  NaN   3.0
b  3.5   NaN
c  7.5   8.0
d  NaN  13.5

In [12]: df.cumsum(axis=1)
Out[12]:
   one  two
a  NaN  3.0
b  3.5  NaN
c  4.0  9.0
d  NaN  5.5

#统计描述
In [13]: df.describe()
Out[13]:
            one       two
count  2.000000  3.000000
mean   3.750000  4.500000
std    0.353553  1.322876
min    3.500000  3.000000
25%    3.625000  4.000000
50%    3.750000  5.000000
75%    3.875000  5.250000
max    4.000000  5.500000

In [16]: obj = pd.Series(['a','a','b','c']*4)

In [17]: obj
Out[17]:
0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [18]: obj.describe()
Out[18]:
count     16
unique     3
top        a
freq       8
dtype: object

相关性和协方差

#增长率
pct_change(
    periods=1,
    fill_method='pad',
    limit=None,
    freq=None,
    **kwargs,
)

#相关系数(pearson)
corr(method='pearson', min_periods=1)

corrwith(other, axis=0, drop=False, method='pearson')

#协方差
cov(min_periods=None)
In [25]: volumn = pd.read_pickle('D:\个人文件\学习资料\Python\利用Python进行数
    ...: 据分析\datasets\examples\yahoo_volume.pkl')

In [26]: price
Out[26]:
                  AAPL        GOOG         IBM       MSFT
Date
2010-01-04   27.990226  313.062468  113.304536  25.884104
2010-01-05   28.038618  311.683844  111.935822  25.892466
2010-01-06   27.592626  303.826685  111.208683  25.733566
2010-01-07   27.541619  296.753749  110.823732  25.465944
2010-01-08   27.724725  300.709808  111.935822  25.641571
...                ...         ...         ...        ...
2016-10-17  117.550003  779.960022  154.770004  57.220001
2016-10-18  117.470001  795.260010  150.720001  57.660000
2016-10-19  117.120003  801.500000  151.259995  57.529999
2016-10-20  117.059998  796.969971  151.520004  57.250000
2016-10-21  116.599998  799.369995  149.630005  59.660000

[1714 rows x 4 columns]

In [27]: volumn
Out[27]:
                 AAPL      GOOG       IBM      MSFT
Date
2010-01-04  123432400   3927000   6155300  38409100
2010-01-05  150476200   6031900   6841400  49749600
2010-01-06  138040000   7987100   5605300  58182400
2010-01-07  119282800  12876600   5840600  50559700
2010-01-08  111902700   9483900   4197200  51197400
...               ...       ...       ...       ...
2016-10-17   23624900   1089500   5890400  23830000
2016-10-18   24553500   1995600  12770600  19149500
2016-10-19   20034600    116600   4632900  22878400
2016-10-20   24125800   1734200   4023100  49455600
2016-10-21   22384800   1260500   4401900  79974200

[1714 rows x 4 columns]
                                 
#计算与上一个周期值的偏差率:(现期-基期)/基期
In [28]: returns=price.pct_change()
In [29]: returns
Out[29]:
                AAPL      GOOG       IBM      MSFT
Date
2010-01-04       NaN       NaN       NaN       NaN
2010-01-05  0.001729 -0.004404 -0.012080  0.000323
2010-01-06 -0.015906 -0.025209 -0.006496 -0.006137
2010-01-07 -0.001849 -0.023280 -0.003462 -0.010400
2010-01-08  0.006648  0.013331  0.010035  0.006897
...              ...       ...       ...       ...
2016-10-17 -0.000680  0.001837  0.002072 -0.003483
2016-10-18 -0.000681  0.019616 -0.026168  0.007690
2016-10-19 -0.002979  0.007846  0.003583 -0.002255
2016-10-20 -0.000512 -0.005652  0.001719 -0.004867
2016-10-21 -0.003930  0.003011 -0.012474  0.042096

[1714 rows x 4 columns]

#tail()最后的五条数据,head()开始的五条数据                               
In [30]: returns.tail()
Out[30]:
                AAPL      GOOG       IBM      MSFT
Date
2016-10-17 -0.000680  0.001837  0.002072 -0.003483
2016-10-18 -0.000681  0.019616 -0.026168  0.007690
2016-10-19 -0.002979  0.007846  0.003583 -0.002255
2016-10-20 -0.000512 -0.005652  0.001719 -0.004867
2016-10-21 -0.003930  0.003011 -0.012474  0.042096

#corr()相关系数
In [31]: returns['MSFT'].corr(returns['IBM'])
Out[31]: 0.49976361144151144

#cov()协方差
In [32]: returns['MSFT'].cov(returns['IBM'])
Out[32]: 8.870655479703546e-05

In [33]: returns.MSFT.corr(returns.IBM)
Out[33]: 0.49976361144151144

#相关矩阵
In [34]: returns.corr()
Out[34]:
          AAPL      GOOG       IBM      MSFT
AAPL  1.000000  0.407919  0.386817  0.389695
GOOG  0.407919  1.000000  0.405099  0.465919
IBM   0.386817  0.405099  1.000000  0.499764
MSFT  0.389695  0.465919  0.499764  1.000000

#协方差矩阵
In [35]: returns.cov()
Out[35]:
          AAPL      GOOG       IBM      MSFT
AAPL  0.000277  0.000107  0.000078  0.000095
GOOG  0.000107  0.000251  0.000078  0.000108
IBM   0.000078  0.000078  0.000146  0.000089
MSFT  0.000095  0.000108  0.000089  0.000215
                                 
#与具体的某个对象的相关性系数
In [36]: returns.corrwith(returns['IBM'])
Out[36]:
AAPL    0.386817
GOOG    0.405099
IBM     1.000000
MSFT    0.499764
dtype: float64

Pandas唯一性\计数\隶属关系函数

#唯一性
unique()

#计数
value_counts(normalize=False, sort=True, ascending=False, bins=None, dropna=True,)

#隶属关系判断
isin(values)

#获取匹配获取索引值
pd.Index.get_indexer(self, target, method=None, limit=None, tolerance=None)
In [37]: obj = pd.Series(['c','a','d','a','a','b','b','c','c'])

In [38]: obj
Out[38]:
0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

#判断obj的唯一性
In [39]: uniques = obj.unique()

In [40]: uniques
Out[40]: array(['c', 'a', 'd', 'b'], dtype=object)

#对obj进行计数
In [41]: obj.value_counts()
Out[41]:
a    3
c    3
b    2
d    1
dtype: int64

In [42]: pd.value_counts(obj.values,sort=False)
Out[42]:
b    2
c    3
a    3
d    1
dtype: int64

In [43]: obj
Out[43]:
0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

#判断obj是否包含b或c元素
In [44]: mask = obj.isin(['b','c'])

In [45]: mask
Out[45]:
0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [46]:  obj[mask]
Out[46]:
0    c
5    b
6    b
7    c
8    c
dtype: object

#pd.Index(unique_vals)获取unique_vals的索引,get_indexer(to_match)获取to_match与unique_vals一样值的索引
In [47]: to_match = pd.Series(['c','a','b','b','c','a'])

In [48]: unique_vals = pd.Series(['c','b','a'])

In [49]: pd.Index(unique_vals).get_indexer(to_match)
Out[49]: array([0, 2, 1, 1, 0, 2], dtype=int64)
    
In [50]: data = pd.DataFrame({'Q1':[1,3,4,3,4],'Q2':[2,3,1,2,3],'Q3':[1,5,2,4,4]})

#对每一行或列的元素进行计数,返回唯一元素当做索引,统计数量作为值的表格
In [51]: data
Out[51]:
   Q1  Q2  Q3
0   1   2   1
1   3   3   5
2   4   1   2
3   3   2   4
4   4   3   4

In [52]: data.apply(pd.value_counts)
Out[52]:
    Q1   Q2   Q3
1  1.0  1.0  1.0
2  NaN  2.0  1.0
3  2.0  2.0  NaN
4  2.0  NaN  2.0
5  NaN  NaN  1.0

In [53]: data.apply(pd.value_counts).fillna(0)
Out[53]:
    Q1   Q2   Q3
1  1.0  1.0  1.0
2  0.0  2.0  1.0
3  2.0  2.0  0.0
4  2.0  0.0  2.0
5  0.0  0.0  1.0
posted @ 2022-01-24 16:38  溪奇的数据  阅读(161)  评论(0编辑  收藏  举报