python全栈闯关--pandas
1、导入
import numpy as np
import pandas as pd
2、数据结构
1、Series
# Build a Series from a list; the np.nan entry forces the dtype to float64.
s = pd.Series([1, 2, 3, 4, 5, np.nan, 6, 7])
print(s)
# Expected output:
# 0    1.0
# 1    2.0
# 2    3.0
# 3    4.0
# 4    5.0
# 5    NaN
# 6    6.0
# 7    7.0
# dtype: float64
2、DataFrame
# Six consecutive daily timestamps starting at 2019-01-01, used as row labels.
dates = pd.date_range('20190101', periods=6)

# `index` supplies the row labels, `columns` supplies the column names.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=['a', 'b', 'c', 'd'],
)

# Print the whole frame (the banner used to read '列选取', i.e. "column
# selection", which mislabelled this output).
print('DataFrame'.center(50, '-'))
print(df)
# Sample output (values are random and differ on every run):
#                    a         b         c         d
# 2019-01-01 -1.294464  0.706790 -0.164825 -0.237432
# 2019-01-02 -1.091822  0.824446  0.748465 -0.191267
# 2019-01-03 -0.755218  1.637604 -1.896371 -0.093815
# 2019-01-04 -2.610031 -0.705783 -1.247235 -1.398978
# 2019-01-05 -0.324550  1.014212  1.375527 -0.409117
# 2019-01-06 -0.512911  0.301417  1.227190  0.771551

# Select a single column by name; the result is a Series.
print('列选取'.center(50, '-'))
print(df['c'])
# Sample output:
# 2019-01-01   -0.164825
# 2019-01-02    0.748465
# 2019-01-03   -1.896371
# 2019-01-04   -1.247235
# 2019-01-05    1.375527
# 2019-01-06    1.227190
# Freq: D, Name: c, dtype: float64
3、创建特定数据的DataFrame
# Build a DataFrame from a dict whose values have different shapes and types.
df_1 = pd.DataFrame({
    # A scalar is repeated for every row of the frame.
    'A': 1,
    # A single Timestamp is likewise repeated for every row.
    'B': pd.Timestamp('20190930'),
    # A Series carries its own index, which determines the row labels (0..3).
    'C': pd.Series(2, index=list(range(4)), dtype='float'),
    # Categorical column.
    'D': pd.Categorical([1, 2, 3, 4]),
    # Plain lists must have one element per row.
    'E': ['a', 'b', 'c', 'd'],
    'F': 'beer',
    'G': [1, 5, 4, 4],
})
print(df_1)
# Expected output:
#    A          B    C  D  E     F  G
# 0  1 2019-09-30  2.0  1  a  beer  1
# 1  1 2019-09-30  2.0  2  b  beer  5
# 2  1 2019-09-30  2.0  3  c  beer  4
# 3  1 2019-09-30  2.0  4  d  beer  4
4、DataFrame常用属性及排序
# Inspect the common attributes of df_1 (the 4-row frame with columns A..G
# built in the previous section) and demonstrate the sorting methods.

print('types'.center(50, '-'))
print(df_1.dtypes)  # data type of every column
# A             int64
# B    datetime64[ns]
# C           float64
# D          category
# E            object
# F            object
# G             int64
# dtype: object

print('index'.center(50, '-'))
print(df_1.index, type(df_1.index))
# Sample output from pandas < 2.0 (Int64Index was removed in pandas 2.0,
# where a plain Index with dtype='int64' is printed instead):
# Int64Index([0, 1, 2, 3], dtype='int64') <class 'pandas.core.indexes.numeric.Int64Index'>

print('columns'.center(50, '-'))
print(df_1.columns, type(df_1.columns))
# Index(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='object') <class 'pandas.core.indexes.base.Index'>

print('values'.center(50, '-'))
print(df_1.values, type(df_1.values))  # the data as a NumPy ndarray
# [[1 Timestamp('2019-09-30 00:00:00') 2.0 1 'a' 'beer' 1]
#  [1 Timestamp('2019-09-30 00:00:00') 2.0 2 'b' 'beer' 5]
#  [1 Timestamp('2019-09-30 00:00:00') 2.0 3 'c' 'beer' 4]
#  [1 Timestamp('2019-09-30 00:00:00') 2.0 4 'd' 'beer' 4]] <class 'numpy.ndarray'>

print('describe'.center(50, '-'))
print(df_1.describe())  # summary statistics; the result is a DataFrame
# Sample output for the numeric columns (G is [1, 5, 4, 4]).
# NOTE(review): newer pandas versions may also summarise the datetime
# column B -- confirm against the installed version.
#          A    C         G
# count  4.0  4.0  4.000000   number of non-null values
# mean   1.0  2.0  3.500000   mean
# std    0.0  0.0  1.732051   standard deviation (not the variance)
# min    1.0  2.0  1.000000   minimum
# 25%    1.0  2.0  3.250000   first quartile
# 50%    1.0  2.0  4.000000   median (second quartile)
# 75%    1.0  2.0  4.250000   third quartile
# max    1.0  2.0  5.000000   maximum

print('数据翻转'.center(50, '-'))
print(df_1.T)  # transpose: rows become columns; the result is a DataFrame
# Sample output (middle columns elided by the display width):
#                      0  ...                    3
# A                    1  ...                    1
# B  2019-09-30 00:00:00  ...  2019-09-30 00:00:00
# C                    2  ...                    2
# D                    1  ...                    4
# E                    a  ...                    d
# F                 beer  ...                 beer
# G                    1  ...                    4
#
# [7 rows x 4 columns]

print('数据按照列排序'.center(50, '-'))
# sort_index(axis=0) orders the rows by their index labels;
# sort_index(axis=1) would order the columns by their column names.
# ascending=False gives descending order, ascending=True ascending order.
print('按照index排序'.center(50, '-'))
print(df_1.sort_index(axis=0, ascending=False))
#    A          B    C  D  E     F  G
# 3  1 2019-09-30  2.0  4  d  beer  4
# 2  1 2019-09-30  2.0  3  c  beer  4
# 1  1 2019-09-30  2.0  2  b  beer  5
# 0  1 2019-09-30  2.0  1  a  beer  1

print('G列排序'.center(50, '-'))
print(df_1.sort_values(by='G', ascending=False))  # sort rows by one column
#    A          B    C  D  E     F  G
# 1  1 2019-09-30  2.0  2  b  beer  5
# 2  1 2019-09-30  2.0  3  c  beer  4
# 3  1 2019-09-30  2.0  4  d  beer  4
# 0  1 2019-09-30  2.0  1  a  beer  1

print('G,D列排序'.center(50, '-'))
print(df_1.sort_values(by=['G', 'D'], ascending=False))  # sort rows by several columns
#    A          B    C  D  E     F  G
# 1  1 2019-09-30  2.0  2  b  beer  5
# 3  1 2019-09-30  2.0  4  d  beer  4
# 2  1 2019-09-30  2.0  3  c  beer  4
# 0  1 2019-09-30  2.0  1  a  beer  1

print('数据按照行排序'.center(50, '-'))
index = list(range(4))
col = ['A', 'B', "C", 'D', 'E']
d_sort = pd.DataFrame(np.arange(20).reshape(4, 5), index=index, columns=col)
# print(d_sort)
# With axis=1 the `by` labels refer to rows: the columns are reordered
# according to the values found in rows 1 and 2, in descending order.
print(d_sort.sort_values(by=[1, 2], axis=1, ascending=False))
#     E   D   C   B   A
# 0   4   3   2   1   0
# 1   9   8   7   6   5
# 2  14  13  12  11  10
# 3  19  18  17  16  15
5、选择数据
# Demonstrates the different ways of selecting data from a DataFrame.
# NOTE: despite its name, `index` holds the column names here; the row labels
# are the ten daily timestamps in `dates`.
index = ['A', 'B', "C", 'D', 'E']
dates = pd.date_range('20191001', periods=10)
df = pd.DataFrame(np.random.randn(10, 5), index=dates, columns=index)
# print(df)

# All sample outputs below are illustrative: the data is random, so the
# actual numbers differ on every run.

print('选择某列'.center(50, '-'))
print(df['A'])
# 2019-10-01   -0.595401
# 2019-10-02    1.264714
# 2019-10-03    1.179423
# 2019-10-04   -0.516471
# 2019-10-05    0.891850
# 2019-10-06   -0.011205
# 2019-10-07   -0.206089
# 2019-10-08    0.972745
# 2019-10-09   -0.135309
# 2019-10-10    1.590818
# Freq: D, Name: A, dtype: float64

print('切片选择'.center(50, '-'))
print(df[0:3])  # slice by row position
print(df['2019-10-02':'2019-10-05'])  # slice by index label
#                    A         B         C         D         E
# 2019-10-01 -0.595401  0.337930  0.034220  1.472752 -0.555414
# 2019-10-02  1.264714  0.518856 -1.148349  1.674159 -0.473919
# 2019-10-03  1.179423  2.036095 -0.719042  1.607909  2.659472
#                    A         B         C         D         E
# 2019-10-02  1.264714  0.518856 -1.148349  1.674159 -0.473919
# 2019-10-03  1.179423  2.036095 -0.719042  1.607909  2.659472
# 2019-10-04 -0.516471  1.733509 -0.177231  0.260795 -0.106666
# 2019-10-05  0.891850  0.665301  0.013627 -1.346193  0.222099
# The positional slice [0:3] only returns rows 0-2 (the end is excluded).
# The label slice '2019-10-02':'2019-10-05' returns the complete date range
# (both endpoints are included).

print('按照行精确选择'.center(50, '-'))
print(df.loc['2019-10-02', ['A', 'B']])  # one row by label, restricted to some columns
# A    1.264714
# B    0.518856
# Name: 2019-10-02 00:00:00, dtype: float64

print('行号选择数据'.center(50, '-'))
print(df.iloc[3, 1])
# 1.7335085248615345
# Positions are counted from 0, so this is the 4th row, 2nd column.

print(df.iloc[3:5, 0:2])  # 4th to 5th row, 1st to 2nd column
#                    A         B
# 2019-10-04 -0.516471  1.733509
# 2019-10-05  0.891850  0.665301
# Positional slices start counting at 0; the start is included, the end is not.

print('混合选择'.center(50, '-'))
# Mix positional row selection with label-based column selection.
# DataFrame.ix used to do this but was removed in pandas 1.0; translating the
# row positions to labels and using .loc gives the same result.
print(df.loc[df.index[0:3], ['B', 'C']])
#                    B         C
# 2019-10-01  0.337930  0.034220
# 2019-10-02  0.518856 -1.148349
# 2019-10-03  2.036095 -0.719042

print('条件选择'.center(50, '-'))
print(df[df.A > 0])  # boolean mask: keep the rows whose value in column A is positive
# Sample output (taken from a different random run than the samples above):
#                    A         B         C         D         E
# 2019-10-01  0.391314  0.647378  0.065032 -0.436882 -0.482698
# 2019-10-02  1.742555  0.374014  0.737914  1.708461  0.328336
# 2019-10-03  0.024506 -0.455824 -0.397145  1.523103  1.361226
# 2019-10-04  0.140041 -0.604164 -0.397656 -0.423711 -0.626598
# 2019-10-05  0.027898  0.159293 -1.000558  0.921370 -1.613052
# 2019-10-08  1.411249 -1.292006  0.140944  0.699647 -0.065080
# 2019-10-10  0.306495  0.590515 -0.524972  0.521179 -0.805736