# Python: Pandas study notes
"""Pandas study notes: a linear walkthrough of common DataFrame operations.

Run as a script. Sections cover basic attributes/methods, selection,
assignment, missing data, concatenation, merging and plotting. Results are
printed; plots are drawn only when matplotlib is installed.
"""
import numpy as np
import pandas as pd

try:
    import matplotlib.pyplot as plt
except ImportError:
    # matplotlib is an optional dependency of pandas plotting; without it the
    # plotting calls at the end are skipped instead of crashing the script.
    plt = None

s = pd.Series([1, 3, 6, np.nan, 44, 1])

df = pd.DataFrame(np.random.random((4, 5)))

# Commonly used DataFrame attributes.
# NOTE: bare expressions only display a value in an interactive session;
# when run as a script they are evaluated and the result is discarded.
df.dtypes
df.index
df.columns
df.values

# Commonly used DataFrame methods (results are not stored here).
df.describe()
df.T
df.sort_index(axis=1, ascending=False)
df.sort_values(by=4)

# ---------------------------------------------------------------------------
# Selecting data
# ---------------------------------------------------------------------------
dates = pd.date_range('20160101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates,
                  columns=['A', 'B', 'C', 'D'])

# Rows by a contiguous slice, columns by a list of names.
print(df[0:3])
print(df[['A', 'D']])

# Select by label: .loc
print(df.loc['20160101', :])
print(df.loc[:, ['A', 'B']])

# Select by position: .iloc
print(df.iloc[[0, 2], [0, 3]])

# Mixed selection (row positions + column labels). DataFrame.ix was removed
# in pandas 1.0, so translate the row positions to labels and use .loc.
print(df.loc[df.index[[0, 2]], ['A', 'D']])

# Boolean indexing
print(df[df.B > 5])

# ---------------------------------------------------------------------------
# Setting values
# ---------------------------------------------------------------------------
df.iloc[2, 2] = 111
df.loc['20160101', 'D'] = 222
# A single .loc assignment replaces the chained `df.B[df.A > 5] = 0`, which
# may write to a temporary copy and leave df itself unchanged.
df.loc[df.A > 5, 'B'] = 0
print(df)

df['F'] = np.nan
df['E'] = range(6)
print(df)

# ---------------------------------------------------------------------------
# Handling missing data
# ---------------------------------------------------------------------------
# NaN cannot be stored in an integer column; upcast explicitly rather than
# relying on implicit upcasting during assignment (deprecated in pandas).
df = df.astype({'B': float, 'C': float})
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
print(df)
print(df.dropna(axis=0, how='all'))  # how = {'any', 'all'}
print(df.fillna(value=0))
print(df.isnull().values.any())

# ---------------------------------------------------------------------------
# Combining DataFrames
# ---------------------------------------------------------------------------
# Concatenating
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])

res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
res1 = pd.concat([df1, df2, df3], axis=1)

# The join parameter
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'],
                   index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'],
                   index=[2, 3, 4])

res = pd.concat([df1, df2], join='outer', ignore_index=True)
res = pd.concat([df1, df2], join='inner', ignore_index=True)
print(res)

# Aligning on one frame's index. The join_axes argument was removed in
# pandas 1.0; reindexing the concatenated result on df1.index replaces it.
res = pd.concat([df1, df2], axis=1, join='inner')
res = pd.concat([df1, df2], axis=1).reindex(df1.index)

# Appending rows. DataFrame.append was removed in pandas 2.0; pd.concat
# with the same frames and ignore_index=True is the replacement.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'],
                   index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'],
                   index=[2, 3, 4])
df3 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'],
                   index=[2, 3, 4])

res = pd.concat([df1, df2, df3], ignore_index=True)
res1 = pd.concat([df1, df2, df3])
print(res)
print(res1)

# ---------------------------------------------------------------------------
# Merging DataFrames
# ---------------------------------------------------------------------------
# Merge on one key
left = pd.DataFrame({'key': ['K1', 'K2', 'K3'],
                     'A': [1, 2, 3],
                     'B': [4, 5, 6]})

right = pd.DataFrame({'key': ['K0', 'K1', 'K3'],
                      'A': [11, 43, 53],
                      'D': [12, -1, 0]})
res = pd.merge(left, right, on='key', how='outer')
print(res)

# Merge on two or more keys
left = pd.DataFrame({'key0': ['K1', 'K2', 'K3'],
                     'key1': ['X0', 'X2', 'X3'],
                     'A': [1, 2, 3],
                     'B': [4, 5, 6]})

right = pd.DataFrame({'key0': ['K0', 'K1', 'K3'],
                      'key1': ['X1', 'X0', 'K3'],
                      'A': [11, 43, 53],
                      'D': [12, -1, 0]})
res = pd.merge(left, right, on=['key0', 'key1'], how='outer')
print(res)

# Merge on the index
left = pd.DataFrame({'A': [1, 2, 3],
                     'B': [4, 5, 6]},
                    index=['K0', 'K1', 'K2'])

right = pd.DataFrame({'A': [11, 43, 53],
                      'D': [12, -1, 0]},
                     index=['K1', 'K2', 'K3'])
res = pd.merge(left, right, left_index=True, right_index=True)
print(res)

# Handle overlapping column names with explicit suffixes
left = pd.DataFrame({'key': ['K1', 'K2', 'K3'],
                     'A': [1, 2, 3],
                     'B': [4, 5, 6]})

right = pd.DataFrame({'key': ['K0', 'K1', 'K3'],
                      'A': [11, 43, 53],
                      'B': [12, -1, 0]})
res = pd.merge(left, right, on='key',
               suffixes=('_left', '_right'), how='outer')
print(res)

# ---------------------------------------------------------------------------
# Plotting
# ---------------------------------------------------------------------------
# Series
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
data = data.cumsum()
if plt is not None:
    data.plot()
print(data)

# DataFrame
data = pd.DataFrame(np.random.randn(1000, 4),
                    index=np.arange(1000),
                    columns=list("ABCD"))
print(data.head())
data = data.cumsum()
if plt is not None:
    data.plot()
    # The two scatter series share one Axes; give them distinct legend
    # labels (both were 'Class 2' before, which made the legend ambiguous).
    ax = data.plot.scatter(x='A', y='C',
                           color='Red',
                           label='Class 1')
    data.plot.scatter(x='A', y='B',
                      color='DarkGreen',
                      label='Class 2',
                      ax=ax)
    # Without an explicit show() nothing is displayed when run as a script.
    plt.show()
else:
    print('matplotlib is not installed; skipping plots')