python之pandas练习
练习代码:
1 import numpy as np
2 import pandas as pd
3
4 #显示所有列
5 pd.set_option('display.max_columns', None)
6 #显示所有行
7 pd.set_option('display.max_rows', None)
8 #设置value的显示长度为100,默认为50
9 pd.set_option('max_colwidth',100)
10
11 df = pd.DataFrame(pd.read_csv('tags.csv',header=1))
12
13 df = pd.DataFrame(
14 {
15 "id":[1001,1002,1003,1004,1005,1006],
16 "date":pd.date_range('20130102', periods=6),
17 "city":['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
18 "age":[23,44,54,32,34,45],
19 "category":['100-A','100-B','110-A','110-C','210-A','130-F'],
20 "price":[1200.9,np.nan,2133,5433.2,np.nan,4432]
21 },
22 columns =['id','date','city','category','age','price'])
23
24 df1 = pd.DataFrame({
25 "id":[1001,1002,1003,1004,1005,1006,1007,108],
26 "gender":['male','female','male','female','male','female','male','female'],
27 "pay":['Y','N','Y','Y','N','Y','N','Y',],
28 "m-point":[10,12,20,40,40,40,30,20]})
29
30 if __name__ == '__main__':
31 print('********************************')
32 print('一、信息表信息查看')
33 print('********************************')
34 print('查看前2行')
35 print(df.head(2))
36 print('===============================')
37 print('查看后2行')
38 print(df.tail(2))
39 print('===============================')
40 print('维度查看')
41 print(df.shape)
42 print('===============================')
43 print('数据表基本信息(维度、列名称、数据格式、所占空间等)')
44 print(df.info())
45 print('===============================')
46 print('每一列数据的格式')
47 print(df.dtypes)
48 print('===============================')
49 print('category列格式')
50 print(df['category'].dtype)
51 print('===============================')
52 print('空值')
53 print(df.isnull())
54 print('===============================')
55 print('查看price列空值')
56 print(df['price'].isnull())
57 print('===============================')
58 print('查看age列的唯一值')
59 print(df['age'].unique())
60 print('===============================')
61 print('查看数据表的值')
62 print(df.values)
63 print('===============================')
64 print('查看列名称')
65 print(df.columns)
66 print('===============================')
67 print('********************************')
68 print('二、数据表清洗 ')
69 print('********************************')
70 print('使用列prince的均值对NA进行填充')
71 print(df)
72 print(df.fillna(value=100))
73 #fillna()的参数如下,其中value可以取df.median、df.mean
74 #fillna(self, value=None, method=None, axis=None, inplace=False, limit=None, downcast=None, **kwargs)
75 print( df['price'].fillna(df['price'].mean()) )
76 print('===============================')
77 print('清除city字段的字符前后空格,字符的大小写转换')
78 df['city']=df['city'].map(str.strip)
79 #大小写转换(小写lower)
80 df['city'] = df['city'].map(str.upper)
81 print( df )
82 print('===============================')
83 print('更改数据格式')
84 print(df['price'].fillna(value=100).astype('int'))
85 print('===============================')
86 print('更改列名称')
87 print(df.rename(columns={'category': 'category-size'}))
88 print('===============================')
89 print('删除后出现的重复值')
90 print(df['city'].drop_duplicates())
91 print('===============================')
92 print('数据替换')
93 df['city'] = df['city'].replace('SH', 'shanghai')
94 print(df)
95 print('===============================')
96 print('********************************')
97 print('三、数据预处理 ')
98 print('********************************')
99 print('内连接')
100 df_inner=pd.merge(df,df1,how='inner') # 匹配合并,交集
101 print( df_inner )
102 print('===============================')
103 print('左连接')
104 df_left=pd.merge(df,df1,how='left') # 左关联
105 print( df_left )
106 print('===============================')
107 print('右连接')
108 df_right=pd.merge(df,df1,how='right') # 右关联
109 print( df_right )
110 print('===============================')
111 print('外连接')
112 df_outer=pd.merge(df,df1,how='outer') # 外连接
113 print( df_outer )
114 print('===============================')
115 print('设置索引列')
116 df_outer.set_index('id')
117 df_outer = df_outer.sort_values('age') #按age的值排序
118 #df_outer = df_outer.sort_index() #按索引排序
119 print(df_outer)
120 print('===============================')
121 print('如果price列的值>3000,group列显示high,否则显示low')
122 df_outer['group'] = np.where(df_outer['price'] > 3000,'high','low')
123 print(df_outer)
124 print('===============================')
125 print('对复合多个条件的数据进行分组标记')
126 df_inner.loc[(df_inner['city'].map(str.lower) == 'beijing') & (df_inner['price'] >= 4000), 'sign'] = 1
127 print(df_inner)
128 print('===============================')
129 print('对category字段的值依次进行分列,并创建数据表,索引值为df_inner的索引列,列名称为category和size')
130 split = pd.DataFrame(
131 (x.split('-') for x in df_inner['category']),
132 index=df_inner.index,
133 columns=['category','size'])
134 print("split")
135 print(split)
136 print('===============================')
137 print('将完成分裂后的数据表和原df_inner数据表进行匹配')
138 df_inner=pd.merge(df_inner,split,right_index=True, left_index=True)
139 print( df_inner )
140 print('===============================')
141 print('********************************')
142 print('四、数据提取')
143 print('********************************')
144 print('按索引提取单行的数值')
145 print( df_inner.loc[0] )
146 print('===============================')
147 print('按索引提取区域行数值')
148 print( df_inner.iloc[0:2] )
149 print('===============================')
150 print('重设索引')
151 df_inner.reset_index()
152 print('===============================')
153 print('设置日期为索引')
154 df_inner=df_inner.set_index('date')
155 print('===============================')
156 print('提取4日之前的所有数据')
157 print( df_inner[:'2013-01-04'] )
158 print('===============================')
159 print('使用iloc按位置区域提取数据')
160 print(df_inner.iloc[:2,:2]) #冒号前后的数字不再是索引的标签名称,而是数据所在的位置,从0开始,前三行,前两列。
161 print('===============================')
162 print('适应iloc按位置单独提起数据')
163 print( df_inner.iloc[[0,2,5],[4,5]] )#提取第0、2、5行,4、5列
164 print('===============================')
165 print('使用ix按索引标签和位置混合提取数据')
166 print(df_inner.ix[:'2013-01-03',:4] )#2013-01-03号之前,前四列数据
167 print('===============================')
168 print('判断city列的值是否为北京')
169 print(df_inner['city'].isin(['BEIJING']))
170 print('===============================')
171 print('判断city列里是否包含beijing和shanghai,然后将符合条件的数据提取出来')
172 print(df_inner.loc[df_inner['city'].isin(['BEIJING','shanghai'])])
173 print('===============================')
174 print('提取前三个字符,并生成数据表')
175 test = pd.DataFrame(df['category'].str[:3])
176 print(test)
177 print('===============================')
178 print('********************************')
179 print('五、数据筛选')
180 print('********************************')
181 print('找出年龄大于25并且在beijing的记录')
182 print(df.loc[
183 (df['age'] > 25) & (df['city'].str.lower() == 'beijing'),
184 ['id','city','age','category','price']])
185 print('===============================')
186 print('找出年龄大于35或者在beijing的记录')
187 print('===============================')
188 print(df.loc[
189 (df['age'] > 35) | (df['city'].str.lower() == 'beijing'),
190 ['id','city','age','category','price']])
191 print('===============================')
192 print('找出不在beijing的记录,对筛选后的数据按city列进行计数')
193 print('===============================')
194 test = df.loc[
195 (df['city'].str.lower() != 'beijing'),
196 ['id','city','age','category','price']]
197 print(test)
198 print('总数:'+str(test.sort_values(['age'],ascending = False).city.count()))
199 print('===============================')
200 print('使用query函数进行筛选')
201 print('===============================')
202 test = df.query('city == ["BEIJING", "shanghai"]')
203 #注意这里的单引号与双引号,如果同为双引号或单引号会报错
204 print(test)
205 print('===============================')
206 print('对筛选后的结果按prince进行求和')
207 print('===============================')
208 print( df.query('city == ["BEIJING", "shanghai"]').price.sum() )
209 print('===============================')
210 print('********************************')
211 print('六、数据汇总')
212 print('********************************')
213 print('对所有的列进行计数汇总')
214 print('===============================')
215 print( df.groupby('city').count() )
216 print('===============================')
217 print('按城市对id字段进行计数')
218 print('===============================')
219 print( df.groupby('city')['id'].count() )
220 print('===============================')
221 print('对两个字段进行汇总计数')
222 print('===============================')
223 print( df.groupby(['city','age'])['id'].count() )
224 print('===============================')
225 print('对city字段进行汇总,并分别计算prince的合计和均值')
226 print('===============================')
227 print( df.groupby('city')['price'].agg([len,np.sum, np.mean]) )
228 print('===============================')
229 print('********************************')
230 print('七、数据统计')
231 print('********************************')
232 print('简单的数据采样 ')
233 print('===============================')
234 print( df_inner.sample(n=3) )
235 print('===============================')
236 print('手动设置采样权重 ')
237 print('===============================')
238 weights = [0, 0.1, 0.7, 0, 0.2, 0.1]
239 print( df.sample(n=2, weights=weights) )
240 print('===============================')
241 print('采样后不放回')
242 print('===============================')
243 print( df.sample(n=6, replace=False) )
244 print('===============================')
245 print('采样后放回')
246 print('===============================')
247 print( df.sample(n=6, replace=True) )
248 print('===============================')
249 print('数据表描述性统计')
250 print('===============================')
251 #round函数设置显示小数位,T表示转置
252 print( df.describe().round(2).T )
253 print('===============================')
254 print('计算列的标准差')
255 print('===============================')
256 print( df['price'].std() )
257 print('===============================')
258 print('计算两个字段间的协方差')
259 print('===============================')
260 print( df['price'].cov(df_inner['age']) )
261 print('===============================')
262 print('数据表中所有字段间的协方差')
263 print('===============================')
264 print( df.cov() )
265 print('===============================')
266 print('两个字段的相关性分析')
267 print('==============================='