python之pandas练习

练习代码:

  1 import numpy as np
  2 import pandas as pd
  3 
  4 #显示所有列
  5 pd.set_option('display.max_columns', None)
  6 #显示所有行
  7 pd.set_option('display.max_rows', None)
  8 #设置value的显示长度为100,默认为50
  9 pd.set_option('max_colwidth',100)
 10 
 11 df = pd.DataFrame(pd.read_csv('tags.csv',header=1))
 12 
 13 df = pd.DataFrame(
 14     {
 15         "id":[1001,1002,1003,1004,1005,1006],
 16         "date":pd.date_range('20130102', periods=6),
 17         "city":['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
 18         "age":[23,44,54,32,34,45],
 19         "category":['100-A','100-B','110-A','110-C','210-A','130-F'],
 20         "price":[1200.9,np.nan,2133,5433.2,np.nan,4432]
 21     },
 22     columns =['id','date','city','category','age','price'])
 23 
 24 df1 = pd.DataFrame({
 25         "id":[1001,1002,1003,1004,1005,1006,1007,108],
 26         "gender":['male','female','male','female','male','female','male','female'],
 27         "pay":['Y','N','Y','Y','N','Y','N','Y',],
 28         "m-point":[10,12,20,40,40,40,30,20]})
 29 
 30 if __name__ == '__main__':
 31     print('********************************')
 32     print('一、信息表信息查看')
 33     print('********************************')
 34     print('查看前2行')
 35     print(df.head(2))
 36     print('===============================')
 37     print('查看后2行')
 38     print(df.tail(2))
 39     print('===============================')
 40     print('维度查看')
 41     print(df.shape)
 42     print('===============================')
 43     print('数据表基本信息(维度、列名称、数据格式、所占空间等)')
 44     print(df.info())
 45     print('===============================')
 46     print('每一列数据的格式')
 47     print(df.dtypes)
 48     print('===============================')
 49     print('category列格式')
 50     print(df['category'].dtype)
 51     print('===============================')
 52     print('空值')
 53     print(df.isnull())
 54     print('===============================')
 55     print('查看price列空值')
 56     print(df['price'].isnull())
 57     print('===============================')
 58     print('查看age列的唯一值')
 59     print(df['age'].unique())
 60     print('===============================')
 61     print('查看数据表的值')
 62     print(df.values)
 63     print('===============================')
 64     print('查看列名称')
 65     print(df.columns)
 66     print('===============================')
 67     print('********************************')
 68     print('二、数据表清洗 ')
 69     print('********************************')
 70     print('使用列prince的均值对NA进行填充')
 71     print(df)
 72     print(df.fillna(value=100))    
 73     #fillna()的参数如下,其中value可以取df.median、df.mean
 74     #fillna(self, value=None, method=None, axis=None, inplace=False, limit=None, downcast=None, **kwargs)
 75     print( df['price'].fillna(df['price'].mean()) )
 76     print('===============================')
 77     print('清除city字段的字符前后空格,字符的大小写转换')
 78     df['city']=df['city'].map(str.strip)
 79     #大小写转换(小写lower)
 80     df['city'] = df['city'].map(str.upper)
 81     print( df )
 82     print('===============================')
 83     print('更改数据格式')
 84     print(df['price'].fillna(value=100).astype('int'))
 85     print('===============================')
 86     print('更改列名称')
 87     print(df.rename(columns={'category': 'category-size'}))
 88     print('===============================')
 89     print('删除后出现的重复值')
 90     print(df['city'].drop_duplicates())
 91     print('===============================')
 92     print('数据替换')
 93     df['city'] = df['city'].replace('SH', 'shanghai')
 94     print(df)
 95     print('===============================')
 96     print('********************************')
 97     print('三、数据预处理 ')
 98     print('********************************')
 99     print('内连接')
100     df_inner=pd.merge(df,df1,how='inner')   # 匹配合并,交集
101     print( df_inner )
102     print('===============================')
103     print('左连接')
104     df_left=pd.merge(df,df1,how='left')     # 左关联
105     print( df_left )
106     print('===============================')
107     print('右连接')
108     df_right=pd.merge(df,df1,how='right')   # 右关联
109     print( df_right )
110     print('===============================')
111     print('外连接')
112     df_outer=pd.merge(df,df1,how='outer')   # 外连接
113     print( df_outer )
114     print('===============================')
115     print('设置索引列')
116     df_outer.set_index('id')
117     df_outer = df_outer.sort_values('age')  #按age的值排序
118     #df_outer = df_outer.sort_index()  #按索引排序
119     print(df_outer)
120     print('===============================')
121     print('如果price列的值>3000,group列显示high,否则显示low')
122     df_outer['group'] = np.where(df_outer['price'] > 3000,'high','low')
123     print(df_outer)
124     print('===============================')
125     print('对复合多个条件的数据进行分组标记')
126     df_inner.loc[(df_inner['city'].map(str.lower) == 'beijing') & (df_inner['price'] >= 4000), 'sign'] = 1
127     print(df_inner)
128     print('===============================')
129     print('对category字段的值依次进行分列,并创建数据表,索引值为df_inner的索引列,列名称为category和size')
130     split = pd.DataFrame(
131         (x.split('-') for x in df_inner['category']),
132         index=df_inner.index,
133         columns=['category','size'])
134     print("split")
135     print(split)
136     print('===============================')
137     print('将完成分裂后的数据表和原df_inner数据表进行匹配')
138     df_inner=pd.merge(df_inner,split,right_index=True, left_index=True)
139     print( df_inner )
140     print('===============================')
141     print('********************************')
142     print('四、数据提取')
143     print('********************************')
144     print('按索引提取单行的数值')
145     print( df_inner.loc[0] )
146     print('===============================')
147     print('按索引提取区域行数值')
148     print( df_inner.iloc[0:2] )
149     print('===============================')
150     print('重设索引')
151     df_inner.reset_index()
152     print('===============================')
153     print('设置日期为索引')
154     df_inner=df_inner.set_index('date')
155     print('===============================')
156     print('提取4日之前的所有数据')
157     print( df_inner[:'2013-01-04'] )
158     print('===============================')
159     print('使用iloc按位置区域提取数据')
160     print(df_inner.iloc[:2,:2]) #冒号前后的数字不再是索引的标签名称,而是数据所在的位置,从0开始,前三行,前两列。
161     print('===============================')
162     print('适应iloc按位置单独提起数据')
163     print( df_inner.iloc[[0,2,5],[4,5]]  )#提取第0、2、5行,4、5列
164     print('===============================')
165     print('使用ix按索引标签和位置混合提取数据')
166     print(df_inner.ix[:'2013-01-03',:4] )#2013-01-03号之前,前四列数据
167     print('===============================')
168     print('判断city列的值是否为北京')
169     print(df_inner['city'].isin(['BEIJING']))
170     print('===============================')
171     print('判断city列里是否包含beijing和shanghai,然后将符合条件的数据提取出来')
172     print(df_inner.loc[df_inner['city'].isin(['BEIJING','shanghai'])])
173     print('===============================')
174     print('提取前三个字符,并生成数据表')
175     test = pd.DataFrame(df['category'].str[:3])
176     print(test)
177     print('===============================')
178     print('********************************')
179     print('五、数据筛选')
180     print('********************************')
181     print('找出年龄大于25并且在beijing的记录')
182     print(df.loc[
183         (df['age'] > 25) & (df['city'].str.lower() == 'beijing'),
184         ['id','city','age','category','price']])
185     print('===============================')
186     print('找出年龄大于35或者在beijing的记录')
187     print('===============================')
188     print(df.loc[
189         (df['age'] > 35) | (df['city'].str.lower() == 'beijing'),
190         ['id','city','age','category','price']])
191     print('===============================')
192     print('找出不在beijing的记录,对筛选后的数据按city列进行计数')
193     print('===============================')
194     test = df.loc[
195             (df['city'].str.lower() != 'beijing'),
196             ['id','city','age','category','price']]
197     print(test)
198     print('总数:'+str(test.sort_values(['age'],ascending = False).city.count()))
199     print('===============================')
200     print('使用query函数进行筛选')
201     print('===============================')
202     test = df.query('city == ["BEIJING", "shanghai"]')
203     #注意这里的单引号与双引号,如果同为双引号或单引号会报错
204     print(test)
205     print('===============================')
206     print('对筛选后的结果按prince进行求和')
207     print('===============================')
208     print( df.query('city == ["BEIJING", "shanghai"]').price.sum() )
209     print('===============================')
210     print('********************************')
211     print('六、数据汇总')
212     print('********************************')
213     print('对所有的列进行计数汇总')
214     print('===============================')
215     print( df.groupby('city').count() )
216     print('===============================')
217     print('按城市对id字段进行计数')
218     print('===============================')
219     print( df.groupby('city')['id'].count() )
220     print('===============================')
221     print('对两个字段进行汇总计数')
222     print('===============================')
223     print( df.groupby(['city','age'])['id'].count() )
224     print('===============================')
225     print('对city字段进行汇总,并分别计算prince的合计和均值')
226     print('===============================')
227     print( df.groupby('city')['price'].agg([len,np.sum, np.mean]) )
228     print('===============================')
229     print('********************************')
230     print('七、数据统计')
231     print('********************************')
232     print('简单的数据采样 ')
233     print('===============================')
234     print( df_inner.sample(n=3) )
235     print('===============================')
236     print('手动设置采样权重 ')
237     print('===============================')
238     weights = [0, 0.1, 0.7, 0, 0.2, 0.1]
239     print( df.sample(n=2, weights=weights) )
240     print('===============================')
241     print('采样后不放回')
242     print('===============================')
243     print( df.sample(n=6, replace=False) )
244     print('===============================')
245     print('采样后放回')
246     print('===============================')
247     print( df.sample(n=6, replace=True) )
248     print('===============================')
249     print('数据表描述性统计')
250     print('===============================')
251     #round函数设置显示小数位,T表示转置
252     print( df.describe().round(2).T )
253     print('===============================')
254     print('计算列的标准差')
255     print('===============================')
256     print( df['price'].std() )
257     print('===============================')
258     print('计算两个字段间的协方差')
259     print('===============================')
260     print( df['price'].cov(df_inner['age'])  )
261     print('===============================')
262     print('数据表中所有字段间的协方差')
263     print('===============================')
264     print( df.cov() )
265     print('===============================')
266     print('两个字段的相关性分析')
267     print('===============================')
268     ##相关系数在-1到1之间,接近1为正相关,接近-1为负相关,0为不相关
269     print( df['price'].corr(df_inner['age']) )
270     print('===============================')
271     print('数据表的相关性分析')
272     print('===============================')
273     print( df.corr() )
274     print('===============================')
275     print('********************************')
276     print('八、数据输出')
277     print('********************************')
278     print('数据导出:写入Excel')
279     #此处不知道什么原因,将内容写入到.xlsx文件中一直存在问题
280     df.to_excel('excel_to_python.xls',sheet_name='test1')
281     print('写入成功')
282     print('===============================')
283     print('数据导出:写入到CSV')
284     df.to_csv('excel_to_python.csv')
285     print('写入成功')
286     print('===============================')

运行结果:

********************************
一、信息表信息查看
********************************
查看前2行
df.head(2)
     id       date      city category  age   price
0  1001 2013-01-02  Beijing     100-A   23  1200.9
1  1002 2013-01-03        SH    100-B   44     NaN
===============================
查看后2行
df.tail(2)
     id       date      city category  age   price
4  1005 2013-01-06  shanghai    210-A   34     NaN
5  1006 2013-01-07  BEIJING     130-F   45  4432.0
===============================
维度查看
(6, 6)
===============================
数据表基本信息(维度、列名称、数据格式、所占空间等)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 6 columns):
id          6 non-null int64
date        6 non-null datetime64[ns]
city        6 non-null object
category    6 non-null object
age         6 non-null int64
price       4 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 276.0+ bytes
None
===============================
每一列数据的格式
id                   int64
date        datetime64[ns]
city                object
category            object
age                  int64
price              float64
dtype: object
===============================
category列格式
object
===============================
空值
      id   date   city  category    age  price
0  False  False  False     False  False  False
1  False  False  False     False  False   True
2  False  False  False     False  False  False
3  False  False  False     False  False  False
4  False  False  False     False  False   True
5  False  False  False     False  False  False
===============================
查看price列空值
0    False
1     True
2    False
3    False
4     True
5    False
Name: price, dtype: bool
===============================
查看age列的唯一值
[23 44 54 32 34 45]
===============================
查看数据表的值
[[1001L Timestamp('2013-01-02 00:00:00') 'Beijing ' '100-A' 23L 1200.9]
 [1002L Timestamp('2013-01-03 00:00:00') 'SH' '100-B' 44L nan]
 [1003L Timestamp('2013-01-04 00:00:00') ' guangzhou ' '110-A' 54L 2133.0]
 [1004L Timestamp('2013-01-05 00:00:00') 'Shenzhen' '110-C' 32L 5433.2]
 [1005L Timestamp('2013-01-06 00:00:00') 'shanghai' '210-A' 34L nan]
 [1006L Timestamp('2013-01-07 00:00:00') 'BEIJING ' '130-F' 45L 4432.0]]
===============================
查看列名称
Index([u'id', u'date', u'city', u'category', u'age', u'price'], dtype='object')
===============================
********************************
二、数据表清洗 
********************************
使用列prince的均值对NA进行填充
     id       date         city category  age   price
0  1001 2013-01-02     Beijing     100-A   23  1200.9
1  1002 2013-01-03           SH    100-B   44     NaN
2  1003 2013-01-04   guangzhou     110-A   54  2133.0
3  1004 2013-01-05     Shenzhen    110-C   32  5433.2
4  1005 2013-01-06     shanghai    210-A   34     NaN
5  1006 2013-01-07     BEIJING     130-F   45  4432.0
     id       date         city category  age   price
0  1001 2013-01-02     Beijing     100-A   23  1200.9
1  1002 2013-01-03           SH    100-B   44   100.0
2  1003 2013-01-04   guangzhou     110-A   54  2133.0
3  1004 2013-01-05     Shenzhen    110-C   32  5433.2
4  1005 2013-01-06     shanghai    210-A   34   100.0
5  1006 2013-01-07     BEIJING     130-F   45  4432.0
0    1200.900
1    3299.775
2    2133.000
3    5433.200
4    3299.775
5    4432.000
Name: price, dtype: float64
===============================
清除city字段的字符前后空格,字符的大小写转换
     id       date       city category  age   price
0  1001 2013-01-02    BEIJING    100-A   23  1200.9
1  1002 2013-01-03         SH    100-B   44     NaN
2  1003 2013-01-04  GUANGZHOU    110-A   54  2133.0
3  1004 2013-01-05   SHENZHEN    110-C   32  5433.2
4  1005 2013-01-06   SHANGHAI    210-A   34     NaN
5  1006 2013-01-07    BEIJING    130-F   45  4432.0
===============================
更改数据格式
0    1200
1     100
2    2133
3    5433
4     100
5    4432
Name: price, dtype: int32
===============================
更改列名称
     id       date       city category-size  age   price
0  1001 2013-01-02    BEIJING         100-A   23  1200.9
1  1002 2013-01-03         SH         100-B   44     NaN
2  1003 2013-01-04  GUANGZHOU         110-A   54  2133.0
3  1004 2013-01-05   SHENZHEN         110-C   32  5433.2
4  1005 2013-01-06   SHANGHAI         210-A   34     NaN
5  1006 2013-01-07    BEIJING         130-F   45  4432.0
===============================
删除后出现的重复值
0      BEIJING
1           SH
2    GUANGZHOU
3     SHENZHEN
4     SHANGHAI
Name: city, dtype: object
===============================
数据替换
     id       date       city category  age   price
0  1001 2013-01-02    BEIJING    100-A   23  1200.9
1  1002 2013-01-03   shanghai    100-B   44     NaN
2  1003 2013-01-04  GUANGZHOU    110-A   54  2133.0
3  1004 2013-01-05   SHENZHEN    110-C   32  5433.2
4  1005 2013-01-06   SHANGHAI    210-A   34     NaN
5  1006 2013-01-07    BEIJING    130-F   45  4432.0
===============================
********************************
三、数据预处理 
********************************
内连接
     id       date       city category  age   price  gender  m-point pay
0  1001 2013-01-02    BEIJING    100-A   23  1200.9    male       10   Y
1  1002 2013-01-03   shanghai    100-B   44     NaN  female       12   N
2  1003 2013-01-04  GUANGZHOU    110-A   54  2133.0    male       20   Y
3  1004 2013-01-05   SHENZHEN    110-C   32  5433.2  female       40   Y
4  1005 2013-01-06   SHANGHAI    210-A   34     NaN    male       40   N
5  1006 2013-01-07    BEIJING    130-F   45  4432.0  female       40   Y
===============================
左连接
     id       date       city category  age   price  gender  m-point pay
0  1001 2013-01-02    BEIJING    100-A   23  1200.9    male       10   Y
1  1002 2013-01-03   shanghai    100-B   44     NaN  female       12   N
2  1003 2013-01-04  GUANGZHOU    110-A   54  2133.0    male       20   Y
3  1004 2013-01-05   SHENZHEN    110-C   32  5433.2  female       40   Y
4  1005 2013-01-06   SHANGHAI    210-A   34     NaN    male       40   N
5  1006 2013-01-07    BEIJING    130-F   45  4432.0  female       40   Y
===============================
右连接
     id       date       city category   age   price  gender  m-point pay
0  1001 2013-01-02    BEIJING    100-A  23.0  1200.9    male       10   Y
1  1002 2013-01-03   shanghai    100-B  44.0     NaN  female       12   N
2  1003 2013-01-04  GUANGZHOU    110-A  54.0  2133.0    male       20   Y
3  1004 2013-01-05   SHENZHEN    110-C  32.0  5433.2  female       40   Y
4  1005 2013-01-06   SHANGHAI    210-A  34.0     NaN    male       40   N
5  1006 2013-01-07    BEIJING    130-F  45.0  4432.0  female       40   Y
6  1007        NaT        NaN      NaN   NaN     NaN    male       30   N
7   108        NaT        NaN      NaN   NaN     NaN  female       20   Y
===============================
外连接
     id       date       city category   age   price  gender  m-point pay
0  1001 2013-01-02    BEIJING    100-A  23.0  1200.9    male       10   Y
1  1002 2013-01-03   shanghai    100-B  44.0     NaN  female       12   N
2  1003 2013-01-04  GUANGZHOU    110-A  54.0  2133.0    male       20   Y
3  1004 2013-01-05   SHENZHEN    110-C  32.0  5433.2  female       40   Y
4  1005 2013-01-06   SHANGHAI    210-A  34.0     NaN    male       40   N
5  1006 2013-01-07    BEIJING    130-F  45.0  4432.0  female       40   Y
6  1007        NaT        NaN      NaN   NaN     NaN    male       30   N
7   108        NaT        NaN      NaN   NaN     NaN  female       20   Y
===============================
设置索引列
     id       date       city category   age   price  gender  m-point pay
0  1001 2013-01-02    BEIJING    100-A  23.0  1200.9    male       10   Y
3  1004 2013-01-05   SHENZHEN    110-C  32.0  5433.2  female       40   Y
4  1005 2013-01-06   SHANGHAI    210-A  34.0     NaN    male       40   N
1  1002 2013-01-03   shanghai    100-B  44.0     NaN  female       12   N
5  1006 2013-01-07    BEIJING    130-F  45.0  4432.0  female       40   Y
2  1003 2013-01-04  GUANGZHOU    110-A  54.0  2133.0    male       20   Y
6  1007        NaT        NaN      NaN   NaN     NaN    male       30   N
7   108        NaT        NaN      NaN   NaN     NaN  female       20   Y
===============================
如果price列的值>3000,group列显示high,否则显示low
     id       date       city category  ...    gender  m-point pay  group
0  1001 2013-01-02    BEIJING    100-A  ...      male       10   Y    low
3  1004 2013-01-05   SHENZHEN    110-C  ...    female       40   Y   high
4  1005 2013-01-06   SHANGHAI    210-A  ...      male       40   N    low
1  1002 2013-01-03   shanghai    100-B  ...    female       12   N    low
5  1006 2013-01-07    BEIJING    130-F  ...    female       40   Y   high
2  1003 2013-01-04  GUANGZHOU    110-A  ...      male       20   Y    low
6  1007        NaT        NaN      NaN  ...      male       30   N    low
7   108        NaT        NaN      NaN  ...    female       20   Y    low

[8 rows x 10 columns]
===============================
对复合多个条件的数据进行分组标记
     id       date       city category  ...   gender  m-point pay  sign
0  1001 2013-01-02    BEIJING    100-A  ...     male       10   Y   NaN
1  1002 2013-01-03   shanghai    100-B  ...   female       12   N   NaN
2  1003 2013-01-04  GUANGZHOU    110-A  ...     male       20   Y   NaN
3  1004 2013-01-05   SHENZHEN    110-C  ...   female       40   Y   NaN
4  1005 2013-01-06   SHANGHAI    210-A  ...     male       40   N   NaN
5  1006 2013-01-07    BEIJING    130-F  ...   female       40   Y   1.0

[6 rows x 10 columns]
===============================
对category字段的值依次进行分列,并创建数据表,索引值为df_inner的索引列,列名称为category和size
split
  category size
0      100    A
1      100    B
2      110    A
3      110    C
4      210    A
5      130    F
===============================
将完成分裂后的数据表和原df_inner数据表进行匹配
     id       date       city category_x  ...   pay  sign category_y  size
0  1001 2013-01-02    BEIJING      100-A  ...     Y   NaN        100     A
1  1002 2013-01-03   shanghai      100-B  ...     N   NaN        100     B
2  1003 2013-01-04  GUANGZHOU      110-A  ...     Y   NaN        110     A
3  1004 2013-01-05   SHENZHEN      110-C  ...     Y   NaN        110     C
4  1005 2013-01-06   SHANGHAI      210-A  ...     N   NaN        210     A
5  1006 2013-01-07    BEIJING      130-F  ...     Y   1.0        130     F

[6 rows x 12 columns]
===============================
********************************
四、数据提取
********************************
按索引提取单行的数值
id                           1001
date          2013-01-02 00:00:00
city                      BEIJING
category_x                  100-A
age                            23
price                      1200.9
gender                       male
m-point                        10
pay                             Y
sign                          NaN
category_y                    100
size                            A
Name: 0, dtype: object
===============================
按索引提取区域行数值
     id       date      city category_x  ...   pay  sign category_y  size
0  1001 2013-01-02   BEIJING      100-A  ...     Y   NaN        100     A
1  1002 2013-01-03  shanghai      100-B  ...     N   NaN        100     B

[2 rows x 12 columns]
===============================
重设索引
===============================
设置日期为索引
===============================
提取4日之前的所有数据
              id       city category_x  age ...   pay sign  category_y size
date                                        ...                            
2013-01-02  1001    BEIJING      100-A   23 ...     Y  NaN         100    A
2013-01-03  1002   shanghai      100-B   44 ...     N  NaN         100    B
2013-01-04  1003  GUANGZHOU      110-A   54 ...     Y  NaN         110    A

[3 rows x 11 columns]
===============================
使用iloc按位置区域提取数据
              id      city
date                      
2013-01-02  1001   BEIJING
2013-01-03  1002  shanghai
===============================
适应iloc按位置单独提起数据
             price  gender
date                      
2013-01-02  1200.9    male
2013-01-04  2133.0    male
2013-01-07  4432.0  female
===============================
使用ix按索引标签和位置混合提取数据
              id      city category_x  age
date                                      
2013-01-02  1001   BEIJING      100-A   23
2013-01-03  1002  shanghai      100-B   44
===============================
判断city列的值是否为北京
date
2013-01-02     True
2013-01-03    False
2013-01-04    False
2013-01-05    False
2013-01-06    False
2013-01-07     True
Name: city, dtype: bool
===============================
判断city列里是否包含beijing和shanghai,然后将符合条件的数据提取出来
              id      city category_x  age ...   pay sign  category_y size
date                                       ...                            
2013-01-02  1001   BEIJING      100-A   23 ...     Y  NaN         100    A
2013-01-03  1002  shanghai      100-B   44 ...     N  NaN         100    B
2013-01-07  1006   BEIJING      130-F   45 ...     Y  1.0         130    F

[3 rows x 11 columns]
===============================
提取前三个字符,并生成数据表
  category
0      100
1      100
2      110
3      110
4      210
5      130
===============================
********************************
五、数据筛选
********************************
找出年龄大于25并且在beijing的记录
     id     city  age category   price
5  1006  BEIJING   45    130-F  4432.0
===============================
找出年龄大于35或者在beijing的记录
===============================
     id       city  age category   price
0  1001    BEIJING   23    100-A  1200.9
1  1002   shanghai   44    100-B     NaN
2  1003  GUANGZHOU   54    110-A  2133.0
5  1006    BEIJING   45    130-F  4432.0
===============================
找出不在beijing的记录,对筛选后的数据按city列进行计数
===============================
     id       city  age category   price
1  1002   shanghai   44    100-B     NaN
2  1003  GUANGZHOU   54    110-A  2133.0
3  1004   SHENZHEN   32    110-C  5433.2
4  1005   SHANGHAI   34    210-A     NaN
总数:4
===============================
使用query函数进行筛选
===============================
     id       date      city category  age   price
0  1001 2013-01-02   BEIJING    100-A   23  1200.9
1  1002 2013-01-03  shanghai    100-B   44     NaN
5  1006 2013-01-07   BEIJING    130-F   45  4432.0
===============================
对筛选后的结果按prince进行求和
===============================
5632.9
===============================
********************************
六、数据汇总
********************************
对所有的列进行计数汇总
===============================
           id  date  category  age  price
city                                     
BEIJING     2     2         2    2      2
GUANGZHOU   1     1         1    1      1
SHANGHAI    1     1         1    1      0
SHENZHEN    1     1         1    1      1
shanghai    1     1         1    1      0
===============================
按城市对id字段进行计数
===============================
city
BEIJING      2
GUANGZHOU    1
SHANGHAI     1
SHENZHEN     1
shanghai     1
Name: id, dtype: int64
===============================
对两个字段进行汇总计数
===============================
city       age
BEIJING    23     1
           45     1
GUANGZHOU  54     1
SHANGHAI   34     1
SHENZHEN   32     1
shanghai   44     1
Name: id, dtype: int64
===============================
对city字段进行汇总,并分别计算prince的合计和均值
===============================
           len     sum     mean
city                           
BEIJING    2.0  5632.9  2816.45
GUANGZHOU  1.0  2133.0  2133.00
SHANGHAI   1.0     0.0      NaN
SHENZHEN   1.0  5433.2  5433.20
shanghai   1.0     0.0      NaN
===============================
********************************
七、数据统计
********************************
简单的数据采样 
===============================
              id       city category_x  age ...   pay sign  category_y size
date                                        ...                            
2013-01-07  1006    BEIJING      130-F   45 ...     Y  1.0         130    F
2013-01-02  1001    BEIJING      100-A   23 ...     Y  NaN         100    A
2013-01-04  1003  GUANGZHOU      110-A   54 ...     Y  NaN         110    A

[3 rows x 11 columns]
===============================
手动设置采样权重 
===============================
     id       date       city category  age   price
2  1003 2013-01-04  GUANGZHOU    110-A   54  2133.0
5  1006 2013-01-07    BEIJING    130-F   45  4432.0
===============================
采样后不放回
===============================
     id       date       city category  age   price
0  1001 2013-01-02    BEIJING    100-A   23  1200.9
4  1005 2013-01-06   SHANGHAI    210-A   34     NaN
3  1004 2013-01-05   SHENZHEN    110-C   32  5433.2
2  1003 2013-01-04  GUANGZHOU    110-A   54  2133.0
1  1002 2013-01-03   shanghai    100-B   44     NaN
5  1006 2013-01-07    BEIJING    130-F   45  4432.0
===============================
采样后放回
===============================
     id       date       city category  age   price
1  1002 2013-01-03   shanghai    100-B   44     NaN
2  1003 2013-01-04  GUANGZHOU    110-A   54  2133.0
3  1004 2013-01-05   SHENZHEN    110-C   32  5433.2
2  1003 2013-01-04  GUANGZHOU    110-A   54  2133.0
4  1005 2013-01-06   SHANGHAI    210-A   34     NaN
0  1001 2013-01-02    BEIJING    100-A   23  1200.9
===============================
数据表描述性统计
===============================
       count     mean      std     min      25%     50%      75%     max
id       6.0  1003.50     1.87  1001.0  1002.25  1003.5  1004.75  1006.0
age      6.0    38.67    11.09    23.0    32.50    39.0    44.75    54.0
price    4.0  3299.78  1966.39  1200.9  1899.98  3282.5  4682.30  5433.2
===============================
计算列的标准差
===============================
1966.39058917
===============================
计算两个字段间的协方差
===============================
nan
===============================
数据表中所有字段间的协方差
===============================
                id          age         price
id        3.500000     5.800000  3.242617e+03
age       5.800000   123.066667  2.646583e+03
price  3242.616667  2646.583333  3.866692e+06
===============================
两个字段的相关性分析
===============================
nan
===============================
数据表的相关性分析
===============================
             id       age     price
id     1.000000  0.279463  0.792163
age    0.279463  1.000000  0.098074
price  0.792163  0.098074  1.000000
===============================
********************************
八、数据输出
********************************
数据导出:写入Excel
写入成功
===============================
数据导出:写入到CSV
写入成功
===============================

 

posted @ 2019-04-16 14:49  我不是高斯分布  阅读(831)  评论(0编辑  收藏  举报