(七)数据分析-Python进行数据分析-numpy,pandas

Python基础知识（略）

数据分析相关：numpy，pandas

numpy

numpy功能相比pandas简单，使用较少

 1 import numpy as np
 2 list1 = [1,2,3,4,5,6]
 3 print(list1)
 4 arr1 = np.array(list1) #与列表不是同一种数据结构
 5 print(arr1,type(arr1))
 6 # [1, 2, 3, 4, 5, 6]
 7 # [1 2 3 4 5 6] <class 'numpy.ndarray'>
 8 print(arr1[0],arr1[1:-1]) #同样可以索引、切片
 9 # 1 [2 3 4 5]
10 print(arr1 + 1) #加是对每个子元素都加
11 # [2 3 4 5 6 7]
12 print(arr1+arr1,arr1*2) #每个子元素一一对应进行运算
13 # [ 2  4  6  8 10 12] [ 2  4  6  8 10 12]
14 print(arr1*arr1)
15 # [ 1  4  9 16 25 36]
16 arr2= np.array([[1,2,3,4],[5,6,7,8]]) #可以多维
17 print(arr2)
18 # [[1 2 3 4]
19 #  [5 6 7 8]]
20 print(arr2[0])
21 # [1 2 3 4]
22 print("sum:",arr2+arr2,"\ndouble:",arr2*2,"\nsquare:",arr2*arr2)
23 # sum: [[ 2  4  6  8]
24 #  [10 12 14 16]]
25 # double: [[ 2  4  6  8]
26 #  [10 12 14 16]]
27 # square: [[ 1  4  9 16]
28 #  [25 36 49 64]]
29 print(arr2.dtype)
30 # int32 表示32位整型的数据格式（默认位数取决于平台和numpy版本，也可能是int64）；如果是浮点型则叫float64

pandas

series数据类型（可以理解为一维）

 1 import pandas as pd
 2 #两种数据结构，series一维，dataframe二维
 3 s1 = pd.Series([1,2,3,4,5,6])
 4 print(s1,type(s1))
 5 # 0    1
 6 # 1    2
 7 # 2    3
 8 # 3    4
 9 # 4    5
10 # 5    6 多出的数据标签，可以理解为索引
11 # dtype: int64 <class 'pandas.core.series.Series'>
12 s2 = pd.Series([1,2,3,4,],["a","b","c","d"])
13 print(s2)
14 # a    1
15 # b    2
16 # c    3
17 # d    4
18 # dtype: int64
19 print(s1[2],s2["b"])
20 # 3 2
21 print(s2[["a","b"]])
22 # a    1
23 # b    2
24 # dtype: int64
25 s2["d"] = 10
26 print(s2)
27 # a     1
28 # b     2
29 # c     3
30 # d    10
31 # dtype: int64
32 s2["e"] = 100
33 print(s2)
34 # a      1
35 # b      2
36 # c      3
37 # d     10
38 # e    100
39 # dtype: int64
40 s3 = pd.Series({"jenny":18,"aaron":34,"chow":41})
41 print(s3)
42 # jenny    18
43 # aaron    34
44 # chow     41
45 # dtype: int64
46 s4 = s3.astype(str) #数据转化
47 print(s4)
48 # jenny    18
49 # aaron    34
50 # chow     41
51 # dtype: object 表示字符串
52 s3["lily"] = "tweety"
53 print(s3)
54 # jenny        18
55 # aaron        34
56 # chow         41
57 # lily     tweety
58 # dtype: object

dataFrame数据类型

  1 import pandas as pd
  2 df= pd.DataFrame({"name":["Jenny","Aaron","Jay"],
  3                   "age":[18,34,41],
  4                   "gender":["female","male","male"]
  5 }) #可以输入字典格式
  6 print(df)
  7 #     name  age  gender
  8 # 0  Jenny   18  female
  9 # 1  Aaron   34    male
 10 # 2    Jay   41    male
 11 df2 = pd.DataFrame([1,23,33]) #可以输入单一列表
 12 print(df2)
 13 #     0
 14 # 0   1
 15 # 1  23
 16 # 2  33
 17 df3 = pd.DataFrame([[1,23,33],[22,34,65]],columns = list("yea")) #可以输入多维列表
 18 print(df3)
 19 #     y   e   a
 20 # 0   1  23  33
 21 # 1  22  34  65
 22 df3.info()
 23 # <class 'pandas.core.frame.DataFrame'>
 24 # RangeIndex: 2 entries, 0 to 1
 25 # Data columns (total 3 columns):
 26 # y    2 non-null int64
 27 # e    2 non-null int64
 28 # a    2 non-null int64
 29 # dtypes: int64(3)
 30 # memory usage: 128.0 bytes
 31 print(df["name"]) #用单个列名取出一列，数据类型变成series
 32 # 0    Jenny
 33 # 1    Aaron
 34 # 2      Jay
 35 # Name: name, dtype: object
 36 print(df.gender) #类的方式操作
 37 # 0    female
 38 # 1      male
 39 # 2      male
 40 # Name: gender, dtype: object
 41 print(df[["name","age"]]) #可以过滤
 42 #     name  age
 43 # 0  Jenny   18
 44 # 1  Aaron   34
 45 # 2    Jay   41
 46 df.age = 18 #整体修改统一值
 47 print(df)
 48 #     name  age  gender
 49 # 0  Jenny   18  female
 50 # 1  Aaron   18    male
 51 # 2    Jay   18    male
 52 df["age"][1] = 20 #修改具体值（注意：这是链式赋值，会触发SettingWithCopyWarning，新版pandas开启写时复制后不再生效，推荐写成 df.loc[1,"age"] = 20）
 53 print(df)
 54 #     name  age  gender
 55 # 0  Jenny   18  female
 56 # 1  Aaron   20    male
 57 # 2    Jay   18    male
 58 df.age = [15,25,35] #用列表方式传递值修改
 59 print(df)
 60 #     name  age  gender
 61 # 0  Jenny   15  female
 62 # 1  Aaron   25    male
 63 # 2    Jay   35    male
 64 df.index = list("abc") #也可以修改行标签
 65 print(df)
 66 #     name  age  gender
 67 # a  Jenny   15  female
 68 # b  Aaron   25    male
 69 # c    Jay   35    male
 70 print(df.age == 15) #进行逻辑判断,返回bool类型表格
 71 # a     True
 72 # b    False
 73 # c    False
 74 # Name: age, dtype: bool
 75 print(df[df.age == 15])#提取符合条件的值
 76 #     name  age  gender
 77 # a  Jenny   15  female
 78 print(~(df == 15))#反转结果
 79 #    name    age  gender
 80 # a  True  False    True
 81 # b  True   True    True
 82 # c  True   True    True
 83 print(df[(df.age>19)&(df.name=="Aaron")]) #综合条件判断查找
 84 #     name  age gender
 85 # b  Aaron   25   male
 86 print(df.query("(age > 19) &(name == 'Aaron')"))#相比索引方式输入简单一些
 87 #     name  age gender
 88 # b  Aaron   25   male
 89 print(df.iloc[0])#按行的整数位置（第几行，从0开始）查找对应行
 90 print(df.loc["a"]) #针对标签内容查找对应行
 91 # name       Jenny
 92 # age           15
 93 # gender    female
 94 # Name: a, dtype: object
 95 # name       Jenny
 96 # age           15
 97 # gender    female
 98 # Name: a, dtype: object
 99 print(df.iloc[1:2,1:2])
100 #    age
101 # b   25
102 print(df.loc["b","name"])
103 # Aaron

结合实例使用pandas

 1 import pandas as pd
 2 df = pd.read_csv("D:\Python Data Analysis\Instance.csv",encoding="gbk") #路径中的反斜杠可能被当作转义字符，建议写成原始字符串 r"D:\Python Data Analysis\Instance.csv"
 3 
 4 df.head()
 5 
 6 df.tail()
 7 
 8 df["rank"]=df.topSalary.rank(ascending=False,method ="min" ) #增加列，rank()计算排名，其中method代表并列情况的处理方式（min表示并列时都取最小名次）。
 9 df.sort_values(by = "topSalary")
10 
11 df.info()
12 
13 df.city.unique() #输出非重复值,以array的形式
14 
15 df.city.value_counts() #对各个值进行计数
16 
17 df["ave"]=(df.topSalary +  df.botSalary)/2
18 df
19 
20 df.describe() #描述统计
21 
22 df.ave.describe() #对具体类目的统计
23 
24 df.ave.count() #上述的统计指标都可以直接使用
25 
26 df['cumsum'] = df.botSalary.cumsum() #累加
27 df
28 
29 df['bins'] = pd.cut(df.ave,bins=3,labels= ["low","middle","high"]) #按取值范围等宽分成3个区间（不是按数量等分），并且打上标签
30 df
31 
32 df['bins'] = pd.cut(df.ave,bins=[0,10,15,20,40],include_lowest=True)
33 df
34 
35 df.groupby(by = "city")
36 
37 df.groupby(by = "city").count()
38 
39 df.groupby(by = "city").ave.max() #各城市平均工资（ave）的最大值
40 
41 df.groupby(by = "city")['ave'].max() 
42 
43 df.groupby(by = ["city","company"]).max() #多重索引的聚合
44 
45 for i in df.groupby(by = "city"):
46     print(i,len(i[1])) #输出是元组
47 
48 for k,v in  df.groupby(by = "city"):
49     print(k) 
50     print(v)
51 
52 for k,v in  df.groupby(by = "city"):
53     print(k) 
54     print(v.topSalary.max()-v.topSalary.min())

pandas简单示例

数据关联（即多个表格或者文件的关联）

 1 import pandas as pd
 2 df1=pd.DataFrame({
 3     "A":list("abcd"),
 4     "B":list("efgh"),
 5     "C":list("oprt")
 6 })
 7 df2=pd.DataFrame({
 8     "C":list("oprt"),
 9     "D":list("ewee"),
10     "E":list("owfr")
11 })
12 df_merge=df1.merge(right=df2,how="inner") #左右拼接：未指定on时按两表的同名列（这里是C）做内连接
13 # print(df_merge)
14 # #    A  B  C  D  E
15 # # 0  a  e  o  e  o
16 # # 1  b  f  p  w  w
17 # # 2  c  g  r  e  f
18 # # 3  d  h  t  e  r
19 df_join=df2.join(df1,lsuffix="C1",rsuffix="C2")
20 # print(df_join)
21 # #   CC1  D  E  A  B CC2
22 # # 0   o  e  o  a  e   o
23 # # 1   p  w  w  b  f   p
24 # # 2   r  e  f  c  g   r
25 # # 3   t  e  r  d  h   t
26 df_concat1=pd.concat([df1,df2])# #上下堆叠，同名的列会对齐到同一列，缺失的位置填NaN
27 # print(df_concat1)
28 # #      A    B  C    D    E
29 # # 0    a    e  o  NaN  NaN
30 # # 1    b    f  p  NaN  NaN
31 # # 2    c    g  r  NaN  NaN
32 # # 3    d    h  t  NaN  NaN
33 # # 0  NaN  NaN  o    e    o
34 # # 1  NaN  NaN  p    w    w
35 # # 2  NaN  NaN  r    e    f
36 # # 3  NaN  NaN  t    e    r
37 
38 df_concat2=pd.concat([df1,df2],axis=1)#左右堆叠
39 print(df_concat2)
40 #    A  B  C  C  D  E
41 # 0  a  e  o  o  e  o
42 # 1  b  f  p  p  w  w
43 # 2  c  g  r  r  e  f
44 # 3  d  h  t  t  e  r

merge,join,concat

多重索引

 1 #!/usr/bin/env python 
 2 # -*- coding:utf-8 -*-
 3 import pandas as pd
 4 position=pd.read_csv("D:\Python Data Analysis\employee.csv",encoding="gbk")
 5 company=pd.read_csv("D:\Python Data Analysis\company.csv",encoding="gbk")
 6 
 7 res = position.groupby(by=["city","education"]).mean()
 8 print(res)
 9 #                   positionId      companyId  botSalary  topSalary        avg
10 # city education
11 # 上海   大专         2.352912e+06   90651.500000  11.000000  14.000000  12.500000
12 #      本科         2.335357e+06  106898.769231  24.153846  29.923077  27.038462
13 #      硕士         2.306621e+06   47185.000000  21.500000  25.000000  23.250000
14 # 北京   本科         2.074818e+06   59771.000000  13.833333  20.166667  17.000000
15 # 深圳   大专         2.241769e+06   45237.500000   7.000000  11.000000   9.000000
16 #      本科         2.108186e+06   49800.111111   8.000000  10.666667   9.333333
17 res=position.groupby(by=["city","education"]).mean().avg["上海"] #转成series形式再切片
18 print(res)
19 # education
20 # 大专    12.500000
21 # 本科    27.038462
22 # 硕士    23.250000
23 # Name: avg, dtype: float64
24 res=position.groupby(by=["city","education"]).mean().avg["上海"]["本科"] #还可以继续切片
25 print(res)
26 # 27.03846153846154
27 res=position.groupby(by=["city","education"]).mean().loc["上海"] #对于DataFrame可以使用loc来查找
28 print(res)
29 # education
30 # 大专          2352911.5   90651.500000  11.000000  14.000000  12.500000
31 # 本科          2335357.0  106898.769231  24.153846  29.923077  27.038462
32 # 硕士          2306621.0   47185.000000  21.500000  25.000000  23.250000
33 
34 res=position.groupby(by=["city","education"]).mean().loc["上海"].avg #这种也可以进一步查找
35 print(res)
36 # education
37 # 大专    12.500000
38 # 本科    27.038462
39 # 硕士    23.250000
40 # Name: avg, dtype: float64
41 res = position.groupby(by=["city","education"]).mean().loc["上海","大专"] #loc支持多索引
42 print(res)
43 # positionId    2352911.5
44 # companyId       90651.5
45 # botSalary          11.0
46 # topSalary          14.0
47 # avg                12.5
48 # Name: (上海, 大专), dtype: float64
49 res=position.set_index(["city","education"]) #set_index只是把列变成索引，每行数据原样保留，不做聚合合并，这一点与groupby不同
50 print(res)
51 # city education                                    ...
52 # 上海   本科            2535392      53194        金融类  ...        30  25.0      应届生
53 #      本科            2315154      50702         产品  ...        30  22.5      应届生
54 #      本科            2306906      45315         技术  ...        45  40.0      应届生
55 #      本科            2406515      96063         金融  ...        25  22.5    5-10年
56 #      本科            2488286     152926         金融  ...        12  11.0    5-10年
57 #      本科            1900015     129806      市场与销售  ...        25  22.5    5-10年
58 #      本科            2507717      25322         技术  ...        14  12.0    5-10年
59 #      本科            2416226      18497         技术  ...        17  16.0    5-10年
60 #      本科             884378      25027         技术  ...        11  10.5    5-10年
61 # [34 rows x 12 columns]
62 res= position.sort_values(by=["city","education"]).set_index(["city","education"]) #先排序再处理，效果好一点
63 print(res)
64 # city education                                    ...
65 # 上海   大专            2358166      35176         技术  ...        16  14.0     1-3年
66 #      大专            2347657     146127         产品  ...        12  11.0     1-3年
67 #      本科            2535392      53194        金融类  ...        30  25.0      应届生
68 res=position.groupby(by=["city","education"]).mean().reset_index() #重置索引，城市变为了字段
69 print(res)
70 #   city education    positionId      companyId  botSalary  topSalary        avg
71 # 0   上海        大专  2.352912e+06   90651.500000  11.000000  14.000000  12.500000
72 # 1   上海        本科  2.335357e+06  106898.769231  24.153846  29.923077  27.038462
73 # 2   上海        硕士  2.306621e+06   47185.000000  21.500000  25.000000  23.250000
74 # 3   北京        本科  2.074818e+06   59771.000000  13.833333  20.166667  17.000000
75 # 4   深圳        大专  2.241769e+06   45237.500000   7.000000  11.000000   9.000000
76 # 5   深圳        本科  2.108186e+06   49800.111111   8.000000  10.666667   9.333333
77 res=position.groupby(by=["city","education"]).mean().reset_index()["city"]#可以再找索引
78 print(res)
79 # 0    上海
80 # 1    上海
81 # 2    上海
82 # 3    北京
83 # 4    深圳
84 # 5    深圳
85 # Name: city, dtype: object

多重索引

数据清洗：

　　去除[],"

 1 #!/usr/bin/env python 
 2 # -*- coding:utf-8 -*-
 3 import pandas as pd
 4 position=pd.read_csv("D:\Python Data Analysis\employee.csv",encoding="gbk")
 5 print(position.positionLables[1:-1] )#直接切片是对Series的行做切片（去掉首尾两行），而不是对每个字符串的内容操作
 6 # 1              ['平台', '产品经理', '产品', '数据']
 7 # 2                           ['架构师', '数据']
 8 # 3                          ['数据分析', '数据']
 9 # 4                          ['数据分析', '数据']
10 # 5                     ['大数据', '总监', '数据']
11 # 6                     ['平台', '管理岗', '数据']
12 # 7                          ['数据挖掘', '数据']
13 # 8              ['商业', '数据分析', '数据', 'BI']
14 # 9                           ['大数据', '数据']
15 # 10                           ['风控', '数据']
16 # 11             ['专家', '架构师', '大数据', '数据']
17 # 12                        ['分析师', '需求分析']
18 # Name: positionLables, dtype: object
19 print(position.positionLables.str[1:-1] ) #使用str，就可以对具体字符操作了
20 # 0                     '分析师', '信贷', '数据'
21 # 1              '平台', '产品经理', '产品', '数据'
22 # 2                           '架构师', '数据'
23 # 3                          '数据分析', '数据'
24 # 4                          '数据分析', '数据'
25 # 5                     '大数据', '总监', '数据'
26 # 6                     '平台', '管理岗', '数据'
27 # 7                          '数据挖掘', '数据'
28 # 8              '商业', '数据分析', '数据', 'BI'
29 # 9                           '大数据', '数据'
30 # 10                           '风控', '数据'
31 # Name: positionLables, dtype: object
32 print(position.positionLables.str[1:-1].str.replace("'","") )#使用str，才是对数组内容操作
33 # 0                 分析师, 信贷, 数据
34 # 1            平台, 产品经理, 产品, 数据
35 # 2                     架构师, 数据
36 # 3                    数据分析, 数据
37 # 4                    数据分析, 数据
38 # 5                 大数据, 总监, 数据
39 # 6                 平台, 管理岗, 数据
40 # 7                    数据挖掘, 数据
41 # 8            商业, 数据分析, 数据, BI
42 # 9                     大数据, 数据
43 # 10                     风控, 数据
44 # 11           专家, 架构师, 大数据, 数据
45 # 12                  分析师, 需求分析
46 # Name: positionLables, dtype: object

数据清洗

　　去重

 1 import pandas as pd
 2 import numpy as np
 3 position=pd.read_csv("D:\Python Data Analysis\employee.csv",encoding="gbk")
 4 df1=pd.DataFrame({
 5     "A":list("abcda"),
 6     "B":list("efghe"),
 7     "C":list("oprto")
 8 
 9 })
10 #    A  B  C
11 # 0  a  e  o
12 # 1  b  f  p
13 # 2  c  g  r
14 # 3  d  h  t
15 # 4  a  e  o
16 res=df1.duplicated() #返回布尔值
17 print(res)
18 # 0    False
19 # 1    False
20 # 2    False
21 # 3    False
22 # 4     True
23 # dtype: bool
24 res=df1[~res]
25 print(res)
26 #    A  B  C
27 # 0  a  e  o
28 # 1  b  f  p
29 # 2  c  g  r
30 # 3  d  h  t
31 res=df1.drop_duplicates()
32 print(res)
33 #    A  B  C
34 # 0  a  e  o
35 # 1  b  f  p
36 # 2  c  g  r
37 # 3  d  h  t

去重

聚合函数的用法

 1 #!/usr/bin/env python 
 2 # -*- coding:utf-8 -*-
 3 import pandas as pd
 4 position=pd.read_csv("D:\Python Data Analysis\employee.csv",encoding="gbk")
 5 position["new"]= position.avg.astype(str) + "k" #将数值转换为字符串，就可以拼接
 6 print(position.new)
 7 # 0     25.0k
 8 # 1     22.5k
 9 # 2     40.0k
10 # 3     22.5k
11 position["new"]= position.avg.apply(lambda x:str(x)+"k") #也可以使用apply来做到类型的转换，Series的apply是对该列的每个元素逐一调用函数
12 print(position.new)
13 
14 def func(x):
15     if x.avg>=20:
16         return "20k+"
17     else:
18         return "0-20k"
19 position["label"]=position.apply(func,axis=1) #axis=0默认对列,axis=1表示逐行
20 print(position.label)
21 # 0      20k+
22 # 1      20k+
23 # 2      20k+
24 # 3      20k+
25 # 4     0-20k
26 
27 # 输出各城市top5
28 def func(x):
29     res = x.sort_values("avg",ascending=False)
30     return res[0:5]
31 res= position.groupby(by="city").apply(func)
32 print(res)
33 # city                                            ...
34 # 上海   11     2568751   上海     150849  开发/测试/运维类  ...  52.5     1-3年  52.5k   20k+
35 #      10     2377876   上海     147183         金融  ...  42.5    5-10年  42.5k   20k+
36 #      2      2306906   上海      45315         技术  ...  40.0      应届生  40.0k   20k+
37 #      9      2128962   上海     140164         技术  ...  31.5    5-10年  31.5k   20k+
38 #      7      2472750   上海      56732         技术  ...  27.5    5-10年  27.5k   20k+
39 # 北京   29     2031072   北京      36162         技术  ...  25.0     1-3年  25.0k   20k+
40 #      28     2065400   北京     107423         技术  ...  23.5     1-3年  23.5k   20k+
41 #      32     2416226   北京      18497         技术  ...  16.0    5-10年  16.0k  0-20k
42 #      30     2544117   北京     146195         金融  ...  15.0    5-10年  15.0k  0-20k
43 #      31     2507717   北京      25322         技术  ...  12.0    5-10年  12.0k  0-20k
44 # 深圳   21     1280156   深圳       6566         设计  ...  12.5      应届生  12.5k  0-20k
45 #      20     2506489   深圳      83303         设计  ...  12.0      应届生  12.0k  0-20k
46 #      26     2351129   深圳      16213         技术  ...  11.0     1-3年  11.0k  0-20k
47 #      27     2580565   深圳      26148  开发/测试/运维类  ...  10.5     1-3年  10.5k  0-20k
48 #      18     2550303   深圳      15175  开发/测试/运维类  ...   9.0     1-3年   9.0k  0-20k
49 # [15 rows x 16 columns]
50 def func(x,n,asc=False):
51     res = x.sort_values("avg",ascending=asc)
52     return res[0:n]
53 res= position.groupby(by="city").apply(func,n=2,asc=True) #指定输出前n项,指定排序规则
54 print(res)
55 # city                                           ...
56 # 上海   4      2488286   上海     152926        金融  ...  11.0    5-10年  11.0k  0-20k
57 #      15     2347657   上海     146127        产品  ...  11.0     1-3年  11.0k  0-20k
58 # 北京   33      884378   北京      25027        技术  ...  10.5    5-10年  10.5k  0-20k
59 #      31     2507717   北京      25322        技术  ...  12.0    5-10年  12.0k  0-20k
60 # 深圳   17     2374266   深圳     143996     市场与销售  ...   5.5     1-3年   5.5k  0-20k
61 #      23     2540389   深圳      12567     市场与销售  ...   7.0      应届生   7.0k  0-20k
62 # [6 rows x 16 columns]
63 res=position.groupby(by="city").agg("mean")  #聚合
64 print(res)
65 #        positionId     companyId  botSalary  topSalary        avg
66 # city                                                             
67 # 上海    2.334042e+06  97962.176471  22.294118  27.470588  24.882353
68 # 北京    2.074818e+06  59771.000000  13.833333  20.166667  17.000000
69 # 深圳    2.132474e+06  48970.545455   7.818182  10.727273   9.272727
70 
71 res=position.groupby(by="city").agg(["sum","mean"])  #多重列的形式聚合
72 print(res)
73 #      positionId               companyId  ...  topSalary    avg           
74 #             sum          mean       sum  ...       mean    sum       mean
75 # city                                     ...                             
76 # 上海     39678706  2.334042e+06   1665357  ...  27.470588  423.0  24.882353
77 # 北京     12448910  2.074818e+06    358626  ...  20.166667  102.0  17.000000
78 # 深圳     23457215  2.132474e+06    538676  ...  10.727273  102.0   9.272727

apply,agg

数据透视表的用法

 1 import pandas as pd
 2 import numpy as np
 3 position=pd.read_csv("D:\Python Data Analysis\employee.csv",encoding="gbk")
 4 company=pd.read_csv("D:\Python Data Analysis\company.csv",encoding="gbk")
 5 res=position.pivot_table(values="avg",
 6                          index=["city","education"],
 7                          columns='workYear',
 8                          aggfunc="mean",)
 9 print(res)
10 # workYear             1-3年      5-10年        应届生
11 # city education
12 # 上海   大专         12.500000        NaN        NaN
13 #      本科         29.333333  25.142857  29.166667
14 #      硕士         19.000000  27.500000        NaN
15 # 北京   本科         24.250000  13.375000        NaN
16 # 深圳   大专         10.500000        NaN   7.500000
17 #      本科          8.700000        NaN  10.125000
18 res=position.pivot_table(values="avg",
19                      index=["city","education"],
20                      columns="workYear",
21                      aggfunc=[np.mean,np.max])["mean"].loc['上海'] #这里传入np的统计函数（注：较新版本的pandas对此会给出FutureWarning，建议改用字符串"mean"、"max"）
22 print(res)
23 # workYear        1-3年      5-10年        应届生
24 # education
25 # 大专         12.500000        NaN        NaN
26 # 本科         29.333333  25.142857  29.166667
27 # 硕士         19.000000  27.500000        NaN
28 res=position.pivot_table(values="avg",
29                      index=["city","education"],
30                      columns="workYear",
31                      aggfunc="mean",
32                         margins = True) #margins是代表汇总项
33 print(res)
34 # workYear             1-3年      5-10年        应届生        All
35 # city education
36 # 上海   大专         12.500000        NaN        NaN  12.500000
37 #      本科         29.333333  25.142857  29.166667  27.038462
38 #      硕士         19.000000  27.500000        NaN  23.250000
39 # 北京   本科         24.250000  13.375000        NaN  17.000000
40 # 深圳   大专         10.500000        NaN   7.500000   9.000000
41 #      本科          8.700000        NaN  10.125000   9.333333
42 # All             16.750000  21.416667  16.937500  18.441176
43 res =position.pivot_table(values=["avg","topSalary"],
44                      index=["city","education"],
45                      columns="workYear",
46                      aggfunc={"avg":np.mean,"topSalary":np.sum},)#对不同的值按照不同的规则进行处理，可以用字典形式指定
47 print(res)
48 #                       avg                       topSalary
49 # workYear             1-3年      5-10年        应届生      1-3年  5-10年    应届生
50 # city education
51 # 上海   大专         12.500000        NaN        NaN      28.0    NaN    NaN
52 #      本科         29.333333  25.142857  29.166667      95.0  189.0  105.0
53 #      硕士         19.000000  27.500000        NaN      20.0   30.0    NaN
54 # 北京   本科         24.250000  13.375000        NaN      59.0   62.0    NaN
55 # 深圳   大专         10.500000        NaN   7.500000      12.0    NaN   10.0
56 #      本科          8.700000        NaN  10.125000      48.0    NaN   48.0

pivot_table

posted @ 2020-03-31 14:31 Jennifer224 阅读(249) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

Jennifer224

(七)数据分析-Python进行数据分析-numpy,pandas

Python基础知识（略）

numpy

pandas

公告