import numpy as np
import pandas as pd
from pandas import DataFrame
Replacing values
df = DataFrame(data=np.random.randint(0,100,size=(5,6)))
df
|   |  0 |  1 |  2 |  3 |  4 |  5 |
|---|----|----|----|----|----|----|
| 0 | 65 | 85 | 66 | 36 | 72 |  2 |
| 1 | 85 | 51 | 15 | 67 | 57 | 56 |
| 2 |  1 | 48 |  9 | 39 | 88 | 52 |
| 3 | 48 | 34 | 89 | 50 |  5 | 63 |
| 4 | 17 | 75 |  5 | 72 | 44 | 53 |
df.replace(to_replace=2,value='Two')
|   |  0 |  1 |  2 |  3 |  4 |  5  |
|---|----|----|----|----|----|-----|
| 0 | 65 | 85 | 66 | 36 | 72 | Two |
| 1 | 85 | 51 | 15 | 67 | 57 | 56  |
| 2 |  1 | 48 |  9 | 39 | 88 | 52  |
| 3 | 48 | 34 | 89 | 50 |  5 | 63  |
| 4 | 17 | 75 |  5 | 72 | 44 | 53  |
df.replace(to_replace={1:'one'})  # dict form: {value to be replaced: replacement}
|   |  0  |  1 |  2 |  3 |  4 |  5 |
|---|-----|----|----|----|----|----|
| 0 | 65  | 85 | 66 | 36 | 72 |  2 |
| 1 | 85  | 51 | 15 | 67 | 57 | 56 |
| 2 | one | 48 |  9 | 39 | 88 | 52 |
| 3 | 48  | 34 | 89 | 50 |  5 | 63 |
| 4 | 17  | 75 |  5 | 72 | 44 | 53 |
df.replace(to_replace={4:5},value='five')
# replace elements only in a specified column: to_replace={column label: value to be replaced}
|   |  0 |  1 |  2 |  3 |  4   |  5 |
|---|----|----|----|----|------|----|
| 0 | 65 | 85 | 66 | 36 | 72   |  2 |
| 1 | 85 | 51 | 15 | 67 | 57   | 56 |
| 2 |  1 | 48 |  9 | 39 | 88   | 52 |
| 3 | 48 | 34 | 89 | 50 | five | 63 |
| 4 | 17 | 75 |  5 | 72 | 44   | 53 |
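- replace also accepts a list, in case several values need swapping in one call; a minimal sketch against the df above (the values 5 and 63 are picked arbitrarily for illustration):
df.replace(to_replace=[5,63],value='changed')  # every 5 and 63 anywhere in df becomes 'changed'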
Mapping
dic = {
    'name':['张三','李四','张三'],
    'salary':[15000,20000,15000]
}
df = DataFrame(data=dic)
df
|   | name | salary |
|---|------|--------|
| 0 | 张三 | 15000  |
| 1 | 李四 | 20000  |
| 2 | 张三 | 15000  |
# the mapping table
dic = {
    '张三':'tom',
    '李四':'jack'
}
df['e_name'] = df['name'].map(dic)
df
|   | name | salary | e_name |
|---|------|--------|--------|
| 0 | 张三 | 15000  | tom    |
| 1 | 李四 | 20000  | jack   |
| 2 | 张三 | 15000  | tom    |
- map is a method of Series, so it can only be called on a Series
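- For element-wise mapping across an entire DataFrame (rather than one column), applymap is the DataFrame-level counterpart; a small sketch with a made-up numeric frame, since applymap visits every cell:
df_num = DataFrame(data=np.random.randint(0,10,size=(3,3)))  # df_num is a hypothetical all-numeric frame
df_num.applymap(lambda x: x * 10)  # the function is applied to each individual cell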
Computation tools
- The portion of a salary above 3000 is taxed at 50%; compute everyone's after-tax salary
# a custom computation rule
def after_sal(s):  # compute the after-tax salary for s
    return s - (s-3000)*0.5
df['after_sal'] = df['salary'].map(after_sal)  # each element (a salary) of the df['salary'] Series is passed in as s
df
|   | name | salary | e_name | after_sal |
|---|------|--------|--------|-----------|
| 0 | 张三 | 15000  | tom    | 9000.0    |
| 1 | 李四 | 20000  | jack   | 11500.0   |
| 2 | 张三 | 15000  | tom    | 9000.0    |
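- Series.apply accepts the same kind of one-argument function and gives the same result here; a quick sketch:
df['salary'].apply(after_sal)  # equivalent to df['salary'].map(after_sal) for this rule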
Random sampling via permutation
- take()
- np.random.permutation()
df = DataFrame(data=np.random.randint(0,100,size=(100,3)),columns=['A','B','C'])
df
|    |  A |  B |  C |
|----|----|----|----|
| 0  |  6 | 97 | 40 |
| 1  |  9 | 42 |  1 |
| 2  |  7 | 40 | 82 |
| 3  | 82 | 89 | 50 |
| 4  |  6 | 93 | 93 |
| …  |  … |  … |  … |
| 97 | 83 | 44 | 14 |
| 98 | 59 | 17 | 61 |
| 99 | 66 | 46 | 99 |

100 rows × 3 columns
# generate a randomly shuffled sequence
np.random.permutation(10)
array([2, 1, 7, 3, 5, 0, 4, 9, 6, 8])
# shuffle the original data: take reorders by positional index, and axis=1 reorders the columns
df.take([2,0,1],axis=1)
df.take(np.random.permutation(3),axis=1)
|    |  C |  A |  B |
|----|----|----|----|
| 0  | 40 |  6 | 97 |
| 1  |  1 |  9 | 42 |
| 2  | 82 |  7 | 40 |
| 3  | 50 | 82 | 89 |
| 4  | 93 |  6 | 93 |
| …  |  … |  … |  … |
| 97 | 14 | 83 | 44 |
| 98 | 61 | 59 | 17 |
| 99 | 99 | 66 | 46 |

100 rows × 3 columns
df.take(np.random.permutation(3),axis=1).take(np.random.permutation(100),axis=0)[0:50]  # shuffle the columns, shuffle the rows, then slice off 50 random rows
|    |  A |  B |  C |
|----|----|----|----|
| 51 | 38 | 90 | 95 |
| 87 | 13 | 94 | 85 |
| 76 | 82 | 36 | 46 |
| 92 | 62 | 33 | 77 |
| 64 | 51 | 55 |  6 |
| …  |  … |  … |  … |
| 45 | 17 | 30 | 44 |
| 70 | 23 |  1 |  7 |
| 10 | 30 | 33 | 45 |
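- pandas also offers a single-call way to draw random rows, which may read more clearly than the permutation-and-take idiom above; a sketch:
df.sample(n=50)  # 50 rows sampled at random without replacement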
Grouping data
- The core of grouped data processing:
    - the groupby() function
    - the groups attribute, which shows how the rows were grouped
df = DataFrame({'item':['Apple','Banana','Orange','Banana','Orange','Apple'],
                'price':[4,3,3,2.5,4,2],
                'color':['red','yellow','yellow','green','green','green'],
                'weight':[12,20,50,30,20,44]})
df
|   | item   | price | color  | weight |
|---|--------|-------|--------|--------|
| 0 | Apple  | 4.0   | red    | 12     |
| 1 | Banana | 3.0   | yellow | 20     |
| 2 | Orange | 3.0   | yellow | 50     |
| 3 | Banana | 2.5   | green  | 30     |
| 4 | Orange | 4.0   | green  | 20     |
| 5 | Apple  | 2.0   | green  | 44     |
# analyze the data by fruit type
df.groupby(by='item')
<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x1152f1908>
# inspect the grouping in detail
df.groupby(by='item').groups
{'Apple': Int64Index([0, 5], dtype='int64'),
'Banana': Int64Index([1, 3], dtype='int64'),
'Orange': Int64Index([2, 4], dtype='int64')}
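- The grouped object is also iterable, yielding (group key, sub-DataFrame) pairs; a small sketch:
for name, group in df.groupby(by='item'):
    print(name, len(group))  # e.g. Apple 2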
# compute the average price of each fruit
df.groupby(by='item')['price'].mean()
item
Apple 3.00
Banana 2.75
Orange 3.50
Name: price, dtype: float64
# compute the average weight of the fruit of each color
df.groupby(by='color')['weight'].mean()
color
green 31.333333
red 12.000000
yellow 35.000000
Name: weight, dtype: float64
dic = df.groupby(by='color')['weight'].mean().to_dict()
# merge the computed average weights back into the source data
df['mean_w'] = df['color'].map(dic)
df
|   | item   | price | color  | weight | mean_w    |
|---|--------|-------|--------|--------|-----------|
| 0 | Apple  | 4.0   | red    | 12     | 12.000000 |
| 1 | Banana | 3.0   | yellow | 20     | 35.000000 |
| 2 | Orange | 3.0   | yellow | 50     | 35.000000 |
| 3 | Banana | 2.5   | green  | 30     | 31.333333 |
| 4 | Orange | 4.0   | green  | 20     | 31.333333 |
| 5 | Apple  | 2.0   | green  | 44     | 31.333333 |
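- The map-a-dict round trip above can be collapsed into a single step with transform, which broadcasts each group's statistic back onto the original rows (transform is introduced properly in the next section); a sketch:
df['mean_w'] = df.groupby(by='color')['weight'].transform('mean')  # produces the same mean_w column as above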
Advanced data aggregation
- After grouping with groupby, you can also pass custom functions to transform and apply for more elaborate computations
- df.groupby('item')['price'].sum() <==> df.groupby('item')['price'].apply(sum)
- Both transform and apply run the computation; just pass the function in
- transform and apply also accept a lambda expression (a sketch follows the examples below)
def my_mean(s):
    m_sum = 0
    for i in s:
        m_sum += i
    return m_sum / len(s)
df.groupby(by='item')['price'].transform(my_mean)
0 3.00
1 2.75
2 3.50
3 2.75
4 3.50
5 3.00
Name: price, dtype: float64
df.groupby(by='item')['price'].apply(my_mean)
item
Apple 3.00
Banana 2.75
Orange 3.50
Name: price, dtype: float64
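- As the bullets above note, a lambda works anywhere a named function does; a sketch that reproduces my_mean (note the shape difference above: transform returns one value per original row, while apply here returns one value per group):
df.groupby(by='item')['price'].transform(lambda s: s.mean())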
Loading data
df = pd.read_csv('./data/type-.txt')
df
|   | 你好-我好-他也好 |
|---|------------------|
| 0 | 也许-大概-有可能 |
| 1 | 然而-未必-不见得 |
df.shape
(2, 1)
- Store each word from the file as its own element in the DataFrame
pd.read_csv('./data/type-.txt',header=None,sep='-')
|   | 0    | 1    | 2      |
|---|------|------|--------|
| 0 | 你好 | 我好 | 他也好 |
| 1 | 也许 | 大概 | 有可能 |
| 2 | 然而 | 未必 | 不见得 |
# connect to the database and obtain a connection object
import sqlite3
conn = sqlite3.connect('./data/weather_2012.sqlite')
# read the rows stored in the database table
sql_df=pd.read_sql('select * from weather_2012',conn)
sql_df
|      | index  | Date/Time           | Temp (C) | Dew Point Temp (C) | Rel Hum (%) | Wind Spd (km/h) | Visibility (km) | Stn Press (kPa) | Weather              |
|------|--------|---------------------|----------|--------------------|-------------|-----------------|-----------------|-----------------|----------------------|
| 0    | 0.0    | 2012-01-01 00:00:00 | -1.8     | -3.9               | 86.0        | 4.0             | 8.0             | 101.24          | Fog                  |
| 1    | 1.0    | 2012-01-01 01:00:00 | -1.8     | -3.7               | 87.0        | 4.0             | 8.0             | 101.24          | Fog                  |
| 2    | 2.0    | 2012-01-01 02:00:00 | -1.8     | -3.4               | 89.0        | 7.0             | 4.0             | 101.26          | Freezing Drizzle,Fog |
| …    | …      | …                   | …        | …                  | …           | …               | …               | …               | …                    |
| 8783 | 8783.0 | 2012-12-31 23:00:00 | 0.0      | -2.1               | 86.0        | 30.0            | 11.3            | 99.89           | Snow                 |
| 8784 | NaN    | None                | NaN      | NaN                | NaN         | NaN             | NaN             | NaN             | Fog                  |
| 8785 | NaN    | None                | NaN      | NaN                | NaN         | NaN             | NaN             | NaN             | Fog                  |

8786 rows × 9 columns
# write the contents of a DataFrame into the database
df.to_sql('sql_data456',conn)
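- Reading the freshly written table back through the same connection is a quick sanity check; a sketch:
pd.read_sql('select * from sql_data456',conn)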
Pivot tables
import pandas as pd
import numpy as np
df = pd.read_csv('./data/透视表-篮球赛.csv',encoding='utf8')
df
|    | 对手   | 胜负 | 主客场 | 命中 | 投篮数 | 投篮命中率 | 3分命中率 | 篮板 | 助攻 | 得分 |
|----|--------|------|--------|------|--------|------------|-----------|------|------|------|
| 0  | 勇士   | 胜   | 客     | 10   | 23     | 0.435      | 0.444     | 6    | 11   | 27   |
| 1  | 国王   | 胜   | 客     | 8    | 21     | 0.381      | 0.286     | 3    | 9    | 27   |
| 2  | 小牛   | 胜   | 主     | 10   | 19     | 0.526      | 0.462     | 3    | 7    | 29   |
| 3  | 灰熊   | 负   | 主     | 8    | 20     | 0.400      | 0.250     | 5    | 8    | 22   |
| 4  | 76人   | 胜   | 客     | 10   | 20     | 0.500      | 0.250     | 3    | 13   | 27   |
| 5  | 黄蜂   | 胜   | 客     | 8    | 18     | 0.444      | 0.400     | 10   | 11   | 27   |
| 6  | 灰熊   | 负   | 客     | 6    | 19     | 0.316      | 0.222     | 4    | 8    | 20   |
| 7  | 76人   | 负   | 主     | 8    | 21     | 0.381      | 0.429     | 4    | 7    | 29   |
| 8  | 尼克斯 | 胜   | 客     | 9    | 23     | 0.391      | 0.353     | 5    | 9    | 31   |
| 9  | 老鹰   | 胜   | 客     | 8    | 15     | 0.533      | 0.545     | 3    | 11   | 29   |
| 10 | 爵士   | 胜   | 主     | 19   | 25     | 0.760      | 0.875     | 2    | 13   | 56   |
| 11 | 骑士   | 胜   | 主     | 8    | 21     | 0.381      | 0.429     | 11   | 13   | 35   |
| 12 | 灰熊   | 胜   | 主     | 11   | 25     | 0.440      | 0.429     | 4    | 8    | 38   |
| 13 | 步行者 | 胜   | 客     | 9    | 21     | 0.429      | 0.250     | 5    | 15   | 26   |
| 14 | 猛龙   | 负   | 主     | 8    | 25     | 0.320      | 0.273     | 6    | 11   | 38   |
| 15 | 太阳   | 胜   | 客     | 12   | 22     | 0.545      | 0.545     | 2    | 7    | 48   |
| 16 | 灰熊   | 胜   | 客     | 9    | 20     | 0.450      | 0.500     | 5    | 7    | 29   |
| 17 | 掘金   | 胜   | 主     | 6    | 16     | 0.375      | 0.143     | 8    | 9    | 21   |
| 18 | 尼克斯 | 胜   | 主     | 12   | 27     | 0.444      | 0.385     | 2    | 10   | 37   |
| 19 | 篮网   | 胜   | 主     | 13   | 20     | 0.650      | 0.615     | 10   | 8    | 37   |
| 20 | 步行者 | 胜   | 主     | 8    | 22     | 0.364      | 0.333     | 8    | 10   | 29   |
| 21 | 湖人   | 胜   | 客     | 13   | 22     | 0.591      | 0.444     | 4    | 9    | 36   |
| 22 | 爵士   | 胜   | 客     | 8    | 19     | 0.421      | 0.333     | 5    | 3    | 29   |
| 23 | 开拓者 | 胜   | 客     | 16   | 29     | 0.552      | 0.571     | 8    | 3    | 48   |
| 24 | 鹈鹕   | 胜   | 主     | 8    | 16     | 0.500      | 0.400     | 1    | 17   | 26   |
pivot_table has four parameters that matter most: index, values, columns, and aggfunc
df.pivot_table(index=['对手','主客场'])
| 对手   | 主客场 | 3分命中率 | 助攻 | 命中 | 得分 | 投篮命中率 | 投篮数 | 篮板 |
|--------|--------|-----------|------|------|------|------------|--------|------|
| 76人   | 主     | 0.4290    | 7.0  | 8.0  | 29.0 | 0.381      | 21.0   | 4.0  |
| 76人   | 客     | 0.2500    | 13.0 | 10.0 | 27.0 | 0.500      | 20.0   | 3.0  |
| 勇士   | 客     | 0.4440    | 11.0 | 10.0 | 27.0 | 0.435      | 23.0   | 6.0  |
| 国王   | 客     | 0.2860    | 9.0  | 8.0  | 27.0 | 0.381      | 21.0   | 3.0  |
| 太阳   | 客     | 0.5450    | 7.0  | 12.0 | 48.0 | 0.545      | 22.0   | 2.0  |
| 小牛   | 主     | 0.4620    | 7.0  | 10.0 | 29.0 | 0.526      | 19.0   | 3.0  |
| 尼克斯 | 主     | 0.3850    | 10.0 | 12.0 | 37.0 | 0.444      | 27.0   | 2.0  |
| 尼克斯 | 客     | 0.3530    | 9.0  | 9.0  | 31.0 | 0.391      | 23.0   | 5.0  |
| 开拓者 | 客     | 0.5710    | 3.0  | 16.0 | 48.0 | 0.552      | 29.0   | 8.0  |
| 掘金   | 主     | 0.1430    | 9.0  | 6.0  | 21.0 | 0.375      | 16.0   | 8.0  |
| 步行者 | 主     | 0.3330    | 10.0 | 8.0  | 29.0 | 0.364      | 22.0   | 8.0  |
| 步行者 | 客     | 0.2500    | 15.0 | 9.0  | 26.0 | 0.429      | 21.0   | 5.0  |
| 湖人   | 客     | 0.4440    | 9.0  | 13.0 | 36.0 | 0.591      | 22.0   | 4.0  |
| 灰熊   | 主     | 0.3395    | 8.0  | 9.5  | 30.0 | 0.420      | 22.5   | 4.5  |
| 灰熊   | 客     | 0.3610    | 7.5  | 7.5  | 24.5 | 0.383      | 19.5   | 4.5  |
| 爵士   | 主     | 0.8750    | 13.0 | 19.0 | 56.0 | 0.760      | 25.0   | 2.0  |
| 爵士   | 客     | 0.3330    | 3.0  | 8.0  | 29.0 | 0.421      | 19.0   | 5.0  |
| 猛龙   | 主     | 0.2730    | 11.0 | 8.0  | 38.0 | 0.320      | 25.0   | 6.0  |
| 篮网   | 主     | 0.6150    | 8.0  | 13.0 | 37.0 | 0.650      | 20.0   | 10.0 |
| 老鹰   | 客     | 0.5450    | 11.0 | 8.0  | 29.0 | 0.533      | 15.0   | 3.0  |
| 骑士   | 主     | 0.4290    | 13.0 | 8.0  | 35.0 | 0.381      | 21.0   | 11.0 |
| 鹈鹕   | 主     | 0.4000    | 17.0 | 8.0  | 26.0 | 0.500      | 16.0   | 1.0  |
| 黄蜂   | 客     | 0.4000    | 11.0 | 8.0  | 27.0 | 0.444      | 18.0   | 10.0 |
- The values parameter filters which columns are included in the computation
- If we only need Harden's points, rebounds, and assists broken down by home/away and win/loss:
df.pivot_table(index=['主客场','胜负'],values=['得分','篮板','助攻'])
| 主客场 | 胜负 | 助攻      | 得分      | 篮板     |
|--------|------|-----------|-----------|----------|
| 主     | 胜   | 10.555556 | 34.222222 | 5.444444 |
| 主     | 负   | 8.666667  | 29.666667 | 5.000000 |
| 客     | 胜   | 9.000000  | 32.000000 | 4.916667 |
| 客     | 负   | 8.000000  | 20.000000 | 4.000000 |
df.pivot_table(index=['主客场','胜负'],values=['得分','篮板','助攻'],aggfunc='sum')
| 主客场 | 胜负 | 助攻 | 得分 | 篮板 |
|--------|------|------|------|------|
| 主     | 胜   | 95   | 308  | 49   |
| 主     | 负   | 26   | 89   | 15   |
| 客     | 胜   | 108  | 384  | 59   |
| 客     | 负   | 8    | 20   | 4    |
# total points scored at home and away, summed over all opponents
df.pivot_table(index='主客场',values='得分',aggfunc='sum')
# total points at home and away against each opponent (the totals above, further broken down by opponent)
df.pivot_table(index='主客场',values='得分',columns='对手',aggfunc='sum',fill_value=0)
| 主客场 \ 对手 | 76人 | 勇士 | 国王 | 太阳 | 小牛 | 尼克斯 | 开拓者 | 掘金 | 步行者 | 湖人 | 灰熊 | 爵士 | 猛龙 | 篮网 | 老鹰 | 骑士 | 鹈鹕 | 黄蜂 |
|---------------|------|------|------|------|------|--------|--------|------|--------|------|------|------|------|------|------|------|------|------|
| 主            | 29   | 0    | 0    | 0    | 29   | 37     | 0      | 21   | 29     | 0    | 60   | 56   | 38   | 37   | 0    | 35   | 26   | 0    |
| 客            | 27   | 27   | 27   | 48   | 0    | 31     | 48     | 0    | 26     | 36   | 49   | 29   | 0    | 0    | 29   | 0    | 0    | 27   |
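- pivot_table can also append row and column totals via its margins flag; a sketch:
df.pivot_table(index='主客场',values='得分',columns='对手',aggfunc='sum',fill_value=0,margins=True)  # adds an 'All' row and column of totals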
Crosstabs
- A special kind of pivot used to count group frequencies and summarize data
- pd.crosstab(index,columns)
    - index: the grouping data, used as the crosstab's row index
    - columns: the crosstab's column index
import pandas as pd
from pandas import DataFrame
df = DataFrame({'sex':['man','man','women','women','man','women','man','women','women'],
                'age':[15,23,25,17,35,57,24,31,22],
                'smoke':[True,False,False,True,True,False,False,True,False],
                'height':[168,179,181,166,173,178,188,190,160]})
df
|   | sex   | age | smoke | height |
|---|-------|-----|-------|--------|
| 0 | man   | 15  | True  | 168    |
| 1 | man   | 23  | False | 179    |
| 2 | women | 25  | False | 181    |
| 3 | women | 17  | True  | 166    |
| 4 | man   | 35  | True  | 173    |
| 5 | women | 57  | False | 178    |
| 6 | man   | 24  | False | 188    |
| 7 | women | 31  | True  | 190    |
| 8 | women | 22  | False | 160    |
pd.crosstab(df.smoke,df.sex)
| smoke \ sex | man | women |
|-------------|-----|-------|
| False       | 2   | 3     |
| True        | 2   | 2     |
pd.crosstab(df.age,df.smoke)
| age \ smoke | False | True |
|-------------|-------|------|
| 15          | 0     | 1    |
| 17          | 0     | 1    |
| 22          | 1     | 0    |
| 23          | 1     | 0    |
| 24          | 1     | 0    |
| 25          | 1     | 0    |
| 31          | 0     | 1    |
| 35          | 0     | 1    |
| 57          | 1     | 0    |
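- crosstab can report proportions instead of raw counts through its normalize parameter; a sketch:
pd.crosstab(df.smoke,df.sex,normalize='index')  # each row now sums to 1.0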