3.5.3 数据排序;重复数值、缺失值处理
import numpy as np
import pandas as pd
df = pd.read_csv('table.csv',index_col='ID') #用来指定表格的索引值
df.head(2)
School | Class | Gender | Address | Height | Weight | Math | Physics | |
---|---|---|---|---|---|---|---|---|
ID | ||||||||
1101 | S_1 | C_1 | M | street_1 | 173 | 63 | 34.0 | A+ |
1102 | S_1 | C_1 | F | street_2 | 192 | 73 | 32.5 | B+ |
2.排序
a)索引排序index
df.set_index('Weight').head(6)
School | Class | Gender | Address | Height | Math | Physics | |
---|---|---|---|---|---|---|---|
Weight | |||||||
63 | S_1 | C_1 | M | street_1 | 173 | 34.0 | A+ |
73 | S_1 | C_1 | F | street_2 | 192 | 32.5 | B+ |
82 | S_1 | C_1 | M | street_2 | 186 | 87.2 | B+ |
81 | S_1 | C_1 | F | street_2 | 167 | 80.4 | B- |
64 | S_1 | C_1 | F | street_4 | 159 | 84.8 | B+ |
68 | S_1 | C_2 | M | street_5 | 188 | 97.0 | A- |
</div>
df.set_index('Weight').sort_index(ascending=False).head(3) #ascending=False为降序,默认是升序
School | Class | Gender | Address | Height | Math | Physics | |
---|---|---|---|---|---|---|---|
Weight | |||||||
100 | S_2 | C_2 | M | street_5 | 193 | 39.1 | B |
99 | S_2 | C_3 | F | street_7 | 190 | 65.9 | C |
97 | S_2 | C_1 | F | street_5 | 159 | 72.2 | B+ |
b)值排序values
df.sort_values(by='Height').head(3)
School | Class | Gender | Address | Height | Weight | Math | Physics | |
---|---|---|---|---|---|---|---|---|
ID | ||||||||
2203 | S_2 | C_2 | M | street_4 | 155 | 91 | 73.8 | A+ |
2103 | S_2 | C_1 | M | street_4 | 157 | 61 | 52.5 | B- |
2301 | S_2 | C_3 | F | street_4 | 157 | 78 | 72.3 | B+ |
df.sort_values(by=['Height','Weight'],ascending=False).head(4) #多值的排序,先对第一次层排序,在第一层相同的情况下,再对下一层排序
School | Class | Gender | Address | Height | Weight | Math | Physics | |
---|---|---|---|---|---|---|---|---|
ID | ||||||||
1304 | S_1 | C_3 | M | street_2 | 195 | 70 | 85.2 | A |
2202 | S_2 | C_2 | F | street_7 | 194 | 77 | 68.5 | B+ |
2201 | S_2 | C_2 | M | street_5 | 193 | 100 | 39.1 | B |
2405 | S_2 | C_4 | F | street_6 | 193 | 54 | 47.6 | B |
3.重复值
a)duplicated 告诉我们是否重复,以布尔值的形式告知
df.duplicated('Address').head(6) #True和False指的是:一个数值,自上而下,若第一次出现,为False;若之后再次出现(可多次出现),则为True
ID 1101 False 1102 False 1103 True 1104 True 1105 False 1201 False dtype: bool
df.duplicated('Class',keep='last').tail(8)
ID 2303 True 2304 True 2305 False 2401 True 2402 True 2403 True 2404 True 2405 False dtype: bool
df.tail(8)
School | Class | Gender | Address | Height | Weight | Math | Physics | |
---|---|---|---|---|---|---|---|---|
ID | ||||||||
2303 | S_2 | C_3 | F | street_7 | 190 | 99 | 65.9 | C |
2304 | S_2 | C_3 | F | street_6 | 164 | 81 | 95.5 | A- |
2305 | S_2 | C_3 | M | street_4 | 187 | 73 | 48.9 | B |
2401 | S_2 | C_4 | F | street_2 | 192 | 62 | 45.3 | A |
2402 | S_2 | C_4 | M | street_7 | 166 | 82 | 48.7 | B |
2403 | S_2 | C_4 | F | street_6 | 158 | 60 | 59.7 | B+ |
2404 | S_2 | C_4 | F | street_2 | 160 | 84 | 67.7 | B |
2405 | S_2 | C_4 | F | street_6 | 193 | 54 | 47.6 | B |
b)drop_duplicates
单个字段重复值处理
df['Class'].value_counts()
C_1 10 C_3 10 C_2 10 C_4 5 Name: Class, dtype: int64
df.drop_duplicates('Class') #保留删除重复值后的唯一项
School | Class | Gender | Address | Height | Weight | Math | Physics | |
---|---|---|---|---|---|---|---|---|
ID | ||||||||
1101 | S_1 | C_1 | M | street_1 | 173 | 63 | 34.0 | A+ |
1201 | S_1 | C_2 | M | street_5 | 188 | 68 | 97.0 | A- |
1301 | S_1 | C_3 | M | street_4 | 161 | 68 | 31.5 | B+ |
2401 | S_2 | C_4 | F | street_2 | 192 | 62 | 45.3 | A |
df.drop_duplicates('Class',keep='last') #last的意思是:保留重复值中最后出现的,first与之相反
School | Class | Gender | Address | Height | Weight | Math | Physics | |
---|---|---|---|---|---|---|---|---|
ID | ||||||||
2105 | S_2 | C_1 | M | street_4 | 170 | 81 | 34.2 | A |
2205 | S_2 | C_2 | F | street_7 | 183 | 76 | 85.4 | B |
2305 | S_2 | C_3 | M | street_4 | 187 | 73 | 48.9 | B |
2405 | S_2 | C_4 | F | street_6 | 193 | 54 | 47.6 | B |
</div>
多个字段重复值处理
df.drop_duplicates(['School','Class']) #多列的重复值,在这里可以联想为多层的索引
School | Class | Gender | Address | Height | Weight | Math | Physics | |
---|---|---|---|---|---|---|---|---|
ID | ||||||||
1101 | S_1 | C_1 | M | street_1 | 173 | 63 | 34.0 | A+ |
1201 | S_1 | C_2 | M | street_5 | 188 | 68 | 97.0 | A- |
1301 | S_1 | C_3 | M | street_4 | 161 | 68 | 31.5 | B+ |
2101 | S_2 | C_1 | M | street_7 | 174 | 84 | 83.3 | C |
2201 | S_2 | C_2 | M | street_5 | 193 | 100 | 39.1 | B |
2301 | S_2 | C_3 | F | street_4 | 157 | 78 | 72.3 | B+ |
2401 | S_2 | C_4 | F | street_2 | 192 | 62 | 45.3 | A |
</div>
df.drop_duplicates(['School','Class'],keep='last')
School | Class | Gender | Address | Height | Weight | Math | Physics | |
---|---|---|---|---|---|---|---|---|
ID | ||||||||
1105 | S_1 | C_1 | F | street_4 | 159 | 64 | 84.8 | B+ |
1205 | S_1 | C_2 | F | street_6 | 167 | 63 | 68.4 | B- |
1305 | S_1 | C_3 | F | street_5 | 187 | 69 | 61.7 | B- |
2105 | S_2 | C_1 | M | street_4 | 170 | 81 | 34.2 | A |
2205 | S_2 | C_2 | F | street_7 | 183 | 76 | 85.4 | B |
2305 | S_2 | C_3 | M | street_4 | 187 | 73 | 48.9 | B |
2405 | S_2 | C_4 | F | street_6 | 193 | 54 | 47.6 | B |
4.缺失值
a)了解缺失的信息
qs_data = pd.read_csv('table_missing.csv') qs_data.head()
School | Class | ID | Gender | Address | Height | Weight | Math | Physics | |
---|---|---|---|---|---|---|---|---|---|
0 | S_1 | C_1 | NaN | M | street_1 | 173 | NaN | 34.0 | A+ |
1 | S_1 | C_1 | NaN | F | street_2 | 192 | NaN | 32.5 | B+ |
2 | S_1 | C_1 | 1103.0 | M | street_2 | 186 | NaN | 87.2 | B+ |
3 | S_1 | NaN | NaN | F | street_2 | 167 | 81.0 | 80.4 | NaN |
4 | S_1 | C_1 | 1105.0 | NaN | street_4 | 159 | 64.0 | 84.8 | A- |
isna
qs_data['Physics'].isna().head()
0 False 1 False 2 False 3 True 4 False Name: Physics, dtype: bool
qs_data.isna().sum() #通过以上操作能得出每列有多少缺失值
School 0 Class 4 ID 6 Gender 7 Address 0 Height 0 Weight 13 Math 5 Physics 4 dtype: int64
info
qs_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 35 entries, 0 to 34 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 School 35 non-null object 1 Class 31 non-null object 2 ID 29 non-null float64 3 Gender 28 non-null object 4 Address 35 non-null object 5 Height 35 non-null int64 6 Weight 22 non-null float64 7 Math 30 non-null float64 8 Physics 31 non-null object dtypes: float64(3), int64(1), object(5) memory usage: 2.6+ KB
b)查看缺失值所在行
qs_data[qs_data['Physics'].isna()]
School | Class | ID | Gender | Address | Height | Weight | Math | Physics | |
---|---|---|---|---|---|---|---|---|---|
3 | S_1 | NaN | NaN | F | street_2 | 167 | 81.0 | 80.4 | NaN |
8 | S_1 | C_2 | 1204.0 | F | street_5 | 162 | 63.0 | 33.8 | NaN |
13 | S_1 | C_3 | 1304.0 | NaN | street_2 | 195 | 70.0 | 85.2 | NaN |
22 | S_2 | C_2 | 2203.0 | M | street_4 | 155 | 91.0 | 73.8 | NaN |
c)选出所有缺失或者非缺失列
qs_data[qs_data.notna().all(1)]
School | Class | ID | Gender | Address | Height | Weight | Math | Physics | |
---|---|---|---|---|---|---|---|---|---|
5 | S_1 | C_2 | 1201.0 | M | street_5 | 159 | 68.0 | 97.0 | A- |
6 | S_1 | C_2 | 1202.0 | F | street_4 | 176 | 94.0 | 63.5 | B- |
12 | S_1 | C_3 | 1303.0 | M | street_7 | 188 | 82.0 | 49.7 | B |
17 | S_2 | C_1 | 2103.0 | M | street_4 | 157 | 61.0 | 52.5 | B- |
21 | S_2 | C_2 | 2202.0 | F | street_7 | 194 | 77.0 | 68.5 | B+ |
25 | S_2 | C_3 | 2301.0 | F | street_4 | 157 | 78.0 | 72.3 | B+ |
27 | S_2 | C_3 | 2303.0 | F | street_7 | 190 | 99.0 | 65.9 | C |
28 | S_2 | C_3 | 2304.0 | F | street_6 | 164 | 81.0 | 95.5 | A- |
29 | S_2 | C_3 | 2305.0 | M | street_4 | 187 | 73.0 | 48.9 | B |
</div>
d)数据的填充或放置或删除
fillna
qs_data['Physics'].fillna('2333').head(4)
0 A+ 1 B+ 2 B+ 3 2333 Name: Physics, dtype: object
qs_data['Physics'].fillna(method='ffill').head(4) #用缺失值所在行的上一行的值进行填充
0 A+ 1 B+ 2 B+ 3 B+ Name: Physics, dtype: object
qs_data['Physics'].fillna(method='backfill').head(4) #用缺失值所在行的下一行的值进行填充
0 A+ 1 B+ 2 B+ 3 A- Name: Physics, dtype: object
qs_data['Weight'].fillna(qs_data['Weight'].mean()).head() #填充还可以使用平均值、中位数、众数等 #前提,已知的缺失数据量不能较大
0 76.863636 1 76.863636 2 76.863636 3 81.000000 4 64.000000 Name: Weight, dtype: float64
dropna
qs_data.dropna(axis=1)
qs_data.dropna()
School | Class | ID | Gender | Address | Height | Weight | Math | Physics | |
---|---|---|---|---|---|---|---|---|---|
5 | S_1 | C_2 | 1201.0 | M | street_5 | 159 | 68.0 | 97.0 | A- |
6 | S_1 | C_2 | 1202.0 | F | street_4 | 176 | 94.0 | 63.5 | B- |
12 | S_1 | C_3 | 1303.0 | M | street_7 | 188 | 82.0 | 49.7 | B |
17 | S_2 | C_1 | 2103.0 | M | street_4 | 157 | 61.0 | 52.5 | B- |
21 | S_2 | C_2 | 2202.0 | F | street_7 | 194 | 77.0 | 68.5 | B+ |
25 | S_2 | C_3 | 2301.0 | F | street_4 | 157 | 78.0 | 72.3 | B+ |
27 | S_2 | C_3 | 2303.0 | F | street_7 | 190 | 99.0 | 65.9 | C |
28 | S_2 | C_3 | 2304.0 | F | street_6 | 164 | 81.0 | 95.5 | A- |
29 | S_2 | C_3 | 2305.0 | M | street_4 | 187 | 73.0 | 48.9 | B |
qs_data.head(8)
School | Class | ID | Gender | Address | Height | Weight | Math | Physics | |
---|---|---|---|---|---|---|---|---|---|
0 | S_1 | C_1 | NaN | M | street_1 | 173 | NaN | 34.0 | A+ |
1 | S_1 | C_1 | NaN | F | street_2 | 192 | NaN | 32.5 | B+ |
2 | S_1 | C_1 | 1103.0 | M | street_2 | 186 | NaN | 87.2 | B+ |
3 | S_1 | NaN | NaN | F | street_2 | 167 | 81.0 | 80.4 | NaN |
4 | S_1 | C_1 | 1105.0 | NaN | street_4 | 159 | 64.0 | 84.8 | A- |
5 | S_1 | C_2 | 1201.0 | M | street_5 | 159 | 68.0 | 97.0 | A- |
6 | S_1 | C_2 | 1202.0 | F | street_4 | 176 | 94.0 | 63.5 | B- |
7 | S_1 | C_2 | NaN | M | street_6 | 160 | 53.0 | 58.8 | A+ |
小石小石摩西摩西的学习笔记,欢迎提问,欢迎指正!!!