import numpy as np
import pandas as pd
# Load the passenger table (the CSV uses Chinese column headers) and
# preview its first five rows.
text = pd.read_csv("train_chinese.csv")
text.head(n=5)
| 乘客ID | 是否幸存 | 仓位等级 | 姓名 | 性别 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 船票信息 | 票价 | 客舱 | 登船港口 |
---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
---|
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
---|
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
---|
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
---|
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
---|
# Small 2x4 demo frame holding 0..7, with deliberately unsorted row and
# column labels so the sorting examples have something to reorder.
frame = pd.DataFrame(
    np.arange(8).reshape(2, 4),
    index=["three", "one"],
    columns=["d", "a", "b", "c"],
)
排序与排名
- 默认升序排列,降序排列设置参数为 ascending = False
- sort_index 默认按行索引排序,参数 axis = 1 时按列名排序
- sort_values 按值排序; sort_values(by =[]) 按照一个或者多个列名字进行排序
# Sort rows by their index labels (ascending).
frame.sort_index(axis=0)
# Sort columns by name, in descending order.
frame.sort_index(axis="columns", ascending=False)
# Sort rows by the values in column 'b', then 'c', both descending.
frame.sort_values(by=["b", "c"], ascending=False)
Series 排名
- rank 排名,对数组进行从1到有效数据总数分配名次的操作,相同数据,平均排名
- 加上参数 rank(method = 'first'),按照观察顺序排名,相同数据靠前的,排名靠前
- rank(ascending = False,method = 'max'):降序排名,相同数据取该组的最大排名
DataFrame 排名
frame.rank(axis = 'columns')
平级关系打破方法
- average 默认:在每个组中平均排名
- min 对整个组使用最小排名
- max 对整个组使用最大排名
- first 按照值在数据中出现的顺序分配排名
# Rank a small Series in descending order; tied values share the largest
# rank number of their group because method='max'.
obj = pd.Series([7, -2, 7, 4, 6, 8])
# `ascending` must be the boolean False. The string 'False' is non-empty and
# therefore truthy, which silently produced an ascending ranking instead.
obj_rank = obj.rank(ascending=False, method='max')
obj_rank
# Expected output (8 ranks first; the two 7s tie for places 2-3 and both get 3):
# 0    3.0
# 1    6.0
# 2    3.0
# 3    5.0
# 4    4.0
# 5    1.0
# dtype: float64
# Ten oldest passengers: order rows by age ('年龄') descending, keep the top 10.
text.sort_values(by="年龄", ascending=False).head(n=10)
| 乘客ID | 是否幸存 | 仓位等级 | 姓名 | 性别 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 船票信息 | 票价 | 客舱 | 登船港口 |
---|
630 | 631 | 1 | 1 | Barkworth, Mr. Algernon Henry Wilson | male | 80.0 | 0 | 0 | 27042 | 30.0000 | A23 | S |
---|
851 | 852 | 0 | 3 | Svensson, Mr. Johan | male | 74.0 | 0 | 0 | 347060 | 7.7750 | NaN | S |
---|
493 | 494 | 0 | 1 | Artagaveytia, Mr. Ramon | male | 71.0 | 0 | 0 | PC 17609 | 49.5042 | NaN | C |
---|
96 | 97 | 0 | 1 | Goldschmidt, Mr. George B | male | 71.0 | 0 | 0 | PC 17754 | 34.6542 | A5 | C |
---|
116 | 117 | 0 | 3 | Connors, Mr. Patrick | male | 70.5 | 0 | 0 | 370369 | 7.7500 | NaN | Q |
---|
672 | 673 | 0 | 2 | Mitchell, Mr. Henry Michael | male | 70.0 | 0 | 0 | C.A. 24580 | 10.5000 | NaN | S |
---|
745 | 746 | 0 | 1 | Crosby, Capt. Edward Gifford | male | 70.0 | 1 | 1 | WE/P 5735 | 71.0000 | B22 | S |
---|
33 | 34 | 0 | 2 | Wheadon, Mr. Edward H | male | 66.0 | 0 | 0 | C.A. 24579 | 10.5000 | NaN | S |
---|
54 | 55 | 0 | 1 | Ostby, Mr. Engelhart Cornelius | male | 65.0 | 0 | 1 | 113509 | 61.9792 | B30 | C |
---|
280 | 281 | 0 | 3 | Duane, Mr. Frank | male | 65.0 | 0 | 0 | 336439 | 7.7500 | NaN | Q |
---|
# Ten youngest passengers: order rows by age ('年龄') ascending, keep the top 10.
text.sort_values(by="年龄", ascending=True).head(n=10)
| 乘客ID | 是否幸存 | 仓位等级 | 姓名 | 性别 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 船票信息 | 票价 | 客舱 | 登船港口 |
---|
803 | 804 | 1 | 3 | Thomas, Master. Assad Alexander | male | 0.42 | 0 | 1 | 2625 | 8.5167 | NaN | C |
---|
755 | 756 | 1 | 2 | Hamalainen, Master. Viljo | male | 0.67 | 1 | 1 | 250649 | 14.5000 | NaN | S |
---|
644 | 645 | 1 | 3 | Baclini, Miss. Eugenie | female | 0.75 | 2 | 1 | 2666 | 19.2583 | NaN | C |
---|
469 | 470 | 1 | 3 | Baclini, Miss. Helene Barbara | female | 0.75 | 2 | 1 | 2666 | 19.2583 | NaN | C |
---|
78 | 79 | 1 | 2 | Caldwell, Master. Alden Gates | male | 0.83 | 0 | 2 | 248738 | 29.0000 | NaN | S |
---|
831 | 832 | 1 | 2 | Richards, Master. George Sibley | male | 0.83 | 1 | 1 | 29106 | 18.7500 | NaN | S |
---|
305 | 306 | 1 | 1 | Allison, Master. Hudson Trevor | male | 0.92 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S |
---|
827 | 828 | 1 | 2 | Mallet, Master. Andre | male | 1.00 | 0 | 2 | S.C./PARIS 2079 | 37.0042 | NaN | C |
---|
381 | 382 | 1 | 3 | Nakid, Miss. Maria ("Mary") | female | 1.00 | 0 | 2 | 2653 | 15.7417 | NaN | C |
---|
164 | 165 | 0 | 3 | Panula, Master. Eino Viljami | male | 1.00 | 4 | 1 | 3101295 | 39.6875 | NaN | S |
---|
从上面两个表可以看出:年龄最大的 10 名乘客中只有 1 人幸存,而年龄最小的 10 名乘客中有 9 人幸存。仅就这两组样本而言(忽略其他因素),年龄越大,存活率越低!
DataFrame的数据运算与对齐
# First operand for the alignment demo: a 3x4 frame holding 0..11,
# rows 'three'/'one'/'a', columns 'a'..'d'.
df1 = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=["three", "one", "a"],
    columns=["a", "b", "c", "d"],
)
df1
| a | b | c | d |
---|
three | 0 | 1 | 2 | 3 |
---|
one | 4 | 5 | 6 | 7 |
---|
a | 8 | 9 | 10 | 11 |
---|
# Second operand for the alignment demo: same values as a 3x4 grid of 0..11,
# but with row labels 'b'/'one'/'a' and column labels 'a'/'e'/'c'/'f'.
df2 = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=["b", "one", "a"],
    columns=["a", "e", "c", "f"],
)
df2
# Element-wise sum aligned on both row and column labels; any label present
# in only one of the two frames yields NaN in the result.
df2.add(df1)
| a | b | c | d | e | f |
---|
a | 16.0 | NaN | 20.0 | NaN | NaN | NaN |
---|
b | NaN | NaN | NaN | NaN | NaN | NaN |
---|
one | 8.0 | NaN | 12.0 | NaN | NaN | NaN |
---|
three | NaN | NaN | NaN | NaN | NaN | NaN |
---|
# Largest number of relatives aboard for any passenger: siblings count
# ('兄弟姐妹个数') plus parents/children count ('父母子女个数'), row by row.
# Series.max() is vectorised and skips NaN, whereas the builtin max() loops
# in Python and gives order-dependent results if a NaN is present.
(text['兄弟姐妹个数'] + text['父母子女个数']).max()
10
# Summary statistics (count, mean, std, min, quartiles, max) of the fare column.
text.loc[:, "票价"].describe()
count 891.000000
mean 32.204208
std 49.693429
min 0.000000
25% 7.910400
50% 14.454200
75% 31.000000
max 512.329200
Name: 票价, dtype: float64
# Summary statistics (count, mean, std, min, quartiles, max) of the
# parents/children count column.
text.loc[:, "父母子女个数"].describe()
count 891.000000
mean 0.381594
std 0.806057
min 0.000000
25% 0.000000
50% 0.000000
75% 0.000000
max 6.000000
Name: 父母子女个数, dtype: float64