【小睿的ML之路】Pandas数据预处理(含泰坦尼克号数据分析案例实战)
import pandas
food_info = pandas.read_csv("food_info.csv",encoding="gbk")
print(food_info)
名称 价格(元) 糖分(g) 重量(kg) 含水量(mg)
0 苹果 200 20 10 30
1 香蕉 100 50 30 70
2 梨 10 20 30 40
# 使用 sort_values() 排序,不修改原始数据
sorted_info = food_info.sort_values("价格(元)")
print("Sorted DataFrame (Not Inplace):")
print(sorted_info)
# 原始数据不受影响
print("\nOriginal DataFrame:")
print(food_info)
# 使用 sort_values() 排序,并修改原始数据
food_info.sort_values("价格(元)", inplace=True)
print("\nSorted DataFrame (Inplace):")
print(food_info)
Sorted DataFrame (Not Inplace):
名称 价格(元) 糖分(g) 重量(kg) 含水量(mg)
2 梨 10 20 30 40
1 香蕉 100 50 30 70
0 苹果 200 20 10 30
Original DataFrame:
名称 价格(元) 糖分(g) 重量(kg) 含水量(mg)
0 苹果 200 20 10 30
1 香蕉 100 50 30 70
2 梨 10 20 30 40
Sorted DataFrame (Inplace):
名称 价格(元) 糖分(g) 重量(kg) 含水量(mg)
2 梨 10 20 30 40
1 香蕉 100 50 30 70
0 苹果 200 20 10 30
# ascending=True,从小到大,反之从大到小
food_info.sort_values("价格(元)", inplace=True, ascending=False)
print(food_info)
名称 价格(元) 糖分(g) 重量(kg) 含水量(mg)
0 苹果 200 20 10 30
1 香蕉 100 50 30 70
2 梨 10 20 30 40
# 泰坦尼克号数据分析案例实战
import numpy as np
import pandas as pd
titanic_survival = pd.read_csv("titanic_train.csv")
# 舱位纬度
# pclass 舱位(1头等舱、2二等舱、3三等舱)
# cabin 客舱号
# survived 是否生还,0死亡,1生还
# 乘客纬度
# name姓名、sex性别、age年龄、sibsp兄弟姐妹/配偶数量
# parch父母/小孩数量
# 船票纬度
# ticket 船票信息
# fare 票价
# 地区纬度
# embarked 登船港口
print(titanic_survival.head())
PassengerId Survived Pclass \
0 1 0 3
1 2 1 1
2 3 1 3
3 4 1 1
4 5 0 3
Name Sex Age SibSp \
0 Braund, Mr. Owen Harris male 22.0 1
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1
2 Heikkinen, Miss. Laina female 26.0 0
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1
4 Allen, Mr. William Henry male 35.0 0
Parch Ticket Fare Cabin Embarked
0 0 A/5 21171 7.2500 NaN S
1 0 PC 17599 71.2833 C85 C
2 0 STON/O2. 3101282 7.9250 NaN S
3 0 113803 53.1000 C123 S
4 0 373450 8.0500 NaN S
age = titanic_survival["Age"]
# print(age.loc[0:10])
age_is_null = pd.isnull(age) # 判断缺失值
# print(age_is_null)
age_null_true = age[age_is_null] # 获取缺失值的索引
# print(age_null_true)
age_null_count = len(age_null_true) #查看指定列缺失值的记录数
print(age_null_count)
177
error_average_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"]) # 求年龄平均值,若数据存在NaN,则无法统计
print(error_average_age)
nan
good_ages = titanic_survival["Age"][age_is_null == False] # 过滤掉缺失值的指定列
# print(good_age)
average_age = sum(good_ages) / len(good_ages) # 求年龄普工君子
print(average_age)
29.69911764705882
good_average_age = titanic_survival["Age"].mean() # 调用mean()函数,过滤掉缺失值的指定列,求平均值
print(good_average_age)
29.69911764705882
# 计算不同舱位的平均票价
passenager_classes = [1,2,3]
fares_by_class = {}
for this_class in passenager_classes:
pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class] # 过滤出每个舱位的数据集
pclass_fares = pclass_rows["Fare"] # 获取每个舱位数据集的票价记录
fare_for_class = pclass_fares.mean() # 过滤掉票价列的缺失值,求平均值
fares_by_class[this_class] = fare_for_class # 赋值-统计
print(fares_by_class)
{1: 84.1546875, 2: 20.662183152173913, 3: 13.675550101832993}
# 计算不同舱位的平均获救概率 pivot_table方法
passenger_survival = titanic_survival.pivot_table(index="Pclass",values="Survived",aggfunc=np.mean)
print(passenger_survival)
Survived
Pclass
1 0.629630
2 0.472826
3 0.242363
# 计算不同舱位的平均年龄,未设置aggfunc默认为求均值mean
passenger_age = titanic_survival.pivot_table(index="Pclass",values="Age")
print(passenger_age)
Age
Pclass
1 38.233441
2 29.877630
3 25.140620
# 计算不同登船港口,总票价和总生还人数
port_stats = titanic_survival.pivot_table(index="Embarked",values=["Fare","Survived"],aggfunc=np.sum)
print(port_stats)
Fare Survived
Embarked
C 10072.2962 93
Q 1022.2543 30
S 17439.3988 217
drop_na_columns = titanic_survival.dropna(axis=1) #丢失缺失列
# print(drop_na_columns)
new_titanic_survival = titanic_survival.dropna(axis=0,subset=["Age","Sex"]) #丢失缺失记录,可以指定筛选列
print(new_titanic_survival)
PassengerId Survived Pclass \
0 1 0 3
1 2 1 1
2 3 1 3
3 4 1 1
4 5 0 3
.. ... ... ...
885 886 0 3
886 887 0 2
887 888 1 1
889 890 1 1
890 891 0 3
Name Sex Age SibSp \
0 Braund, Mr. Owen Harris male 22.0 1
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1
2 Heikkinen, Miss. Laina female 26.0 0
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1
4 Allen, Mr. William Henry male 35.0 0
.. ... ... ... ...
885 Rice, Mrs. William (Margaret Norton) female 39.0 0
886 Montvila, Rev. Juozas male 27.0 0
887 Graham, Miss. Margaret Edith female 19.0 0
889 Behr, Mr. Karl Howell male 26.0 0
890 Dooley, Mr. Patrick male 32.0 0
Parch Ticket Fare Cabin Embarked
0 0 A/5 21171 7.2500 NaN S
1 0 PC 17599 71.2833 C85 C
2 0 STON/O2. 3101282 7.9250 NaN S
3 0 113803 53.1000 C123 S
4 0 373450 8.0500 NaN S
.. ... ... ... ... ...
885 5 382652 29.1250 NaN Q
886 0 211536 13.0000 NaN S
887 0 112053 30.0000 B42 S
889 0 111369 30.0000 C148 C
890 0 370376 7.7500 NaN Q
[714 rows x 12 columns]
# 定位具体值
row_index_83_age = titanic_survival.loc[83,"Age"] # 行号,列名
print(row_index_83_age)
28.0
# 按年龄排序
new_titanic_survival = titanic_survival.sort_values("Age",ascending=False)
print(new_titanic_survival[0:10])
#重置排序
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print(titanic_reindexed.loc[0:10])
PassengerId Survived Pclass Name \
630 631 1 1 Barkworth, Mr. Algernon Henry Wilson
851 852 0 3 Svensson, Mr. Johan
493 494 0 1 Artagaveytia, Mr. Ramon
96 97 0 1 Goldschmidt, Mr. George B
116 117 0 3 Connors, Mr. Patrick
672 673 0 2 Mitchell, Mr. Henry Michael
745 746 0 1 Crosby, Capt. Edward Gifford
33 34 0 2 Wheadon, Mr. Edward H
54 55 0 1 Ostby, Mr. Engelhart Cornelius
280 281 0 3 Duane, Mr. Frank
Sex Age SibSp Parch Ticket Fare Cabin Embarked
630 male 80.0 0 0 27042 30.0000 A23 S
851 male 74.0 0 0 347060 7.7750 NaN S
493 male 71.0 0 0 PC 17609 49.5042 NaN C
96 male 71.0 0 0 PC 17754 34.6542 A5 C
116 male 70.5 0 0 370369 7.7500 NaN Q
672 male 70.0 0 0 C.A. 24580 10.5000 NaN S
745 male 70.0 1 1 WE/P 5735 71.0000 B22 S
33 male 66.0 0 0 C.A. 24579 10.5000 NaN S
54 male 65.0 0 1 113509 61.9792 B30 C
280 male 65.0 0 0 336439 7.7500 NaN Q
PassengerId Survived Pclass Name Sex \
0 631 1 1 Barkworth, Mr. Algernon Henry Wilson male
1 852 0 3 Svensson, Mr. Johan male
2 494 0 1 Artagaveytia, Mr. Ramon male
3 97 0 1 Goldschmidt, Mr. George B male
4 117 0 3 Connors, Mr. Patrick male
5 673 0 2 Mitchell, Mr. Henry Michael male
6 746 0 1 Crosby, Capt. Edward Gifford male
7 34 0 2 Wheadon, Mr. Edward H male
8 55 0 1 Ostby, Mr. Engelhart Cornelius male
9 281 0 3 Duane, Mr. Frank male
10 457 0 1 Millet, Mr. Francis Davis male
Age SibSp Parch Ticket Fare Cabin Embarked
0 80.0 0 0 27042 30.0000 A23 S
1 74.0 0 0 347060 7.7750 NaN S
2 71.0 0 0 PC 17609 49.5042 NaN C
3 71.0 0 0 PC 17754 34.6542 A5 C
4 70.5 0 0 370369 7.7500 NaN Q
5 70.0 0 0 C.A. 24580 10.5000 NaN S
6 70.0 1 1 WE/P 5735 71.0000 B22 S
7 66.0 0 0 C.A. 24579 10.5000 NaN S
8 65.0 0 1 113509 61.9792 B30 C
9 65.0 0 0 336439 7.7500 NaN Q
10 65.0 0 0 13509 26.5500 E38 S
分类:
机器学习
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 分享一个免费、快速、无限量使用的满血 DeepSeek R1 模型,支持深度思考和联网搜索!
· 基于 Docker 搭建 FRP 内网穿透开源项目(很简单哒)
· ollama系列01:轻松3步本地部署deepseek,普通电脑可用
· 25岁的心里话
· 按钮权限的设计及实现