python数据分析学习1

1.pandas库用于数据处理
numpy库用于数值计算(数组与矩阵运算)
matplotlib.pyplot库用于统计图
2.导入数据
fileDf=pd.read_csv('文件名.csv')
3.处理缺失值
fileDf.fillna(method='ffill',inplace=True)
对缺失值进行填充,用上一行填充
method参数的取值 : {‘pad’, ‘ffill’,‘backfill’, ‘bfill’, None}, default None

pad/ffill:用前一个非缺失值去填充该缺失值

backfill/bfill:用下一个非缺失值填充该缺失值

None:指定一个值去替换缺失值(缺省默认这种方式)

inplace参数的取值:True、False

True:直接修改原对象

False:创建一个副本,修改副本,原对象不变(缺省默认)
 例子:分析泰坦尼克号
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Titanic survival analysis: load the passenger table, forward-fill gaps,
# then draw three panels (age group, sex, passenger class) comparing the
# number of survivors and non-survivors.

fileDf = pd.read_csv('titanic2.csv')  # load the data set

# Forward-fill missing values: each gap takes the value from the row above.
# DataFrame.ffill() is the direct replacement for fillna(method='ffill'),
# whose `method` argument is deprecated in recent pandas releases.
fileDf.ffill(inplace=True)

del fileDf['SexCode']  # drop a column this analysis does not use

fig = plt.figure(figsize=(10, 8))  # overall canvas size, in inches

# --- Panel 1 (top-left): survival counts per age group -------------------
p1 = fig.add_subplot(221)
bins = [0, 12, 60, 100]  # age bin edges
bins_label = ["under 12", 'between 12 and 60', 'above 60']  # one label per bin
# Bucket the ages; pd.cut uses right-closed intervals by default.
fileDf['Age_cut'] = pd.cut(fileDf['Age'], bins=bins, labels=bins_label)
# Count passengers per age bucket, separately for each outcome.
# (Original author's observation: children were rescued at a noticeably
# higher rate than adults and the elderly.)
Age_0 = fileDf.loc[fileDf['Survived'] == 0, 'Age_cut'].value_counts()
Age_1 = fileDf.loc[fileDf['Survived'] == 1, 'Age_cut'].value_counts()
df_age = pd.DataFrame({'survived': Age_1, 'dead': Age_0})  # one column per outcome
df_age.plot(kind='barh', ax=p1)  # horizontal bar chart
# Set the title on the panel's own Axes instead of relying on pyplot's
# implicit "current axes".
p1.set_title('Survived in age')

# --- Panel 2 (top-right): survival counts per sex ------------------------
p2 = fig.add_subplot(222)
s_d = fileDf.loc[fileDf['Survived'] == 0, 'Sex'].value_counts()
s_s = fileDf.loc[fileDf['Survived'] == 1, 'Sex'].value_counts()
sex = pd.DataFrame({'dead': s_d, 'survived': s_s})
sex.plot(kind='bar', stacked=True, ax=p2)
p2.set_title('Sex in survived')

# --- Panel 3 (bottom-left): survival counts per passenger class ----------
p3 = fig.add_subplot(223)
pclass_0 = fileDf.loc[fileDf['Survived'] == 0, 'PClass'].value_counts()
pclass_1 = fileDf.loc[fileDf['Survived'] == 1, 'PClass'].value_counts()
pclass = pd.DataFrame({'dead': pclass_0, 'survived': pclass_1})
pclass.plot(kind='bar', stacked=True, ax=p3)
p3.set_title('PClass in survived')


plt.show()
#print(fileDf['Survived'].value_counts())

#print(fileDf.head(10))#输出前n行
#fileDf.to_csv('df.csv')#导出文件
#print(fileDf.info())#输出基本信息

 

posted @ 2021-11-17 20:51  vicsee  阅读(59)  评论(0)  编辑  收藏  举报