11-21

念念不忘 必有回响

马拉松数据的探索(matplotlib)

#马拉松数据随堂练习
import json
import pandas as pd
import numpy as np
!curl -O https://raw.githubusercontent.com/jakevdp/marathon-data/master/marathon-data.csv # a simple way to download the data
# Load the downloaded CSV with default parsing (no converters).
data=pd.read_csv("marathon-data.csv")
data.head()  # show the first five rows

Out[4]:
 age gender split final
0 33 M 1:05:38 2:08:51
1 32 M 1:06:26 2:09:28
2 31 M 1:06:49 2:10:42
3 38 M 1:06:16 2:13:45
4 31 M 1:06:32 2:13:59
In [15]:
data.shape  # number of rows and columns

Out[15]:

(18371, 4)
In [18]:
data.dtypes  # inspect the dtype of every column
Out[18]:
age        int64
gender    object
split     object
final     object
dtype: object
In [24]:
data.tail()  # show the last five rows

Out[24]:

 age gender split final
18366 33 W 02:05:40 04:43:23
18367 48 M 01:59:04 04:43:23
18368 44 M 02:05:47 04:43:23
18369 32 W 02:11:40 04:43:23
18370 3 NaN NaN NaN
In [25]:
data.index  # inspect the row index
Out[25]:
RangeIndex(start=0, stop=18371, step=1)
In [26]:
data.values  # inspect the underlying values as an array
Out[26]:
array([[33, 'M', '01:05:38', '02:08:51'],
       [32, 'M', '01:06:26', '02:09:28'],
       [31, 'M', '01:06:49', '02:10:42'],
       ...,
       [44, 'M', '02:05:47', '04:43:23'],
       [32, 'W', '02:11:40', '04:43:23'],
       [3, nan, nan, nan]], dtype=object)
In [30]:
data.columns  # inspect the column names
Out[30]:
Index(['age', 'gender', 'split', 'final'], dtype='object')
In [32]:
# Print the index range, per-column non-null counts, dtypes and memory usage.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18371 entries, 0 to 18370
Data columns (total 4 columns):
age       18371 non-null int64
gender    18370 non-null object
split     18370 non-null object
final     18370 non-null object
dtypes: int64(1), object(3)
memory usage: 574.2+ KB
In [33]:
# Summary statistics; only the numeric `age` column shows up at this point.
data.describe()
Out[33]:
 age
count 18371.000000
mean 39.662675
std 9.210472
min 3.000000
25% 33.000000
50% 40.000000
75% 46.000000
max 78.000000
In [76]:
# Re-check the dtypes: split/final are still plain `object` columns here.
data.dtypes
Out[76]:
age        int64
gender    object
split     object
final     object
dtype: object
In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

In [5]:

def convert_time(s):
    """Convert an ``H:MM:SS`` string into a ``pandas.Timedelta``.

    Intended as a ``converters=`` callback for ``pd.read_csv`` so that whole
    time columns are parsed in bulk while the file is loaded.

    Parameters
    ----------
    s : str
        Time text such as ``"01:05:38"`` (hours:minutes:seconds).

    Returns
    -------
    pandas.Timedelta or pandas.NaT
        The parsed duration, or ``pd.NaT`` when the field is missing or
        blank, so a row with empty time fields no longer aborts the load.

    Raises
    ------
    ValueError
        If a non-blank string is not three ``:``-separated integers.
    """
    # read_csv hands converters an empty string for a missing field.
    if not isinstance(s, str) or not s.strip():
        return pd.NaT
    hours, minutes, seconds = map(int, s.split(":"))
    # pd.datetools was removed from pandas; pd.Timedelta is the supported
    # replacement and is itself a datetime.timedelta subclass.
    return pd.Timedelta(hours=hours, minutes=minutes, seconds=seconds)

In [6]:

# Reload the CSV, parsing both time columns with convert_time on the way in.
# Author's note: this cell failed repeatedly with an "int cannot be
# converted" error. The cause was the last row of the dataset, whose last
# three fields are empty; after deleting that row the load ran normally.
data = pd.read_csv(
    'marathon-data.csv',
    converters=dict.fromkeys(('split', 'final'), convert_time),
)
data.head()

Out[6]:

 age gender split final
0 33 M 01:05:38 02:08:51
1 32 M 01:06:26 02:09:28
2 31 M 01:06:49 02:10:42
3 38 M 01:06:16 02:13:45
4 31 M 01:06:32 02:13:59
 
 
In [6]:
# Confirm that split/final are now timedelta64[ns] columns.
data.dtypes

Out[6]:

age                 int64
gender             object
split     timedelta64[ns]
final     timedelta64[ns]
dtype: object
In [7]:
def transfor_time(tt):
    """Return the duration ``tt`` expressed as a number of seconds.

    Plain seconds are easier to hand to seaborn for plotting than
    timedelta values.

    Parameters
    ----------
    tt : timedelta-like
        Any object exposing ``total_seconds()``.

    Returns
    -------
    float
        Whatever ``tt.total_seconds()`` reports.
    """
    seconds = tt.total_seconds()
    return seconds
# Add two columns holding the split and final times in seconds, which is
# convenient for plotting with seaborn. Both source columns are
# timedelta64[ns], so the vectorised .dt accessor replaces a Python-level
# call per row.
data['split_sec'] = data['split'].dt.total_seconds()
data['final_sec'] = data['final'].dt.total_seconds()
data.head()

Out[7]:

 age gender split final split_sec final_sec
0 33 M 01:05:38 02:08:51 3938.0 7731.0
1 32 M 01:06:26 02:09:28 3986.0 7768.0
2 31 M 01:06:49 02:10:42 4009.0 7842.0
3 38 M 01:06:16 02:13:45 3976.0 8025.0
4 31 M 01:06:32 02:13:59 3992.0 8039.0
In [41]:
# Joint distribution of split vs. final time, drawn as a KDE.
# x/y are passed by keyword: recent seaborn releases reject positional use.
sns.jointplot(x="split_sec", y="final_sec", data=data, kind="kde")
Out[41]:
<seaborn.axisgrid.JointGrid at 0x195e3fcb160>
 
In [8]:
# Joint distribution of split vs. final time, drawn with hexagonal bins.
# x/y are passed by keyword: recent seaborn releases reject positional use.
sns.jointplot(x="split_sec", y="final_sec", data=data, kind="hex")
Out[8]:
<seaborn.axisgrid.JointGrid at 0x2f97a33d668>
 
In [11]:
# Draw the hexbin joint plot under the "white" theme and overlay a dotted
# black reference line. The line runs from (4000, 8000) to (16000, 32000),
# i.e. final = 2 * split, where a perfectly even-paced runner would fall.
with sns.axes_style("white"):
    g = sns.jointplot(x="split_sec", y="final_sec", data=data, kind="hex")
    g.ax_joint.plot(np.linspace(4000, 16000),
                    np.linspace(8000, 32000), ":k")

 

In [23]:
# Add a column measuring the difference between the two halves of the race:
# 1 - 2 * split / final. A value below zero means the first half took more
# than half of the total time, i.e. the runner sped up in the second half.
data["split_feac"] = data["split_sec"].mul(2).div(data["final_sec"]).rsub(1)
data.head()

 

Out[23]:
 age gender split final split_sec final_sec split_feac
0 33 M 01:05:38 02:08:51 3938.0 7731.0 -0.018756
1 32 M 01:06:26 02:09:28 3986.0 7768.0 -0.026262
2 31 M 01:06:49 02:10:42 4009.0 7842.0 -0.022443
3 38 M 01:06:16 02:13:45 3976.0 8025.0 0.009097
4 31 M 01:06:32 02:13:59 3992.0 8039.0 0.006842
In [23]:
 sns.distplot(data["split_feac"],kde=False)#频次直方图
plt.axvline(0,color="r",linestyle="--")#设置分割线
Out[23]:
<matplotlib.lines.Line2D at 0x2f97bf7a358>
 
In [24]:
# Count the runners with a negative split fraction. The author's takeaway
# from the histogram: very few runners speed up in the second half -- only
# 231 in the recorded run.
(data.split_feac < 0).sum()
Out[24]:
231
In [12]:
# Scatter-plot matrix of age, both times and the split fraction,
# coloured by gender.
g = sns.PairGrid(
    data,
    vars=["age", "split_sec", "final_sec", "split_feac"],
    hue="gender",
    palette="RdBu_r",
)
g.map(plt.scatter, alpha=0.8)
g.add_legend()
Out[12]:
<seaborn.axisgrid.PairGrid at 0x1c22fced4a8>
 
In [29]:
 
# Compare the split-fraction distributions of male and female runners.
# Author's observation from the plot: the men's curve is much higher.
sns.kdeplot(data.split_feac[data.gender == "M"], label="men", shade=True)
sns.kdeplot(data.split_feac[data.gender == "W"], label="women", shade=True)
# Axis label fixed from the typo "splot_frac" to the plotted column's name.
plt.xlabel("split_feac")
plt.legend()
Out[29]:
<matplotlib.legend.Legend at 0x1c231c599b0>
 
In [46]:
%time sns.violinplot("gender","split_feac",data=data,palette=["lightblue","lightpink"])
#用小提请图对比不同性别的差异函数分布
 
Wall time: 296 ms
Out[46]:
<matplotlib.axes._subplots.AxesSubplot at 0x1c2321574a8>
 
In [32]:
# Add a column with each runner's age bracket, one bracket per ten years
# (33 -> 30, 48 -> 40). Vectorised floor division replaces a per-row lambda.
data["age_dec"] = 10 * (data.age // 10)
data.head()
 
Out[32]:
 age gender split final split_sec final_sec split_feac age_dec
0 33 M 01:05:38 02:08:51 3938.0 7731.0 -0.018756 30
1 32 M 01:06:26 02:09:28 3986.0 7768.0 -0.026262 30
2 31 M 01:06:49 02:10:42 4009.0 7842.0 -0.022443 30
3 38 M 01:06:16 02:13:45 3976.0 8025.0 0.009097 30
4 31 M 01:06:32 02:13:59 3992.0 8039.0 0.006842 30
In [42]:
 
# Boolean masks per gender. Nothing in this cell uses them; they are kept
# in case other cells rely on the names.
men = (data.gender == "M")
women = (data.gender == "W")

# Split violins per age decade: one half per gender, quartile lines inside.
# The original passed `aplit=True`, a typo for `split=True`, so the violins
# were never split. A bare `sns.axes_style(style=None)` call only returns a
# style dict, so it is used as a context manager here.
with sns.axes_style(style=None):
    sns.violinplot(x="age_dec", y="split_feac", hue="gender", data=data,
                   split=True, inner="quartile",
                   palette=["lightblue", "lightpink"])
Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x1c231f88710>
 
In [45]:
 
# Regression plot of split fraction against finishing time, one facet per
# gender. x/y are passed by keyword: recent seaborn releases reject
# positional use.
g = sns.lmplot(x="final_sec", y="split_feac", col="gender", data=data,
               markers=".", scatter_kws=dict(color="c"))
# Dotted horizontal reference line at a split fraction of 0.1 on each facet.
g.map(plt.axhline, y=0.1, color="k", ls=":")
plt.xlim(8000, 40000)
plt.ylim(-0.2, 0.5)
Out[45]:
(-0.2, 0.5)
 
In [ ]:
# Quick reference of seaborn calls; x, y and data are placeholders and this
# cell was never executed.
import seaborn as sns # a plotting package with cleaner visuals and a simpler API
sns.kdeplot(x)# KDE plot, used to check whether a variable's distribution is smooth
sns.distplot(x)# frequency histogram combined with a KDE curve
sns.jointplot(x,y,data)# joint distribution plot, shows how two variables relate
sns.pairplot(data)# pair-plot matrix, shows pairwise relationships across the dataset
sns.boxplot()# box plot, shows how variables affect one another; mind the KIND parameter when using it
# NOTE(review): the KIND remark comes from the original author; confirm it applies to boxplot.
# See the help documentation for the detailed parameters.
sns.violinplot()# violin plot
 

posted on 2019-11-21 23:19  11-21  阅读(563)  评论(0)  编辑  收藏  举报

导航