数据特征分析-分布分析
分布分析用于研究数据的分布特征,常用分析方法:
1、极差
2、频率分布
3、分组组距及组数
# Sample housing data set used by every analysis below (15 listings).
# Columns: 编码 = listing code, 小区 = residential community,
# 朝向 = orientation, 单价 = unit price, 首付 = down payment,
# 总价 = total price, 经度 = longitude, 纬度 = latitude.
df = pd.DataFrame({
    '编码': ['001', '002', '003', '004', '005', '006', '007', '008',
           '009', '010', '011', '012', '013', '014', '015'],
    '小区': ['A村', 'B村', 'C村', 'D村', 'E村', 'A村', 'B村', 'C村',
           'D村', 'E村', 'A村', 'B村', 'C村', 'D村', 'E村'],
    # Orientation labels must be spelled consistently, otherwise the
    # frequency analysis counts one direction as two categories.
    '朝向': ['south', 'east_north', 'south', 'east_south', 'east_south',
           'north', 'east_north', 'west_north', 'south', 'west',
           'north', 'east_north', 'south', 'south', 'east'],
    '单价': [7374, 6435, 6643, 5874, 6738, 6453, 5733, 6034,
           5276, 5999, 6438, 5864, 6099, 5699, 6999],
    '首付': [15, 7.5, 18, 10, 30, 10, 18, 30, 40, 30, 20, 22, 29, 30, 40],
    '总价': [50, 65, 68, 73, 80, 55, 45, 70, 59, 57, 40, 60, 50, 48, 60],
    '经度': [114.0, 114.6, 114.8, 114.2, 114.5, 114.3, 114.4, 114.7,
           114.9, 114.1, 114.8, 114.2, 114.5, 114.3, 114.8],
    '纬度': [22.0, 22.4, 22.6, 22.8, 22.2, 22.1, 22.7, 22.5,
           22.9, 22.3, 22.8, 22.2, 22.1, 22.7, 22.5],
})
先对总体做关于经纬度的散点图
# Scatter plot of the listings by longitude/latitude.
# Marker size encodes unit price (larger marker = higher unit price);
# colour depth encodes total price (darker = higher total price).
plt.scatter(
    x=df['经度'],
    y=df['纬度'],
    s=df['单价'] / 50,
    c=df['总价'],
    cmap='Greens',
)
求总价、单价和首付的极差
def d_range(df, *cols):
    """Return a text report of the range (max - min) of the given columns.

    Args:
        df: DataFrame that contains the columns to summarise.
        *cols: Names of the columns to report on, in output order.

    Returns:
        One ``<column>极差:<range>`` line per requested column, joined with
        newlines. An empty string is returned when no column is given.
    """
    # Build one line per column rather than indexing three fixed slots, so
    # the report works for any number of columns.
    return '\n'.join(
        '%s极差:%s' % (c, df[c].max() - df[c].min()) for c in cols
    )


print(d_range(df, '总价', '单价', '首付'))
# Expected output:
# 总价极差:40
# 单价极差:2098
# 首付极差:32.5
单价和总价的频率分布
# Side-by-side histograms (8 bins each): unit price on the left,
# total price on the right.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
for ax, col in zip(axes, ('单价', '总价')):
    df[col].hist(bins=8, ax=ax)
将总价分为8个区间,求出每个区间的频数、频率,并求出累计频率
# Frequency distribution of total price, grouped into intervals.

# Cut the total price into 8 intervals.
total_range = pd.cut(df['总价'], 8)

# Count the rows per interval; sort=False means the result is not sorted
# by count.
total_range_count = total_range.value_counts(sort=False)

# Turn the counts Series into a one-column summary DataFrame.
total_range_s = total_range_count.to_frame(name='频数')

# Record on the original data which interval each row falls into.
df['区间'] = total_range.values

# Relative frequency of each interval and its running total.
total_range_s['频率'] = total_range_s['频数'] / total_range_s['频数'].sum()
total_range_s['累计频率'] = total_range_s['频率'].cumsum()

# Human-readable copies of both ratio columns, as percentages with two
# decimals.
for ratio_col in ('频率', '累计频率'):
    total_range_s[ratio_col + '%'] = total_range_s[ratio_col].apply(
        lambda ratio: '%.2f%%' % (100 * ratio)
    )

# Styled table with in-cell bars for the two numeric ratio columns
# (presumably shown as the notebook cell output).
total_range_s.style.bar(subset=['频率', '累计频率'])
对每个总价区间出现的频率做柱状图,并在每根柱子上方标注该区间的频数
# Bar chart of the frequency of each total-price interval.
total_range_s['频率'].plot(kind='bar', alpha=0.8, title='total price interval')

# Write the raw count slightly above the top of each bar.
for pos, (freq, count) in enumerate(
    zip(total_range_s['频率'], total_range_s['频数'])
):
    plt.text(pos, freq + 0.01, count)
对于定性字段(比如朝向),做频率统计分析
# Frequency distribution of a qualitative field (orientation).

# Number of listings per orientation value.
cx = df['朝向'].value_counts()

# One-column summary DataFrame holding the counts.
cx_s = cx.to_frame(name='频数')

# Relative frequency of each orientation and its running total.
cx_s['频率'] = cx_s['频数'] / cx_s['频数'].sum()
cx_s['累计频率'] = cx_s['频率'].cumsum()

# Human-readable copies of both ratio columns, as percentages with two
# decimals.
for ratio_col in ('频率', '累计频率'):
    cx_s[ratio_col + '%'] = cx_s[ratio_col].apply(
        lambda ratio: '%.2f%%' % (100 * ratio)
    )

# Styled table with in-cell bars for the two numeric ratio columns
# (presumably shown as the notebook cell output).
cx_s.style.bar(subset=['频率', '累计频率'])
对朝向做柱状图和饼图
# Orientation shown two ways: a frequency bar chart on the left and a
# pie chart of the counts on the right.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
cx_s['频率'].plot(kind='bar', ax=axes[0], title='direction bar')

# Draw the pie on the right-hand axes explicitly instead of relying on
# pyplot's implicit "current axes" state.
axes[1].pie(cx_s['频数'], labels=cx_s.index, autopct='%2.f%%')
axes[1].set_title('direction pie')