数据分析第三章
1.绘制箱型图并标记异常数据
#%% Box plot with the outliers annotated
import pandas as pd
import matplotlib.pyplot as plt

# Catering sales data, indexed by the date column.
catering_sale = r"D:\py_project\a_三下\Python数据分析与挖掘实战-源代码与数据\第3章源代码\catering_sale.xls"
data = pd.read_excel(catering_sale, index_col=u'日期')

# Use a CJK-capable font and keep the minus sign renderable with it.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

plt.figure()
p = data.boxplot(return_type='dict')

# Coordinates of the outlier markers of the first box.
fliers = p['fliers'][0]
x = fliers.get_xdata()
# Sort a copy so the array handed back by the artist is not reordered in place.
y = fliers.get_ydata().copy()
y.sort()
print(x)
print(len(x))
print(y)

# Label every outlier with its value. The horizontal text offset depends on
# the gap to the previous (smaller) outlier so that close values do not
# overlap; the first outlier, and any outlier equal to its predecessor
# (which would otherwise divide by zero), use a fixed offset instead.
for i, (xi, yi) in enumerate(zip(x, y)):
    gap = yi - y[i - 1] if i > 0 else 0
    if gap != 0:
        text_x = xi + 0.05 - 0.8 / gap
    else:
        text_x = xi + 0.08
    plt.annotate(yi, xy=(xi, yi), xytext=(text_x, yi))

# Show the figure.
plt.title('3141')
plt.show()
该箱型图简洁明了地绘制出了该组数据的离群点。
2.直方图
#%% Frequency histogram of sales amounts
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Configure the CJK-capable font before any text is created.
plt.rcParams['font.sans-serif'] = ['SimHei']  # needed to render the Chinese title

catering_sale = r"D:\py_project\a_三下\Python数据分析与挖掘实战-源代码与数据\第3章源代码\catering_sale.xls"
data = pd.read_excel(catering_sale, names=['date', 'sale'])

bins = [0, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000]
labels = ['[0,500)', '[500,1000)', '[1000,1500)', '[1500,2000)',
          '[2000,2500)', '[2500,3000)', '[3000,3500)', '[3500,4000)']

# right=False makes every bin left-closed / right-open, which is what the
# "[a,b)" labels claim; the pd.cut default (right=True) would bin as (a,b].
data['sale分层'] = pd.cut(data.sale, bins, labels=labels, right=False)

# observed=False keeps bins that contain no rows, so the histogram always
# shows all eight intervals.
aggResult = data.groupby(by=['sale分层'], observed=False)['sale'].agg([("count", "count")])
# Share of each bin in percent, rounded to whole percentage points.
pAggResult = round(aggResult / aggResult.sum(), 2) * 100

plt.figure(figsize=(9, 6))  # figure size in inches
pAggResult['count'].plot(kind='bar', width=0.6, fontsize=10)  # frequency histogram
plt.title('季度销售额频率分布直方图3141', fontsize=20)
plt.show()
频率直方图揭示了数据的分布特征和分布类型。
3.饼图
#%% Pie chart
import pandas as pd
import matplotlib.pyplot as plt

# Load the per-dish profit table.
catering_dish_profit = r"D:\py_project\a_三下\Python数据分析与挖掘实战-源代码与数据\第3章源代码\catering_dish_profit.xls"
data = pd.read_excel(catering_dish_profit)

# Wedge sizes come from the profit column, wedge labels from the dish names.
x, labels = data['盈利'], data['菜品名']

plt.figure(figsize=(8, 6))
plt.pie(x, labels=labels)

# Font able to render the Chinese labels and title.
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.title('菜品销售量分布(饼图)3141')

# Equal axis scaling so the pie is drawn as a circle.
plt.axis('equal')
plt.show()
4.折线图
#%% Line chart comparing the three departments
import pandas as pd
import matplotlib.pyplot as plt

# Chinese legend entries, axis label and title need a CJK-capable font; set
# it here so this cell also renders correctly when run on its own.
plt.rcParams['font.sans-serif'] = ['SimHei']

data = pd.read_excel(r"D:\py_project\a_三下\Python数据分析与挖掘实战-源代码与数据\第3章源代码\dish_sale.xls")

plt.figure(figsize=(8, 4))

# One line per department: (column name, line colour, point marker).
series_styles = (
    ('A部门', 'green', 'o'),
    ('B部门', 'red', 's'),
    ('C部门', 'skyblue', 'x'),
)
for column, color, marker in series_styles:
    plt.plot(data['月份'], data[column], color=color, label=column, marker=marker)

plt.legend()
plt.ylabel('销售额(万元)')
plt.title('3部门之间销售额的比较3141', fontsize=20)
plt.show()
5.趋势图
#%% Trend charts of daily electricity usage
import pandas as pd
import matplotlib.pyplot as plt

# Set the CJK-capable font up front so both charts below use it.
plt.rcParams['font.sans-serif'] = ['SimHei']


def _plot_electricity_trend(df, title, figsize):
    """Draw and show a line chart of electricity usage over time.

    Args:
        df: DataFrame with "Date" and "Eletricity" columns (spelled the way
            this script reads them; presumably that matches the CSV header).
        title: Chart title.
        figsize: (width, height) of the figure in inches.
    """
    plt.figure(figsize=figsize)
    plt.plot(df["Date"], df["Eletricity"])
    plt.xlabel("日期")
    plt.ylabel("每日电量")
    # Place a major tick on every 7th x position.
    plt.gca().xaxis.set_major_locator(plt.MultipleLocator(7))
    plt.title(title)
    plt.show()


# Normal user.
df_normal = pd.read_csv(r"D:\py_project\a_三下\Python数据分析与挖掘实战-源代码与数据\data\\user.csv")
_plot_electricity_trend(df_normal, "正常用户电量趋势3141", figsize=(8, 4))

# Electricity-theft user.
# NOTE(review): this reads the same user.csv as df_normal above, so both
# charts show identical data. A separate data file for the theft user was
# probably intended -- confirm the file name and update the path.
df_steal = pd.read_csv(r"D:\py_project\a_三下\Python数据分析与挖掘实战-源代码与数据\data\\user.csv")
_plot_electricity_trend(df_steal, "窃电用户电量趋势3141", figsize=(10, 9))
6.帕累托图
#%% Pareto chart of dish profit
import pandas as pd
import matplotlib.pyplot as plt

# Use a CJK-capable font and keep the minus sign renderable with it.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

dish_profit = r"D:\py_project\a_三下\Python数据分析与挖掘实战-源代码与数据\data\catering_dish_profit.xls"
data = pd.read_excel(dish_profit, index_col='菜品名')

# Profit per dish, largest first. sort_values returns a new Series, so the
# result has to be assigned for the bars and the cumulative curve to be in
# descending order.
data = data['盈利'].sort_values(ascending=False)

plt.figure()
data.plot(kind='bar')
plt.ylabel('盈利(元)')

# Cumulative share of the total profit after each dish.
p = 1.0 * data.cumsum() / data.sum()
p.plot(color='r', secondary_y=True, style='-o', linewidth=2)

# Annotate the cumulative share reached at the 7th dish. The index holds
# dish names, so the position is addressed with .iloc; the guard skips the
# annotation when there are fewer than 7 dishes.
mark_pos = 6
if len(p) > mark_pos:
    mark_share = p.iloc[mark_pos]
    plt.annotate(
        format(mark_share, '.4%'),
        xy=(mark_pos, mark_share),
        xytext=(mark_pos * 0.9, mark_share * 0.9),
        arrowprops=dict(arrowstyle='->', connectionstyle='arc3, rad=.2'),
    )

plt.ylabel('盈利(比例)')
plt.title('菜品盈利数据帕累托图3141')
plt.show()