数据探索

缺失值分析及箱型图

数据：catering_sale.xls（餐饮日销售额数据）

缺失值分析使用 describe() 函数：它能算出数据集的八个统计量，其中 count 为非空值个数，用总行数减去 count 即可得到缺失值个数。

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

# Load the daily catering sales sheet, using the '日期' column as the index.
df = pd.read_excel('catering_sale.xls', index_col='日期')
print(df.head())

# Descriptive statistics, transposed so each source column becomes one row.
descriptive_stats = df.describe(include='all').T

# Count of missing entries in the '销量' column.
missing_values = df['销量'].isna().sum()

# Show both results.
print(descriptive_stats)
print(missing_values)

                销量
日期                
2015-03-01    51.0
2015-02-28  2618.2
2015-02-27  2608.4
2015-02-26  2651.9
2015-02-25  3442.1
    count       mean         std   min       25%      50%       75%      max
销量  200.0  2755.2147  751.029772  22.0  2451.975  2655.85  3026.125  9106.44
1

极差-变异系数-四分位数间距

# Reload the sheet and keep only rows whose '销量' lies strictly between
# 400 and 5000.
df = pd.read_excel('catering_sale.xls', index_col='日期')
in_range = (df['销量'] > 400) & (df['销量'] < 5000)
df = df[in_range]

descriptive_stats = df.describe(include='all').T
# Range: max - min.
descriptive_stats['range'] = descriptive_stats['max'] - descriptive_stats['min']
# Coefficient of variation: std / mean.
descriptive_stats['var'] = descriptive_stats['std'] / descriptive_stats['mean']
# Interquartile range: 75% - 25%.
descriptive_stats['dis'] = descriptive_stats['75%'] - descriptive_stats['25%']
descriptive_stats

	count	mean	std	min	25%	50%	75%	max	range	var	dis
销量	195.0	2744.595385	424.739407	865.0	2460.6	2655.9	3023.2	4065.2	3200.2	0.154755	562.6

箱型图

# Draw a box plot of `df` and label every outlier with its value.
import matplotlib.pyplot as plt  # plotting library

plt.rc('font', family='sans-serif')
# Use a CJK-capable font so the Chinese title and tick label render,
# matching the other plots in this file.
plt.rcParams['font.sans-serif'] = ['SimHei']

plt.figure()
p = df.boxplot(return_type='dict')  # dict of the artists making up the plot
fliers = p['fliers'][0]  # 'fliers' holds the outlier markers
# Sort (x, y) pairs together by y. Sorting y alone in place would reorder the
# artist's own data array and detach each y from its x.
points = sorted(zip(fliers.get_xdata(), fliers.get_ydata()),
                key=lambda point: point[1])

previous_y = None
for x_val, y_val in points:
    if previous_y is None:
        # First (smallest) outlier: fixed offset to the right of the marker.
        x_text = x_val + 0.08
    else:
        gap = y_val - previous_y
        if gap:
            # The smaller the gap to the previous outlier, the further left
            # the label is shifted, which keeps neighbouring labels apart.
            x_text = x_val + 0.05 - 0.8 / gap
        else:
            # Duplicate outlier value: avoid dividing by zero.
            x_text = x_val + 0.05
    plt.annotate(y_val, xy=(x_val, y_val), xytext=(x_text, y_val))
    previous_y = y_val

plt.title('3322-li箱型图',fontsize=12)
plt.show()

频率分布直方图

数据：catering_fish_congee.xls （“捞起生鱼片”每日销售额）

频率分布直方图

# Load the daily sales sheet, naming its two columns 'date' and 'sale'.
data = pd.read_excel("catering_fish_congee.xls", names=['date', 'sale'])

# Bin edges (0-500, 500-1000, ...) and the matching left-closed labels.
bins = [0, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000]
labels = ['[0,500)', '[500,1000)', '[1000,1500)', '[1500,2000)',
          '[2000,2500)', '[2500,3000)', '[3000,3500)', '[3500,4000)']

# right=False makes every bin [lo, hi) so the binning agrees with the labels.
# The pd.cut default is (lo, hi], which would file a value of exactly 500
# under '[0,500)' and drop a value of exactly 0.
data['sale分层'] = pd.cut(data['sale'], bins, right=False, labels=labels)
# Count rows per bin; observed=False keeps empty bins in the result.
aggResult = data.groupby(by=['sale分层'], observed=False)['sale'].size()
print(aggResult)

# Share of each bin, rounded to two decimals and expressed in percent.
pAggResult = (aggResult / aggResult.sum()).round(2) * 100
print(pAggResult)

# Bar chart of the per-bin percentages.
plt.figure(figsize=(10,6))  # figure size
pAggResult.plot(kind='bar',width=0.8,fontsize=10)
plt.title('3322-li季度销售额频率分布直方图',fontsize=12)
plt.xticks(rotation=45)
plt.show()

sale分层
[0,500)        28
[500,1000)     20
[1000,1500)    12
[1500,2000)    12
[2000,2500)     8
[2500,3000)     3
[3000,3500)     4
[3500,4000)     3
Name: sale, dtype: int64
sale分层
[0,500)        31.0
[500,1000)     22.0
[1000,1500)    13.0
[1500,2000)    13.0
[2000,2500)     9.0
[2500,3000)     3.0
[3000,3500)     4.0
[3500,4000)     3.0
Name: sale, dtype: float64

饼图

import pandas as pd
import matplotlib.pyplot as plt
# Load the dish profit table.
data = pd.read_excel('catering_dish_profit.xls')

plt.figure(figsize=(10, 6))  # figure size

# One wedge per dish name, sized by the '盈利' column.
labels = data['菜品名']
x = data['盈利']
plt.pie(x, labels=labels)

# Title, then equal axis scaling so the pie is drawn as a circle.
plt.title('3322-li菜品销售量分布（饼图）', fontsize=15)
plt.axis('equal')

(-1.0999995230582924, 1.09999997728849, -1.0999999973110348, 1.099999968474505)

条形图

# Bar chart of the '盈利' column per dish; expects `data` to already hold the
# dish profit table.
x = data['菜品名']
y = data['盈利']
plt.figure(figsize=(10,6))  # set the figure size
plt.title('3322-li菜品销售量分布（条形图）', fontsize=15)  
plt.bar(x, y, color='g') # draw the bar chart
plt.rcParams['font.sans-serif'] = ['SimHei']   # font able to render the Chinese labels
plt.xlabel('菜品')	# x-axis label
# NOTE(review): y holds the '盈利' column but the axis is labelled '销量' -
# confirm which is intended.
plt.ylabel('销量')	# y-axis label
plt.show()

用于比较数据--折线图

数据：dish_sale.xls，不同部门各月份的销售额

# Compare sales across departments, one line per department.
data = pd.read_excel("dish_sale.xls")  # load the data

plt.figure(figsize=(10, 6))  # figure size

# (column / legend label, line colour, marker) for each department.
series_styles = (
    ('A部门', 'skyblue', 'o'),
    ('B部门', 'red', '*'),
    ('C部门', 'yellow', 's'),
)
for dept, line_color, point_marker in series_styles:
    plt.plot(data['月份'], data[dept], color=line_color, label=dept,
             marker=point_marker)

plt.legend()  # show the legend
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.title('3322-li部门之间销售金额比较',fontsize=15)
plt.ylabel('销售额（万元）')
plt.show()

数据：dish_sale_b.xls，B部门各年份之间销售金额比较

# Compare department B's sales across years, one line per year.
data = pd.read_excel("dish_sale_b.xls")  # load the data

plt.figure(figsize=(10, 6))  # figure size

# Plot settings; each year column doubles as its own legend label.
line_width = 2
colors = ["green", "yellow", "purple"]
markers = ["o", "*", "s"]
labels = ["2012年", "2013年", "2014年"]

for year, line_color, point_marker in zip(labels, colors, markers):
    plt.plot(
        data['月份'],
        data[year],
        color=line_color,
        label=year,
        marker=point_marker,
        linewidth=line_width,
    )

plt.legend()  # show the legend
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.title('3148-TangB部门各年份之间销售金额比较',fontsize=15)
plt.ylabel('销售额（万元）')
plt.show()

周期性分析

数据：user.csv（某单位日用电量）

# Plot the "Eletricity" column of user.csv against its "Date" column.
df_normal = pd.read_csv("user.csv")  # load the data
plt.figure(figsize=(10, 6))
plt.plot(df_normal["Date"], df_normal["Eletricity"], color='green')
plt.xlabel("日期")
plt.ylabel("每日电量")

# Place a major x-axis tick every 7 units.
ax = plt.gca()
ax.xaxis.set_major_locator(plt.MultipleLocator(7))
plt.show()  # display the figure

数据：Steal user.csv（窃电用户数据）

# Plot the "Eletricity" column of "Steal user.csv" against its "Date" column.
df_steal = pd.read_csv("Steal user.csv")
plt.figure(figsize=(10, 6))
plt.plot(df_steal["Date"],df_steal["Eletricity"],color='yellow')
plt.xlabel("日期")
# The y-axis shows electricity, not the date; use the same label as the
# sibling plot built from user.csv.
plt.ylabel("每日电量")

# Place a major x-axis tick every 7 units.
x_major_locator = plt.MultipleLocator(7)
ax = plt.gca()
ax.xaxis.set_major_locator(x_major_locator)
plt.title("3322li-窃电用户电量趋势",fontsize=15)
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.show()  # display the figure

帕累托图

import pandas as pd

# Pareto chart of dish profit: bars in descending order of profit plus the
# cumulative share of total profit on a secondary axis.
data = pd.read_excel('catering_dish_profit.xls', index_col='菜品名')
# sort_values returns a new Series, so the result must be assigned; without
# the assignment the bars and the cumulative curve stay in file order.
data = data['盈利'].sort_values(ascending=False)

plt.figure()
data.plot(kind='bar')
plt.ylabel('盈利（元）')  # y-axis label for the bar plot
p = data.cumsum() / data.sum()  # cumulative share of total profit
p.plot(color='r', secondary_y=True, style='-o', linewidth=2)
# Annotate the point at position 6 (the 7th dish) with its cumulative share;
# the original comment describes this as the ~85% mark. .iloc is used because
# the index holds dish names, so 6 is a position rather than a label.
marked_share = p.iloc[6]
plt.annotate(format(marked_share, '.4%'), xy=(6, marked_share),
             xytext=(6*0.9, marked_share*0.9),
             arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=.8"))
plt.ylabel('盈利（比例）')  # y-axis label for the cumulative-share curve
plt.title("3322li-菜品盈利数据帕累托图",fontsize=15)
plt.show()

	百合酱蒸凤爪	翡翠蒸香茜饺	金银蒜汁蒸排骨	乐膳真味鸡	蜜汁焗餐包	生炒菜心	铁板酸菜豆腐	香煎韭菜饺	香煎罗卜糕	原汁原味菜心
百合酱蒸凤爪	1.00	0.01	0.02	0.46	0.10	0.31	0.20	0.13	-0.09	0.43
翡翠蒸香茜饺	0.01	1.00	0.30	-0.01	0.06	-0.18	-0.03	0.06	0.27	0.02
金银蒜汁蒸排骨	0.02	0.30	1.00	0.04	0.10	-0.18	0.19	0.12	0.08	0.03
乐膳真味鸡	0.46	-0.01	0.04	1.00	0.02	0.33	0.30	-0.07	-0.03	0.42
蜜汁焗餐包	0.10	0.06	0.10	0.02	1.00	0.31	0.50	0.16	0.17	0.53
生炒菜心	0.31	-0.18	-0.18	0.33	0.31	1.00	0.37	0.04	0.05	0.12
铁板酸菜豆腐	0.20	-0.03	0.19	0.30	0.50	0.37	1.00	0.10	0.16	0.57
香煎韭菜饺	0.13	0.06	0.12	-0.07	0.16	0.04	0.10	1.00	0.18	0.05
香煎罗卜糕	-0.09	0.27	0.08	-0.03	0.17	0.05	0.16	0.18	1.00	0.09
原汁原味菜心	0.43	0.02	0.03	0.42	0.53	0.12	0.57	0.05	0.09	1.00

python主要数据探索函数

计算协方差矩阵 cov（）

import pandas as pd 
import numpy as np
# Build a 6x5 frame of random samples from np.random.randn.
random_matrix = np.random.randn(6, 5)
D = pd.DataFrame(random_matrix)
D.cov()  # covariance matrix of the columns

	0	1	2	3	4
0	1.41	0.64	-1.61	0.09	0.57
1	0.64	0.55	-0.79	-0.02	0.14
2	-1.61	-0.79	1.91	-0.18	-0.57
3	0.09	-0.02	-0.18	0.54	0.30
4	0.57	0.14	-0.57	0.30	0.63

绘制正弦与余弦虚线

import matplotlib.pyplot as plt
import numpy as np
# Plot sin(x) and cos(x) over [0, 2*pi] with point markers.
x = np.linspace(0, 2 * np.pi)
y1 = np.sin(x)
y2 = np.cos(x)
plt.figure()
plt.plot(x, y1, 'r-o')  # red line, circle markers
plt.plot(x, y2, 'b-*')  # blue line, star markers
# Pass one label per line as a list. A bare string is iterated character by
# character, which labelled the two lines 's' and 'i'.
plt.legend(["sin(x)", "cos(x)"])

<matplotlib.legend.Legend at 0x294574e3460>

饼图 pie（）

import matplotlib.pyplot as plt #导入作图库
# The slices will be ordered and plotted counter-clockwise.
labels = (' Frogs', 'Hogs', ' Dogs', 'Logs')  # wedge labels
sizes = [15, 30, 45, 10]  # share of each wedge
colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral']  # wedge colours
explode = (0, 0.1, 0, 0)  # offset only the second wedge ('Hogs')
plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
plt.axis('equal')  # equal scaling so the pie is a circle, not an ellipse
plt.show()

函数图

import matplotlib.pyplot as plt  # 导入作图库
import numpy as np
import pandas as pd

# Plot the same exponential series twice, side by side: on a linear y-axis
# (left) and on a logarithmic y-axis (right).
plt.figure(figsize=(8,6))
x = pd.Series(np.exp(np.arange(20)))  # raw data: exp(0) ... exp(19)
plt.subplot(1,2,1)
x.plot(label='原始数据图', legend=True, color='lightskyblue')
plt.title('3322li',fontsize=15)
plt.subplot(1,2,2)
# The log-scale panel gets its own legend label; it previously reused the
# raw-data label, making the two legends indistinguishable.
x.plot(logy=True, label='对数数据图', legend=True, color='lightcoral')
plt.show()

误差棒图

import numpy as np
import matplotlib.pyplot as plt


# 构造数据
# Sample data: ten points on a scaled sine curve.
x = np.arange(10)
y = 2.5 * np.sin(x / 20 * np.pi)
# 1-D array: one symmetric error per point.
yerr1 = np.linspace(0.05, 0.2, 10)
# (2, N) array: row 0 holds the lower errors, row 1 the upper errors.
yerr2 = np.random.rand(2, 10)

# Draw the curve three times, shifted down by 0, 1 and 2, each with a
# different form of yerr:
#   scalar  -> the same symmetric error on every point
#   1-D     -> a symmetric error per point
#   (2, N)  -> separate lower/upper errors per point
for shift, y_error in ((0, 0.5), (1, yerr1), (2, yerr2)):
    plt.errorbar(x, y - shift, yerr=y_error)

plt.show()

posted @ 2023-02-26 14:21 里列昂遗失的记事本阅读(40) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

里列昂遗失的记事本

数据探索

数据探索

缺失值分析及箱型图

极差-变异系数-四分位数间距

箱型图

频率分布直方图

饼图

条形图

用于比较数据--折线图

周期性分析

帕累托图

相关性分析

python主要数据探索函数

计算协方差矩阵 cov（）

绘制正弦与余弦虚线

饼图 pie（）

函数图

误差棒图

公告