Python 数据可视化

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

　　matplotlib参数设置

# Global matplotlib settings for Chinese text.
# SimHei is put first in the sans-serif list (assumes the font is installed).
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family']='sans-serif'
# Render the minus sign as an ASCII hyphen rather than the Unicode minus glyph.
matplotlib.rcParams['axes.unicode_minus'] = False
#matplotlib.fontsize='15'

#plt.rcParams['figure.figsize'] = (12.0,5.0)  # set the default figure size

# Inline figures in notebook mode (no comment may follow the magic on the same line)
%matplotlib inline
# In IPython mode
#%pylab inline

　　seaborn参数设置

# Seaborn controls appearance through two families of functions:
# axes_style()/set_style() and plotting_context()/set_context().
# Five predefined themes: darkgrid (default), whitegrid, dark, white, ticks.
# Four predefined contexts: paper, notebook (default), talk, poster.
sns.set_style("whitegrid")
# Other frequently used calls, kept commented out as a reference.
# (Previously stored in a bare string; the keyword name and the unbalanced
# brace in the rc dict have been corrected.)
# sns.set_context("poster")
# sns.set_style(style=None, rc=None)
# sns.despine(offset=10)  # move the remaining spines away from the plot
# sns.despine()  # remove the top and right spines
# sns.set_context(font_scale=1.5)  # font scaling
# sns.set_context(rc={'lines.linewidth': 1.5})  # line width
# sns.set()  # reset to seaborn's default theme

　　其他参数设置

# Reference list of frequently used settings.
# NOTE(review): ax1 and ax are not defined in this snippet; these lines need
# existing Axes objects before they can run.
myfont = matplotlib.font_manager.FontProperties(fname="simsun.ttc")  # custom font loaded from simsun.ttc
ax1.set_xlabel('时间', fontproperties=myfont, size=18)  # pass the font explicitly so the Chinese label renders
plt.gcf().set_facecolor(np.ones(3) * 240/255)  # set the figure background colour
plt.gcf().autofmt_xdate()  # rotate/align the x tick labels (intended for date axes)
plt.legend(loc=1)  # 1, 2, 3, 4 = upper right, upper left, lower left, lower right
ax.invert_xaxis()  # reverse the x axis

　　线图（1）

# Data: three curves sampled on 1000 points in [0, 10]
x=np.linspace(0,10,1000)
y1=np.sin(x)
y2=np.cos(x)
y3=np.cos(x**2)

plt.figure(1)  # figure number
plt.subplot(221)
plt.plot(x,y1,label="$sin(x)$",color="red",linewidth=2)
plt.plot(x,y2,label="$cos(x)$",color="blue",linewidth=2)

plt.subplot(222)
# Every 50th sample of y2 = cos(x); the label now names the series actually
# plotted (it previously said cos(x^2)).
plt.scatter(x[:1000:50],y2[:1000:50],color="blue",label="$cos(x)$")

plt.subplot(212)  # switch to a layout where the bottom half is one wide subplot
plt.plot(x,y1+y3,"g-",label="$sin(x)+cos(x^2)$")
plt.xlabel("time")
plt.ylabel("value")
plt.title("$sin(x)+cos(x^2)$ curve")
plt.xlim(-0.2,10.2)
plt.legend()  # show the legend of the bottom subplot

# subplots_adjust works like CSS margins: it sets the outer margins and the
# spacing between subplots.
plt.subplots_adjust(left=0.08,right=0.95,wspace=0.25,hspace=0.45)
plt.show()

　　线图（2）

# rcParams only applies to figures created afterwards, so the default size is
# set BEFORE creating figure 3 (the original order left figure 3 unaffected).
plt.rcParams['figure.figsize'] = (12,4)
plt.figure(3)

plt.subplot(121)
def sinplot(flip=1):
    """Plot six phase-shifted sine curves of decreasing amplitude on the current axes.

    flip: factor multiplied into every curve (e.g. -1 mirrors them vertically).
    """
    x=np.linspace(0,14,100)
    for i in range(1,7):
        plt.plot(x,np.sin(x+i*0.5)*(7-i)*flip)
sinplot()

plt.subplot(122)
x = np.arange(0, 2*np.pi, 0.02)
y = np.sin(x)
y1 = np.sin(2*x)
y2 = np.sin(3*x)
# Masked arrays: masked points are left out of the plotted lines
ym1 = np.ma.masked_where(y1 > 0.5, y1)
ym2 = np.ma.masked_where(y2 < -0.5, y2)
# Plot the three series in one call; only the last one gets the 'o' format
lines = plt.plot(x, y, x, ym1, x, ym2, 'o')
# Set per-line properties
plt.setp(lines[0], linewidth=1)
plt.setp(lines[1], linewidth=2)
plt.setp(lines[2], linestyle='-',marker='^',markersize=2)
# Legend labels, in the order the lines were drawn
plt.legend(('No mask', 'Masked if > 0.5', 'Masked if < -0.5'), loc='upper right')
plt.title('Masked line demo')
plt.show()

　　条形图+饼图+直方图+阶梯图

plt.figure(2)
# Data: fixed seed, a small category -> count mapping and 1000 normal samples
np.random.seed(sum(map(ord,"aesthetics")))
d1 = dict([['A',5], ['B',7], ['C',3]])
d2 = np.random.randn(1000)
# Dict views are not sequences for NumPy; convert them to lists once so that
# bar() and pie() receive plain sequences.
categories = list(d1.keys())
counts = list(d1.values())

# Bar chart
plt.subplot(221)
plt.bar(categories,counts,align='center') #,alpha=.7,color='g'
#plt.bar(range(3),counts,align='center')
#plt.xticks(range(3),xticks)
plt.ylabel("Frequency")
plt.title("Numbers of Books Students Read")

# Pie chart
plt.subplot(222)
plt.pie(counts,labels=categories,autopct='%1.1f%%')
plt.title("Number of Books Students Read")

# Histogram
plt.subplot(223)
plt.hist(d2,100)
plt.xlabel('Heights')
plt.ylabel('Frequency')
plt.title('Height of Students')

# Step curve / cumulative distribution.
# `normed` was removed from hist(); `density=True` is its replacement.
plt.subplot(224)
plt.hist(d2,20,density=True,histtype='step',cumulative=True)
plt.xlabel('Heights')
plt.ylabel('Frequency')
plt.title('Heights of Students')

plt.subplots_adjust(left=0.08,right=0.95,wspace=0.25,hspace=0.45)  # spacing between subplots
plt.show()

　　饼图+箱线图

plt.figure(2)
plt.subplot(121)  #fig, ax
animals = dict([['frogs',15], ['hogs',20], ['dogs',45],['cats',10]])
colors = 'yellowgreen','gold','lightskyblue','lightcoral'
explode = 0,0.1,0,0  # pull the second wedge out of the pie
# Convert the dict views to lists: a dict view is not a sequence for NumPy,
# so passing it straight to pie() is not reliable.
animal_names = list(animals.keys())
animal_counts = list(animals.values())
plt.pie(animal_counts, explode=explode, labels=animal_names,
        colors=colors, autopct='%1.1f%%', shadow=True, startangle=50)  #ax.pie
#ax.set(aspect="equal", title='Pie plot with animals')
plt.axis('equal')

plt.subplot(122)
# A single box summarising the four counts
plt.boxplot(animal_counts,labels=['animals'])
#plt.boxplot((x,y,z),labels=('x','y','z'))  # horizontal: vert=False; whis=1.5
#df.boxplot()
plt.title('Heights of Students')
plt.show()

　　雷达图 + 圆环图

plt.figure(figsize=(12,4), facecolor="white")
# Data
labels=np.array(['综合', '第一周','第二周','第三周', '第四周', '第五周'])  # axis labels
nAttr = 6  # number of data points
values = np.array([88.7, 85, 90, 95, 70, 96])  # raw values
angles = np.linspace(0,2*np.pi, nAttr, endpoint=False)  # angles in radians
# Close the polygon by repeating the first point at the end
values = np.concatenate((values,[values[0]]))
angles = np.concatenate((angles,[angles[0]]))
# Radar chart
plt.subplot(121, polar=True)  # polar axes
# 'o-' only: the colour comes from color='g' (the old 'bo-' also named a colour)
plt.plot(angles, values, 'o-', color='g', linewidth=2)  # outline
plt.fill(angles, values, facecolor='g', alpha=0.2)  # filled area
# Drop the repeated closing angle so the tick count matches the 6 labels
plt.thetagrids(angles[:-1]*180/np.pi, labels)  # angular tick labels
#plt.figtext(0.52, 0.95, 'python成绩分析图', ha='center')  # title via figtext
plt.title('python成绩分析图')
plt.grid(True)
#plt.savefig('dota_radar.JPG')

# Donut chart: two nested pies plus a white disc covering the centre
plt.subplot(122)
#fig, ax = plt.subplots()
vals1 = [1, 2, 3, 4]
vals2 = [2, 3, 4, 5]
vals3=[1]
labels = 'A', 'B', 'C', 'D'
plt.pie(vals1, radius=1.2, autopct='%1.1f%%', pctdistance=0.9)
plt.pie(vals2, radius=1, autopct='%1.1f%%', pctdistance=0.75)
plt.pie(vals3, radius=0.6, colors=['w'])  # colors expects a sequence of colours
#ax.set(aspect="equal", title='Pie plot with `ax.pie`')
plt.title('Pie plot with xx')
plt.legend(labels, loc='best') #bbox_to_anchor=(1, 1), loc='best', borderaxespad=0.
plt.show()

　　散点图+直方图

plt.figure(figsize=(12,4))

# Scatter plot
plt.subplot(121)
import matplotlib.cm as cm
from scipy import stats
from sklearn.datasets import load_iris

# The scatter below uses the iris column names, but no `df` exists at this
# point of the file; build it here the same way the violin-plot section does.
iris = load_iris()
df = pd.DataFrame(iris['data'], columns=iris['feature_names'])
df['target'] = iris['target']

def scatter_plot_by_category(feat, x, y, data=None):
    """Scatter column *x* against column *y*, one colour per value of *feat*.

    feat: name of the column to group by.
    x, y: names of the columns plotted on the x and y axes.
    data: DataFrame to plot; when omitted, the module-level ``df`` is used.
    """
    if data is None:
        data = df
    gs = data.groupby(feat)
    # One rainbow colour per group, evenly spaced over the colormap
    cs = cm.rainbow(np.linspace(0, 1, len(gs)))
    for (_, group), c in zip(gs, cs):
        plt.scatter(group[x], group[y], color=c, alpha=0.5)
scatter_plot_by_category('target', 'sepal length (cm)', 'sepal width (cm)')
plt.xlabel('sepal length (cm)')
plt.ylabel('sepal width (cm)')
plt.title('target')

# Histogram
plt.subplot(122)
mu, sigma = 100, 15
x = mu + sigma * np.random.randn(10000)
x1 = np.linspace(x.min(), x.max(), 1000)
# matplotlib.mlab was never imported and mlab.normpdf no longer exists;
# scipy.stats provides both curves.
normal = stats.norm.pdf(x1, mu, sigma)  # normal density curve
kde = stats.gaussian_kde(x)  # kernel density estimate (callable)

#color='steelblue'
#bins=np.arange(x.min(),x.max(), 5)
#density=True,     # normalised histogram
#cumulative=True,  # cumulative histogram
n, bins, patches = plt.hist(x, bins=50, density=1, edgecolor ='k', facecolor='g', alpha=0.75)  # edge colour + fill colour

line1, = plt.plot(x1, normal, 'r-', linewidth = 2)
line2, = plt.plot(x1, kde(x1), 'g-', linewidth = 2)

plt.legend([line1, line2],[ '正态曲线', '核密度曲线'],loc= 'best')
plt.tick_params(top=False, right=False)  # no ticks on the top/right edges (booleans, not 'off')
plt.axvline(90)   # reference line
plt.text(60, .025, r'$\mu=100,\ \sigma=15$')  # text annotation
plt.axis([40, 160, 0, 0.03])  # axis limits
plt.grid(ls='--')
plt.xlabel('Smarts')
plt.ylabel('Probability')
plt.title('Histogram of IQ')

plt.show()

　　seaborn.barplot 绘制柱状图（更多内容参见：Seaborn常见绘图总结）

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Left: seaborn grouped bar plot; right: stacked bars drawn with plain matplotlib.
plt.figure(figsize=(12,4))
plt.subplot(121)
# Build a 10x4 integer frame, then overwrite 'a' with repeated category values
# and 'd' with a two-level label used as the hue.
a=np.arange(40).reshape(10,4)
df=pd.DataFrame(a,columns=['a','b','c','d'])
df['a']=[0,4,4,8,8,8,4,12,12,12]
df['d']=list('aabbabbbab')
sns.barplot(x='a', y='b', data=df, hue='d')  # grouped (categorical) bar plot
plt.subplot(122)
plt.bar(df['a'], df['b'], label='b')
#barh(x,y)
# Stack column 'c' on top of column 'b'
plt.bar(df['a'], df['c'], bottom=df['b'], color='r', label='c')
plt.legend(loc=2)
plt.show()

　　并列柱状图

# Side-by-side bars for three hospital tiers.
# NOTE(review): data1 is not defined in this snippet; presumably a DataFrame
# with one row per tier and the columns referenced below — confirm.
bar_width = 0.3
x = np.arange(3)
tick_label = ['一级医院','二级医院','三级医院']
plt.figure(figsize=(12,4))
plt.subplot(121)
#data1.groupby('医院等级').sum()[['医院数','本地定点医院数']].plot(kind="bar",width = .8)  #.unstack()
#data1[['医院数','本地定点医院数']].plot(kind="bar",width = .8)
# The second series is shifted right by one bar width so the bars sit side by side
plt.bar(x, data1['医院数'], width=bar_width, align="center", color="c", label="全部医院", alpha=0.5)
plt.bar(x+bar_width, data1['本地定点医院数'], width=bar_width, align="center", color="b", label="本地定点医院", alpha=0.5)
# Put each tick midway between the two bars of a pair
plt.xticks(x+bar_width/2, tick_label)
plt.legend()
plt.title('舟山市居民就医医院的等级分布')

#plt.title('医院数分布')
plt.subplot(122)
plt.bar(x, data1['总单号数'], width=bar_width, align="center", color="c", label="全部医院", alpha=0.5)
plt.bar(x+bar_width, data1['本地定点医院单号量'], width=bar_width, align="center", color="b", label="本地定点医院", alpha=0.5)
plt.xticks(x+bar_width/2, tick_label)
plt.legend()
plt.title('舟山市居民在各等级医院就医的单号量分布')
plt.show()

　　柱状图添加数据标签

import matplotlib.pyplot as plt
import numpy as np

# Sample data
menMeans = (20, 35, 30, 35, 27)
womenMeans = (25, 32, 34, 20, 25)
xlabels = ['G1', 'G2', 'G3', 'G4', 'G5']
width = 0.35

# Top: value labels placed at the centre of each bar
plt.subplot(211)
p1 = plt.bar(xlabels, menMeans, width, label='Men')
plt.bar_label(p1, label_type='center')
plt.title('center')
# Bottom: value labels placed at the end (edge) of each bar
plt.subplot(212)
p2 = plt.bar(xlabels, womenMeans, width, label='Women')
plt.bar_label(p2, label_type='edge')
plt.title('edge')
plt.show()

　　堆积柱状图添加数据标签

import matplotlib.pyplot as plt
import numpy as np

# Sample data (the last group is negative)
menMeans = (20, 35, 30, 35, -27)
womenMeans = (25, 32, 34, 20, -25)
xlabels = ['G1', 'G2', 'G3', 'G4', 'G5']
width = 0.35  

# Draw the stacked bars: the second series starts where the first one ends
p1 = plt.bar(xlabels, menMeans, width, label='Men')
p2 = plt.bar(xlabels, womenMeans, width, bottom=menMeans, label='Women')

plt.axhline(0, color='grey', linewidth=0.8)
plt.ylabel('Scores')
plt.title('Scores by group and gender')

plt.legend()

# Label the first segment of each bar
plt.bar_label(p1, label_type='center')
# Label the second segment of each bar
plt.bar_label(p2, label_type='center')
# Label each whole bar (default label type on the upper container)
plt.bar_label(p2)

plt.show()

　　堆积图

# 100% stacked bars: convert every row of df to shares, then stack the columns.
# NOTE(review): df, n and x come from outside this snippet; x presumably holds
# the bar positions for the first n rows — confirm.
total = df.sum(axis=1)
# Normalise in place: each column is divided by the original row totals
for i in df.columns: 
    df[i] = df[i] / total

# Running top of the stack; starts at 0 and accumulates each plotted column
bottom = 0
for i in range(df.shape[1]):
    y = df.iloc[:n,i]
    plt.bar(x, y, bottom=bottom)
    bottom += y
plt.legend(['一级医院','二级医院','三级医院'])
plt.title('100种常见病在不同医院等级下的单号量分布图')

　　柱状折线图 / 双轴图（注意：纵轴以百分比（%）表示时，增速需先乘以100）

# Bars on the left axis (y) and a line on a second, right-hand axis (r).
df = pd.DataFrame({'x':list('abcd'), 'y':[20, 15, 10, 8], 'r':[0.3, 0.5, 0.4, 0.1]})

#plt.rcParams['figure.figsize'] = (12.0,5.0) 
fig = plt.figure(figsize=(8,4))

# Draw the bars
ax1 = fig.add_subplot(111)
ax1.bar(df['x'], df['y'], alpha=.7, color='g')

ax1.set_ylabel('xx收入', fontsize=12)
plt.xticks(range(df.shape[0]), df['x'])
plt.xticks(fontsize=10)  # original note: this cannot be set later on
plt.yticks(fontsize=10)

# Draw the line on a twin axis sharing the same x axis
ax2 = ax1.twinx()
ax2.plot(df['x'], df['r'], 'r', marker='*', ms=10)

ax2.set_ylim([0,0.6])
ax2.set_ylabel('同比增速（%）', fontsize=12)
plt.yticks(fontsize=10)

#ax1.set_xticklabels('defg', rotation=-45)  # rotated tick labels
plt.title('近年xx公司xx收入与同比增速', fontsize=16)
plt.grid(False)

# Add a data label at every point of the line
for i in range(df.shape[0]):
    #plt.text(i, df['y'][i]+0.3, str(df['y'][i]), ha='center', va='bottom', fontsize=15, rotation=0)
    plt.text(i, df['r'][i], str(df['r'][i]), ha='center', va='bottom', fontsize=12, rotation=0)

# Save and show
# dpi is the image resolution; bbox_inches='tight' trims the surrounding whitespace
#plt.savefig('e:/tj/month/fx1806/公司保费增速与同比.png', dpi=600, bbox_inches='tight')
plt.show()

　　柱状折线图 -- 合并label

# Bar + line on twin axes with ONE combined legend.
# NOTE(review): data and ind are defined outside this snippet; ind is used as a
# row selector and ind.sum() as the number of selected rows — presumably a
# boolean mask, confirm.
fig = plt.figure(figsize=(10, 4))
ax1 = fig.add_subplot(111)
lns1 = ax1.bar(range(ind.sum()), data.loc[ind,'单号数'], alpha=.7, color='b', label=r'单号数')
ax2 = ax1.twinx()
lns2 = ax2.plot(range(ind.sum()), data.loc[ind,'用药（包含检查等）种类数'], color='r', marker='*', ms=4, linewidth=1, label=r'用药（包含检查等）种类数')
# ax.bar returns one container while ax.plot returns a list of lines, hence
# the wrapping of lns1 before concatenating the handles
lns = [lns1]+lns2
labs = [l.get_label() for l in lns]
ax1.legend(lns, labs, loc=0)
plt.show()

　　其他条形图

plt.figure(figsize=(10, 3))
# Overlapping bar charts: three random series drawn onto the same axes
plt.subplot(121)
data_hour2015 = pd.DataFrame(np.random.randint(10, size=(100,)), columns=['num'])
data_hour2016 = pd.DataFrame(np.random.randint(10, size=(100,)), columns=['num'])
# The 2017 series is negated so its bars point downwards
data_hour2017 = pd.DataFrame(-np.random.randint(10, size=(100,)), columns=['num'])
data_hour2015['num'].plot.bar(color='g', alpha=0.6, label='2015年')
data_hour2016['num'].plot.bar(color='r', alpha=0.6, label='2016年')
data_hour2017['num'].plot.bar(color='b', alpha=0.6, label='2017年')
#plt.ylabel('counts')
#plt.title('missing')
plt.legend(loc='upper right')
# Show only a few tick labels: positions are 0-based, labels are 1-based
plt.xticks([0,19,39,59,79,99], [1,20,40,60,80,100])

# 2-D frequency (histogram) plot
plt.subplot(122)
x = np.random.randn(1000)+2
y = np.random.randn(1000)+3
plt.hist2d(x,y,bins=40)
plt.show()

　　自定义图例参考

注意：数据点过多时，部分柱子（bar）可能显示不全。

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# Bars coloured by hospital tier, with a hand-built legend.
# NOTE(review): data is defined outside this snippet; the code reads its
# 'hirate', 'hicode' and 'counts' columns and maps hirate values 1..3 onto the
# three colours — confirm against the data source.
colors = ['red', 'green', 'blue']
labels = ['一级医院', '二级医院', '三级医院']
# One colour per row: hirate k -> colors[k-1]
c_map = data['hirate'].map(lambda x:colors[int(x)-1]).tolist()

plt.figure(figsize=(8,4))
plt.bar(range(len(data['hicode'])), data['counts'], color=c_map)  #width=0.5
#plt.ylim(-0.01, 5000000)
# Custom ticks: one tick every 100 bars (positions 0, 100, ..., 600)
plt.xticks(ticks=np.arange(7)*100, labels=data['hicode'][np.arange(7)*100])
# Custom legend: one coloured patch per tier
patches = [mpatches.Patch(color=colors[i], label="{:s}".format(labels[i])) for i in range(len(colors)) ]
ax = plt.gca()
#box = ax.get_position()
#ax.set_position([box.x0, box.y0, box.width , box.height* 0.8])
ax.legend(handles=patches, loc=0)  # bbox_to_anchor=(0.95,1.12) sets the position, ncol=1 the number of columns
plt.title('医院编码 - 接诊单号量分布图')
plt.show()

　　并列条形图 -- 参考链接

# Grouped bars straight from pandas: sum the two sales columns per (Region, Tier),
# unstack the inner group level into columns, and plot as a bar chart.
# NOTE(review): df with these columns is not defined in this snippet.
df.groupby(['Region','Tier'],sort=True).sum()[['Sales2015','Sales2016']].unstack().plot(kind="bar",width = .8)

　　DataFrame数据绘图

# Bar chart (horizontal), using the 'lifespan' column for the category axis
speed = [0.1, 17.5, 40, 48, 52, 69, 88]
lifespan = [2, 8, 70, 1.5, 25, 12, 28]
index = ['snail', 'pig', 'elephant','rabbit', 'giraffe', 'coyote', 'horse']
df = pd.DataFrame({'speed': speed, 'lifespan': lifespan}, index=index)
ax = df.plot.barh(x='lifespan')
#df.plot.bar()


# Histogram: one die roll vs. the sum of two die rolls
df = pd.DataFrame(np.random.randint(1, 7, 6000), columns = ['one'])
df['two'] = df['one'] + np.random.randint(1, 7, 6000)
ax = df.plot.hist(bins=12, alpha=0.5)


# Box plot
data = np.random.randn(25, 4)
df = pd.DataFrame(data, columns=list('ABCD'))
ax = df.plot.box()

# Hexagonal-bin heat map
n = 10000
df = pd.DataFrame({'x': np.random.randn(n), 'y': np.random.randn(n)})
ax = df.plot.hexbin(x='x', y='y', gridsize=20)

# Hexbin where each cell shows the sum of the 'observations' column
n = 500
df = pd.DataFrame({'coord_x': np.random.uniform(-3, 3, size=n),
                   'coord_y': np.random.uniform(30, 50, size=n),
                   'observations': np.random.randint(1,5, size=n)})
ax = df.plot.hexbin(x='coord_x',
                    y='coord_y',
                    C='observations',
                    reduce_C_function=np.sum,
                    gridsize=10,
                    cmap="viridis")

# Kernel density estimate, with different bandwidths and evaluation points
df = pd.DataFrame({'x': [1, 2, 2.5, 3, 3.5, 4, 5],
                   'y': [4, 4, 4.5, 5, 5.5, 6, 6],})
ax = df.plot.kde()
ax = df.plot.kde(bw_method=0.3)
ax = df.plot.kde(bw_method=3)
ax = df.plot.kde(ind=[1, 2, 3, 4, 5, 6])

# Line plots: all columns together, one subplot per column, one column against another
df = pd.DataFrame({'pig': [20, 18, 489, 675, 1776],
                   'horse': [4, 25, 281, 600, 1900]},
                  index=[1990, 1997, 2003, 2009, 2014])
lines = df.plot.line()
axes = df.plot.line(subplots=True)
lines = df.plot.line(x='pig', y='horse')

# Pie charts, one per selected column
df = pd.DataFrame({'mass': [0.330, 4.87 , 5.97],
                   'radius': [2439.7, 6051.8, 6378.1]},
                  index=['Mercury', 'Venus', 'Earth'])
ax = df.plot.pie(y='mass', subplots=True, figsize=(6, 3))
ax = df.plot.pie(y='radius', subplots=True, figsize=(6, 3))

# Scatter plots: a fixed colour, then colour taken from the 'species' column
df = pd.DataFrame([[5.1, 3.5, 0], [4.9, 3.0, 0], [7.0, 3.2, 1],
                  [6.4, 3.2, 1], [5.9, 3.0, 2]],
                  columns=['length', 'width', 'species'])
ax1 = df.plot.scatter(x='length',
                      y='width',
                      c='DarkBlue')
ax2 = df.plot.scatter(x='length',
                      y='width',
                      c='species',
                      colormap='viridis')

　　矩阵图

import pandas as pd
# Pairwise scatter-plot matrix of four random columns, coloured by a random class
x = pd.DataFrame(np.random.randn(200,4)*100, columns = ['A','B','C','D'])
cs = np.random.randint(3, size=200)
#c='k',cmap=mglearn.cm3
# scatter_matrix lives in pandas.plotting; the top-level pd.scatter_matrix
# alias has been removed from pandas.
pd.plotting.scatter_matrix(x, figsize=(8,8), c = cs, marker = '+',
                           diagonal='hist', hist_kwds={'bins':10, 'edgecolor':'k'},
                           alpha = 0.8, range_padding=0.1)
plt.show()

　　热力图

#corr = df.corr()
# Heat map of monthly passenger counts (months as rows, years as columns)
flights = sns.load_dataset("flights")
# DataFrame.pivot takes keyword arguments; positional use fails on pandas 2.x
flights = flights.pivot(index="month", columns="year", values="passengers")
fig, ax = plt.subplots(figsize = (6, 4.5))
sns.heatmap(flights, annot=True,fmt="d",linewidths=.5, ax = ax)  #cmap='RdBu'
plt.show()

　　violinplot图

from sklearn.datasets import load_iris
# Iris measurements plus the class label as a 'target' column
iris = load_iris()
df = pd.DataFrame(iris['data'], columns=iris['feature_names'])
df['target'] = iris['target']

plt.figure(figsize=(9, 8))
# 'target' was appended last, so the feature columns fill subplot slots 1..4
feature_columns = [name for name in df.columns if name != 'target']
for slot, feature in enumerate(feature_columns, start=1):
    plt.subplot(2, 2, slot)
    # One violin per class for this feature
    sns.violinplot(x='target', y=feature, data=df)

　　数学教科书上展示的图

plt.figure(1)
x = np.linspace(-np.pi,np.pi,256,endpoint=True)
co, si = np.cos(x), np.sin(x)

plt.plot(x, co, color="blue", linewidth=1.0, linestyle="-", label="cos", alpha=0.5)
plt.plot(x, si, "r*", markersize=1, label="sin")

# Get the current axes so the spines and ticks can be edited
ax=plt.gca()
# Hide the right and top spines, move the left and bottom spines to the data
# origin, and keep the ticks on the bottom and left
ax.spines['right'].set_color("none")
ax.spines['top'].set_color("none")
ax.spines['left'].set_position(("data",0))
ax.spines['bottom'].set_position(("data",0))
ax.xaxis.set_ticks_position("bottom")
ax.yaxis.set_ticks_position("left")
# Tick positions and tick-label formatting
plt.xticks([-np.pi,-np.pi/2,0,np.pi/2,np.pi], [r'$-\pi$',r'$-\pi/2$',r'$0$',r'$\pi/2$',r'$\pi$'])
plt.yticks(np.linspace(-1,1,5, endpoint=True))
for label in ax.get_xticklabels()+ax.get_yticklabels():
    label.set_fontsize(10)  # font size
    label.set_bbox(dict(facecolor="white", edgecolor="None", alpha=0.2))

# Colour fill between the two given curves where cos(x) > 0.5
plt.fill_between(x, np.abs(x)<0.5, co, co>0.5, color="red", alpha=0.2)

# Annotations:
#   xy is the annotated point; xycoords="data" means data coordinates
#   xytext is the text position, interpreted according to textcoords (here an offset in points)
#   arrowprops is a dict of arrow properties: arrowstyle is the arrow style,
#   connectionstyle the way the arrow connects text and point
t = 1
# The colour is given once via color=; the former 'y' format string repeated it
plt.plot([t,t], [0,np.cos(t)], color ='yellow', linewidth=2, linestyle="--")
plt.scatter([t,t], [0,np.cos(t)], 50, color ='red')
plt.annotate("cos(1)", xy=(t, np.cos(t)), xycoords="data",
             xytext=(+10, +20), textcoords="offset points", fontsize=12,
             arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=.2"))

t = 2*np.pi/3
plt.plot([t,t], [0,np.sin(t)], color ='yellow', linewidth=2, linestyle="--")
plt.scatter([t,t],[0,np.sin(t)], 50, color ='green')
plt.annotate(r'$\sin(\frac{2\pi}{3})=\frac{\sqrt{3}}{2}$', xy=(t,np.sin(t)), xycoords='data',
             xytext=(+10, +30), textcoords='offset points', fontsize=12,
             arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=.2"))

plt.title("cos&sin")
plt.legend(loc="upper left")
plt.grid(ls='--')
plt.axis([-3.15,3.15,-1.05,1.05])

plt.show()

　　插值图

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.interpolate import griddata

def func(x, y):
    """Evaluate the test surface x(1-x)·cos(4πx)·sin²(4πy²).

    x, y: scalars or NumPy arrays (combined element-wise).
    Returns the surface value(s) with NumPy broadcasting semantics.
    """
    envelope = x * (1 - x)
    x_wave = np.cos(4 * np.pi * x)
    y_wave = np.sin(4 * np.pi * y ** 2) ** 2
    return envelope * x_wave * y_wave
    
# Sample func at 1000 random points in the unit square, then interpolate the
# samples onto a regular 100 x 200 grid with three different methods.
points = np.random.rand(1000, 2)
values = func(points[:,0], points[:,1])
grid_x, grid_y = np.mgrid[0:1:100j, 0:1:200j]

grid_z0 = griddata(points, values, (grid_x, grid_y), method='nearest')
grid_z1 = griddata(points, values, (grid_x, grid_y), method='linear')
grid_z2 = griddata(points, values, (grid_x, grid_y), method='cubic')

# Top-left: the exact function on the grid, with the sample points overlaid.
# Arrays are transposed for imshow because mgrid puts x on the first axis.
plt.subplot(221)
plt.imshow(func(grid_x, grid_y).T, extent=(0,1,0,1), origin='lower')
plt.plot(points[:,0], points[:,1], 'k.', ms=1)
plt.title('Original')
plt.subplot(222)
plt.imshow(grid_z0.T, extent=(0,1,0,1), origin='lower')
plt.title('Nearest')
plt.subplot(223)
plt.imshow(grid_z1.T, extent=(0,1,0,1), origin='lower')
plt.title('Linear')
plt.subplot(224)
plt.imshow(grid_z2.T, extent=(0,1,0,1), origin='lower')
plt.title('Cubic')
plt.gcf().set_size_inches(6, 6)
plt.show()

　　等高线图

import numpy as np
import matplotlib.pyplot as plt
#import matplotlib as mpl
#from matplotlib import colors

# Sample the axes with a step of 0.01, i.e. one point every 0.01
step = 0.01
x = np.arange(-10,10,step)
y = np.arange(-10,10,step)
# Alternatively x = np.linspace(-10,10,100) gives 100 points from -10 to 10

# Turn the 1-D axes into 2-D grid coordinates
X,Y = np.meshgrid(x,y)
Z = X**2+Y**2

# Contour plots
plt.figure(figsize=(10,6))  # figure size
plt.subplot(231)
plt.contour(X,Y,Z)  # contour lines with default levels

plt.subplot(232)
contour = plt.contour(X,Y,Z, [20,40,60], colors='k')  # only the z = 20, 40 and 60 contours, in black
plt.clabel(contour, fontsize=10, colors=('k','r','b'), fmt='%.4f')  # label the levels (font size, colours, decimals)

plt.subplot(233)
contour = plt.contour(X,Y,Z, 4, colors='k')  # an integer (4) asks matplotlib to choose the levels itself; black lines
plt.clabel(contour, fontsize=10, colors='b', fmt='%.2f')  # label the levels (font size, colour, decimals)

plt.subplot(234)
plt.contourf(X,Y,Z)  # filled contours; the trailing f means "filled"
plt.xticks(())  # remove the ticks
plt.yticks(())

plt.subplot(235)
cset = plt.contourf(X,Y,Z,6,cmap=plt.cm.hot)
plt.colorbar(cset)

plt.subplot(236)
cset = plt.contourf(X,Y,Z,6,alpha=1,vmin=0,vmax=100, cmap='hot_r')  # 6 requested levels, reversed colormap
plt.colorbar(cset)
contour = plt.contour(X,Y,Z,8,colors='k')  # 8 requested levels, drawn as black lines
plt.clabel(contour,fontsize=10,colors='k')
plt.scatter(0,0,color='r')
plt.show()

#colorslist = ['w','gainsboro','gray','aqua']
# Build a colormap named 'mylist' interpolated into N=200 colours
#cmaps = colors.LinearSegmentedColormap.from_list('mylist',colorslist,N=200)
#cmap='hot' 'BuGn', plt.get_cmap('YlOrBr_r'), mpl.cm.hot

　　聚类结果的可视化（1）

from itertools import cycle
import matplotlib.pyplot as plt

# Visualise a density-based clustering result in two panels.
# NOTE(review): db, X and n_clusters_ are defined outside this snippet; the code
# reads db.labels_ and db.core_sample_indices_ (presumably a fitted sklearn
# DBSCAN) and plots columns 0 and 2 of X — confirm.
plt.close('all')
plt.figure(figsize=(12,4))
plt.clf()

unique_labels = set(db.labels_)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)  # all-False vector, one entry per sample
core_samples_mask[db.core_sample_indices_] = True  # mark the core samples as True

# Left panel: outliers drawn in black
plt.subplot(121)
colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:  # samples labelled -1 are outliers
        # draw the outliers in black
        col = [0, 0, 0, 1]

    class_member_mask = (db.labels_ == k)  # True for every sample in this cluster

    xy = X[class_member_mask & core_samples_mask]  # core samples of this cluster: large markers
    plt.plot(xy[:, 0], xy[:, 2], 'o', markerfacecolor=tuple(col),markeredgecolor='k', markersize=14)

    xy = X[class_member_mask & ~core_samples_mask]  # non-core samples of this cluster: small markers
    plt.plot(xy[:, 0], xy[:, 2], 'o', markerfacecolor=tuple(col),markeredgecolor='k', markersize=6)

plt.title('对医院医疗耗材的异常值检测最佳聚类数: %d' % n_clusters_)
plt.xlabel(r'CQ类材料使用频率(%)')
plt.ylabel(r'单价200元以上CL类使用频率(%)')
#plt.show()


# Right panel: each cluster's members connected to the mean of its core samples
plt.subplot(122)
colors = cycle('bgrcmybgrcmybgrcmybgrcmy')
for k, col in zip(unique_labels, colors):
    class_member_mask = db.labels_ == k
    if k == -1:
        # outliers as black dots
        plt.plot(X[class_member_mask, 0], X[class_member_mask, 2], 'k' + '.')
    else:
        cluster_center = X[class_member_mask & core_samples_mask].mean(axis=0)
        plt.plot(X[class_member_mask, 0], X[class_member_mask, 2], col + '.')
        plt.plot(cluster_center[0], cluster_center[2], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)
        # one line from the centre to every member of the cluster
        for x in X[class_member_mask]:
            plt.plot([cluster_center[0], x[0]], [cluster_center[2], x[2]], col)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.xlabel(r'CQ类材料使用频率(%)')
plt.ylabel(r'单价200元以上CL类使用频率(%)')
plt.show()

　　聚类结果的可视化（2）

print(__doc__)

from time import time
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

# Fix the NumPy random seed so repeated runs draw the same random numbers
np.random.seed(42)

# Load the digits dataset and standardise its features
digits = load_digits()
data = scale(digits.data)

n_samples, n_features = data.shape
n_digits = len(np.unique(digits.target))  # number of distinct classes
labels = digits.target  # ground-truth labels, read by bench_k_means

sample_size = 300  # sample size passed to silhouette_score in bench_k_means

print("n_digits: %d, \t n_samples %d, \t n_features %d"
      % (n_digits, n_samples, n_features))


# Header of the benchmark table
print(82 * '_')
print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')

def bench_k_means(estimator, name, data):
    """Fit *estimator* on *data* and print one tab-separated benchmark row.

    The row holds: name, fit time in seconds, inertia, homogeneity,
    completeness, V-measure, adjusted Rand index, adjusted mutual information
    and silhouette score. The label-based scores compare the module-level
    ``labels`` with ``estimator.labels_``; the silhouette score uses the
    module-level ``sample_size``.
    """
    started = time()
    estimator.fit(data)
    # Measure right after fitting so the metric computations are not timed
    elapsed = time() - started
    predicted = estimator.labels_
    row = (
        name,
        elapsed,
        estimator.inertia_,
        metrics.homogeneity_score(labels, predicted),
        metrics.completeness_score(labels, predicted),
        metrics.v_measure_score(labels, predicted),
        metrics.adjusted_rand_score(labels, predicted),
        metrics.adjusted_mutual_info_score(labels, predicted,
                                           average_method='arithmetic'),
        metrics.silhouette_score(data, predicted,
                                 metric='euclidean',
                                 sample_size=sample_size),
    )
    print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' % row)

# Benchmark three initialisation strategies on the scaled digits data
bench_k_means(KMeans(init='k-means++', n_clusters=n_digits, n_init=10),
              name="k-means++", data=data)

bench_k_means(KMeans(init='random', n_clusters=n_digits, n_init=10),
              name="random", data=data)

# in this case the seeding of the centers is deterministic, hence we run the
# kmeans algorithm only once with n_init=1
pca = PCA(n_components=n_digits).fit(data)
bench_k_means(KMeans(init=pca.components_, n_clusters=n_digits, n_init=1),
              name="PCA-based",
              data=data)
print(82 * '_')

# #############################################################################
# Visualize the results on PCA-reduced data

reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
kmeans.fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in mesh. Use last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')

# Overlay the PCA-reduced samples as small black dots
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
          'Centroids are marked with white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

决策树可视化

1. 安装绘图软件GraphViz（graphviz-2.38.zip 下载），并将解压路径添加到环境变量（通过我的电脑改环境变量貌似不行）

# Append the GraphViz bin directory to PATH for the current process only
import os
os.environ["PATH"] += os.pathsep + 'D:/graphviz-2.38/release/bin/'

# 安装相关包（在命令行中执行；若在 notebook 中执行，请在前面加 !）
pip install graphviz pydotplus

2. 绘制决策树

#import io
#import graphviz
import pydotplus
from sklearn.datasets import load_iris
from sklearn import tree
from IPython.display import Image

# Fit a decision tree classifier on the iris data
iris = load_iris()
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)
#tree.plot_tree(clf.fit(iris.data, iris.target))

#dot_data = tree.export_graphviz(clf, out_file=None)  # black-and-white version
# Export the tree as DOT source text (out_file=None returns a string), with
# feature/class names and filled, rounded nodes
dot_data = tree.export_graphviz(clf, out_file=None, 
                      feature_names=iris.feature_names,  
                      class_names=iris.target_names,  
                      filled=True, rounded=True,  
                      special_characters=True)
#dot_data = io.StringIO()
#tree.export_graphviz(clf, out_file=dot_data)
#graph = graphviz.Source(dot_data)
#graph.render("iris")  # export as iris.pdf
#graph

# Render the DOT source to PNG and display it inline
graph = pydotplus.graphviz.graph_from_dot_data(dot_data)
Image(graph.create_png())

# ---------------------------------------------------
# Alternative: plot a single tree of an XGBoost classifier
#from numpy import loadtxt
from sklearn.datasets import load_iris
from xgboost import XGBClassifier
from xgboost import plot_tree
import matplotlib.pyplot as plt
# load data
#iris = loadtxt('pima-indians-diabetes.csv', delimiter=",")
iris = load_iris()
# split data into X and y
X = iris.data
y = iris.target
# fit the model on the training data
model = XGBClassifier()
model.fit(X, y)
# plot a single tree (the one selected by num_trees=4) on a high-dpi figure
fig = plt.figure(dpi=180)
ax = plt.subplot(1,1,1)
plot_tree(model, num_trees=4, ax = ax)
plt.show()

时间序列数据可视化

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns
from datetime import datetime
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
#import statsmodels.api as sm
#import statsmodels.formula.api as smf
#import statsmodels.tsa.api as smt
#sm.graphics.tsa.plot_acf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from matplotlib.ticker import MultipleLocator, FormatStrFormatter

def plot_acf_pacf(y, lags=12):
    """Plot a series and its lag-differenced version with their ACF and PACF.

    Draws a 3x2 grid. The left column shows *y*; the right column shows
    ``y.diff(lags).dropna()``. Each column stacks the series itself, its
    autocorrelation plot and its partial autocorrelation plot.

    y: object supporting ``.plot(ax=...)``, ``.diff()`` and ``.dropna()``
       (presumably a pandas Series).
    lags: period passed to ``y.diff`` for the right-hand column.
    """
    plt.figure(figsize=(14, 8))
    layout = (3, 2)

    def tsplot(y, layout, i, plotlags=20, title=''):
        """Draw series, ACF and PACF in column *i* of the grid."""
        ts_ax   = plt.subplot2grid(layout, (0, i))
        acf_ax  = plt.subplot2grid(layout, (1, i))
        pacf_ax = plt.subplot2grid(layout, (2, i))
        y.plot(ax=ts_ax)
        ts_ax.set_title(title)
        # plot_pacf only accepts lags below half the sample size, so cap the
        # requested number of lags for short (e.g. differenced) series.
        plotlags = max(1, min(plotlags, len(y) // 2 - 1))
        lag_ticks = list(range(0, plotlags + 1, 2))
        plot_acf(y, lags=plotlags, ax=acf_ax)
        #acf_ax.axhline(y=0.1,ls="--",c="r")  # optional horizontal reference line
        acf_ax.set_xticks(lag_ticks)
        plot_pacf(y, lags=plotlags, ax=pacf_ax)
        #pacf_ax.axhline(y=0.1,ls="--",c="r")  # optional horizontal reference line
        pacf_ax.set_xticks(lag_ticks)

    tsplot(y, layout, 0, plotlags=20, title='Original Series')
    # y.diff(lags) is a single difference at lag `lags` (a seasonal difference),
    # not an order-`lags` difference, so the title says so.
    tsplot(y.diff(lags).dropna(), layout, 1, plotlags=20, title='Lag-%s Differencing'%(lags))
    plt.tight_layout()
    plt.show()


# NOTE(review): income2 and payment2 are not defined in this snippet;
# presumably time series prepared elsewhere — confirm.
plot_acf_pacf(income2, lags=12)
plot_acf_pacf(payment2, lags=12)

散点图+边缘直方图

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd

# 获取数据
#url = 'https://github.com/cocolone2/datasets/blob/master/mpg_ggplot2.csv'
#df = pd.read_csv(url, delimiter='\t')
#df = pd.read_clipboard()

# Create the figure and split it into a 4 x 4 grid
# NOTE(review): df is not loaded in this snippet (the read_csv lines above are
# commented out); the code reads its displ, hwy, cty and manufacturer columns.
fig = plt.figure(figsize=(16, 8), dpi=80, facecolor='white')
grid = plt.GridSpec(4, 4, hspace=0.5, wspace=0.2)

# Add the subplots: the main plot takes all columns but the last, the marginal
# plot takes the last column
ax_main = fig.add_subplot(grid[:, :-1])
ax_right = fig.add_subplot(grid[:, -1], xticklabels=[], yticklabels=[])
#ax_bottom = fig.add_subplot(grid[-1, :-1], xticklabels=[], yticklabels=[])

# Bubble chart in the main axes
ax_main.scatter('displ', 'hwy'
                , s=df.cty * 4 # marker size
                , data=df # data set that the column names refer to
                , c=df.manufacturer.astype('category').cat.codes # one colour code per manufacturer
                , cmap='tab10' # colour map
                , edgecolors='gray' # marker edge colour
                , linewidth=.5 # edge line width
                , alpha=.9) # opacity
# Bottom histogram (disabled)
#ax_bottom.hist(df.displ, 40, histtype='stepfilled', orientation='vertical', color='deeppink')
#ax_bottom.invert_yaxis()  # flip the y axis

# 绘制右边直方图
def normfun(x, mu, sigma):
    """Return the normal probability density N(mu, sigma²) evaluated at *x*.

    x: scalar or NumPy array of evaluation points.
    mu: mean of the distribution.
    sigma: standard deviation of the distribution.
    """
    exponent = -((x - mu) ** 2) / (2 * sigma ** 2)
    normaliser = sigma * np.sqrt(2 * np.pi)
    return np.exp(exponent) / normaliser

# Normal density fitted to hwy (sample mean and standard deviation), evaluated
# over the observed range in steps of 0.1
x = np.arange(df.hwy.min(), df.hwy.max(), 0.1)
mu,sigma = df.hwy.mean(), df.hwy.std()
y = normfun(x, mu, sigma)
#plt.plot(y, x, color='r')
#plt.hist(df.hwy, bins=40, histtype='stepfilled', orientation='horizontal', color='b', density=True)
# Horizontal orientation: density on the x axis, hwy on the y axis
ax_right.plot(y, x, color='r')  #'--', , linewidth=2
ax_right.hist(df.hwy, bins=40, histtype='stepfilled', orientation='horizontal', color='b', density=True)  #vertical
# Upper limits of both axes, used to place the annotation near the top right
x0 = ax_right.get_xlim()[-1]
y0 = ax_right.get_ylim()[-1]
#arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=.2")
# sigma holds the standard deviation, so it is labelled sigma (the label used
# to read sigma^2 while showing the standard deviation)
ax_right.annotate(r"$\mu$="+str(round(mu,2))+"\n"+r"$\sigma$="+str(round(sigma,2)),
                  xy=(x0*2/3, y0*15/16), xycoords="data", xytext=(-5, -5),
                  textcoords="offset points", fontsize=12)

# Decorate the figure
plt.rcParams['font.sans-serif'] = ['Simhei']
ax_main.set(title='边缘直方图 \n 发动机排量 vs 公路里程/加仑',
            xlabel='发动机排量(L)', 
            ylabel='公路里程/加仑')
ax_main.title.set_fontsize(20)
#ax_main.set_title('边缘直方图 \n 发动机排量 vs 公路里程/加仑', fontsize=20)
# Enlarge the axis labels and all tick labels of the main plot
for item in ([ax_main.xaxis.label, ax_main.yaxis.label] + ax_main.get_xticklabels() + ax_main.get_yticklabels()):
    item.set_fontsize(18)

#for item in [ax_bottom, ax_right]:
#    item.set_xticks([]) # remove the histogram ticks
#    item.set_yticks([])

xlabels = ax_main.get_xticks().tolist()
ax_main.xaxis.set_major_locator(mticker.FixedLocator(xlabels))  # pin the x ticks of the scatter plot to their current positions
ax_main.set_xticklabels(['{:,.1f}'.format(x) for x in xlabels])  # format every tick value as text with one decimal place
#plt.tight_layout()
plt.show()

import matplotlib
# Font settings so Chinese text and the minus sign render (assumes SimHei is installed)
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family']='sans-serif'
matplotlib.rcParams['axes.unicode_minus'] = False

# Correlated random sample
x = np.random.randn(200);y = x+np.random.randn(200)*1.0
# Axes rectangles are [left, bottom, width, height] in figure coordinates:
# main scatter, histogram above it, histogram to its right
ax1 = plt.axes([0.1, 0.1, 0.6, 0.6])
ax2 = plt.axes([0.1, 0.1+0.6+0.02, 0.6, 0.2])
ax3 = plt.axes([0.1+0.6+0.02, 0.1, 0.2, 0.6])
ax2.set_xticks([])
ax3.set_yticks([])
xymax = np.max([np.max(np.fabs(x)), np.max(np.fabs(y))])

ax1.scatter(x, y)
bin_width = 0.25
# Symmetric limit rounded up to a whole number of bins
lim = np.ceil(xymax/bin_width) * bin_width
ax1.set_xlim(-lim, lim)
ax1.set_ylim(-lim, lim)
# Include the right-most edge so the largest value falls inside a bin
bins = np.arange(-lim, lim + bin_width, bin_width)
ax2.hist(x, bins=bins)
ax3.hist(y, bins=bins, orientation='horizontal')
# Align the marginal histograms with the scatter axes (each axes used to be
# assigned its own limits, which did nothing)
ax2.set_xlim(ax1.get_xlim())
ax3.set_ylim(ax1.get_ylim())
plt.show()