论文实战之基于遗传算法的聚类
Genetic algorithm-based clustering technique
2015.3.27 使用pyevolve
数据产生
# IPython magic: render matplotlib figures inline in the notebook
%matplotlib inline
from scipy.stats import multivariate_normal
import matplotlib.pyplot as plt
import numpy as np
# Two well-separated 2-D Gaussians with unit covariance — the toy clustering data.
mean1 = [10,10]
cov1= [[1,0],[0,1]]
mean2 = [15,15]
cov2= [[1,0],[0,1]]
nm1=multivariate_normal(mean1,cov1)
nm2=multivariate_normal(mean2,cov2)
# draw 5 samples from each component
data1_1=nm1.rvs(5)
data1_2=nm2.rvs(5)
plt.scatter(data1_1[:,0],data1_1[:,1],c='r')
plt.scatter(data1_2[:,0],data1_2[:,1],c='b')
# stack the two samples row-wise into one (10, 2) data set
data=np.r_[data1_1,data1_2]
遗传算法部分
一开始程序老是出错,原因在于
- 初始化的时候是随意的
- 有时候某一个类是空的,改进版如下
from pyevolve import G1DList
from pyevolve import GSimpleGA
from pyevolve import Crossovers,Mutators,Initializators,Selectors,Consts
def form_clusters(x):
    """One k-means-style step driven by genome *x*.

    x is a flat G1DList of length K*N holding K cluster centres of
    dimension N (module globals K, N, data are used).  Returns
    (new_centers, clusters): the recomputed centres and, per centre,
    the list of data points assigned to it.
    """
    # unpack the flat genome into K centre vectors of length N
    # (e.g. for N=2, K=2 the first centre is x[0:2])
    centers=[np.array([x[i*N+j] for j in range(N)]) for i in range(K)]
    # create empty clusters
    clusters=[[] for i in range(K)]
    # distance matrix, shape (len(data), K): distance of every data point
    # to every centre (data must be an ndarray)
    clustMatrix=np.zeros((len(data),K))
    for i,d in enumerate(data):
        for j,c in enumerate(centers):
            clustMatrix[i,j]=np.linalg.norm(d-c)
    # index of the nearest centre for each data point (argmin over each row)
    closestIndex=np.argmin(clustMatrix,axis=1)
    # assign every point to its nearest centre
    for i,d in enumerate(data):
        clusters[closestIndex[i]].append(d)
    # recompute each centre as the mean of its assigned points
    new_centers=[np.average(np.array(clusters[i]),axis=0) for i in range(K)]
    # an empty cluster yields a NaN mean — keep the previous centre instead
    for i,c in enumerate(new_centers):
        if np.isnan(c).any():
            new_centers[i]=centers[i]
    return new_centers,clusters
# One-shot flag: the first eval_func call re-seeds the genome from random
# data points (pyevolve's default real initializer gave arbitrary values
# that often produced empty clusters).
pF=0
def eval_func(x):
    """pyevolve fitness: 1 / (total within-cluster distance) for genome x.

    Uses module globals K, N, data and form_clusters.  Also performs the
    KGA refinement step: the recomputed centres are written back into x.
    """
    global pF
    if pF==0:
        # first call only: set every centre coordinate from a randomly
        # chosen data point so no cluster starts empty
        pF=1
        for i in range(K):
            for j in range(N):
                tmp=data[np.random.randint(0,len(data))]
                x[i*N+j]=tmp[j]
    # re-cluster the data around the genome's centres
    centers,clusters=form_clusters(x)
    # write the recomputed centres back into the genome (KGA step)
    for i,c in enumerate(np.array(centers).ravel()):
        x[i]=c
    # Sum of per-point distances to their centre.  The original used
    # np.linalg.norm(matrix), which is the Frobenius norm (sqrt of the sum
    # of squares) — NOT the paper's metric M = sum of ||point - centre||.
    # axis=1 yields one Euclidean distance per point, as intended.
    s=0
    for i in range(K):
        if clusters[i]!=[]:
            s=s+np.linalg.norm(np.array(clusters[i])-np.array(centers[i]),axis=1).sum()
    return 1./s
N=2#num of dimensions
K=2#num of clusters
# override pyevolve's defaults before the GA objects are built
Consts.CDefGACrossoverRate=0.8
Consts.CDefGAMutationRate=0.001
Consts.CDefGAPopulationSize=2
Consts.CDefGAGenerations=100
# genome: flat list of K*N reals, N coordinates per cluster centre
genome=G1DList.G1DList(N*K)
genome.initializator.set(Initializators.G1DListInitializatorReal)
genome.evaluator.set(eval_func)
#genome.mutator.set(Mutators.G1DListMutatorRealRange)
genome.mutator.set(Mutators.G1DListMutatorRealGaussian)
genome.crossover.set(Crossovers.G1DListCrossoverSinglePoint)
ga = GSimpleGA.GSimpleGA(genome)
ga.selector.set(Selectors.GRouletteWheel)
# run the GA; argument is the stats-printing frequency (every 20 generations)
ga.evolve(20)
输出:
Gen. 0 (0.00%): Max/Min/Avg Fitness(Raw) [0.13(0.13)/0.09(0.09)/0.11(0.11)]
Gen. 20 (20.00%): Max/Min/Avg Fitness(Raw) [0.21(0.21)/0.21(0.21)/0.21(0.21)]
Gen. 40 (40.00%): Max/Min/Avg Fitness(Raw) [0.21(0.21)/0.21(0.21)/0.21(0.21)]
Gen. 60 (60.00%): Max/Min/Avg Fitness(Raw) [0.21(0.21)/0.21(0.21)/0.21(0.21)]
Gen. 80 (80.00%): Max/Min/Avg Fitness(Raw) [0.21(0.21)/0.21(0.21)/0.21(0.21)]
Gen. 100 (100.00%): Max/Min/Avg Fitness(Raw) [0.21(0.21)/0.21(0.21)/0.21(0.21)]
Total time elapsed: 1.062 seconds.
1/0.21大概是4.761904761904762,与论文里提到的误差2.22差的太远了
2015.3.29 使用deap
%matplotlib inline
from scipy.stats import multivariate_normal
import matplotlib.pyplot as plt
import numpy as np
from deap import base,creator,tools
import random
# Same two-Gaussian toy data, regenerated for the deap experiment
mean1 = [10,10]
cov1= [[1,0],[0,1]]
mean2 = [15,15]
cov2= [[1,0],[0,1]]
nm1=multivariate_normal(mean1,cov1)
nm2=multivariate_normal(mean2,cov2)
data1_1=nm1.rvs(5)
data1_2=nm2.rvs(5)
data=np.r_[data1_1,data1_2]
plt.scatter(data1_1[:,0],data1_1[:,1],c='r')
plt.scatter(data1_2[:,0],data1_2[:,1],c='b')
N=2#num of dimensions
K=2#num of clusters
data=data  # no-op; keeps `data` as the module-level data set name
toolbox=base.Toolbox()
# minimising fitness here: weight -1, evaluate returns the raw error sum
creator.create('maxFit',base.Fitness,weights=(-1,))
# an individual is a plain list of K centre arrays
creator.create('Individual',list,fitness=creator.maxFit)
def initCenter():
    # seed a centre with one randomly drawn data point (flattened to 1-D)
    return data[random.sample(range(len(data)),1),:].ravel()
toolbox.register('individual',tools.initRepeat,creator.Individual,initCenter,K)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
def form_clusters(x):
    """One k-means-style step: assign data to the K centres in x, recompute centres.

    x: deap individual holding K cluster centres.  Uses module globals
    data, K, toolbox.  Returns (new_centers, clusters).
    """
    # the chromosome consists of K cluster centres; clone so we do not
    # alias the individual while recomputing
    centers=toolbox.clone(x)
    # create empty clusters
    clusters=[[] for i in range(K)]
    # distance matrix, shape (len(data), K): every point vs every centre
    # (data must be an ndarray)
    clustMatrix=np.zeros((len(data),K))
    for i,d in enumerate(data):
        for j,c in enumerate(centers):
            clustMatrix[i,j]=np.linalg.norm(d-c)
    # index of the nearest centre for every data point (argmin per row)
    closestIndex=np.argmin(clustMatrix,axis=1)
    # assign each point to its nearest centre
    for i,d in enumerate(data):
        clusters[closestIndex[i]].append(d)
    # recompute centres as the mean of the assigned points
    new_centers=[np.average(np.array(clusters[i]),axis=0) for i in range(K)]
    # an empty cluster gives a NaN mean — keep the previous centre instead
    for i,c in enumerate(new_centers):
        if np.isnan(c).any():
            new_centers[i]=centers[i]
    return new_centers,clusters
def evaluate(x):
    """deap fitness: total within-cluster distance (minimised, weight -1).

    Side effect: the recomputed centres are written back into the
    individual (the KGA refinement step).
    """
    # re-cluster the data around the individual's centres
    centers,clusters=form_clusters(x)
    # Write the recomputed centres back into the individual.  The original
    # `x = toolbox.clone(centers)` only rebound the local name and never
    # modified the individual, so the refinement step was a no-op.
    x[:] = centers
    # Total of per-point distances to their centre.  The original
    # np.linalg.norm(matrix) computed the Frobenius norm, not the paper's
    # metric M = sum of ||point - centre||; axis=1 gives one Euclidean
    # distance per point.
    s=0
    for i in range(K):
        if clusters[i]!=[]:
            s=s+np.linalg.norm(np.array(clusters[i])-np.array(centers[i]),axis=1).sum()
    return s,
# wire the GA operators into the toolbox
toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxOnePoint)  # deap's built-in one-point crossover
toolbox.register("mutate", tools.mutGaussian, mu=0, sigma=1, indpb=0.1)
toolbox.register("select", tools.selTournament, tournsize=2)
def test():
    """Run a hand-rolled generational GA loop; return the final population."""
    pop=toolbox.population(n=10)
    # crossover probability, mutation probability, number of generations
    CXPB, MUTPB, NGEN = 0.8, 0.001, 100
    # evaluate the initial population (Python 2: map returns a list)
    fitness=map(toolbox.evaluate,pop)
    for p,f in zip(pop,fitness):
        p.fitness.values=f
    for g in range(NGEN):
        # select, then clone so offspring are independent copies
        offspring=map(toolbox.clone,toolbox.select(pop,len(pop)))
        # mate consecutive pairs in place; invalidate their fitness
        for child1,child2 in zip(offspring[::2],offspring[1::2]):
            if random.random() < CXPB:
                toolbox.mate(child1,child2)
                del child1.fitness.values
                del child2.fitness.values
        for mutant in offspring:
            if random.random() < MUTPB:
                toolbox.mutate(mutant)
                del mutant.fitness.values
        # re-evaluate only the individuals whose fitness was invalidated
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        fitnesses = map(toolbox.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit
        # full generational replacement
        pop[:] = offspring
    return pop
pop=test()  # run the GA
# report the final error of each individual (Python 2: map returns a list)
map(toolbox.evaluate,pop)
- 选择方法tools.selTournament,fitness返回的是误差之和s,权值为-1,
输出:
[(6.2601100607661895,)],比pyevolve的大
奇怪的是,无论初始总体是1还是10,误差都是这么大 - 选择方法换为toolbox.register("select", tools.selRoulette),fitness返回的是误差之和s,权值为-1,最终误差不太稳定,一般为10.56807481420261
- 选择方法:toolbox.register("select", tools.selRoulette)
更改权值creator.create('maxFit',base.Fitness,weights=(1,))变为+1,前两种情况,权值为-1,fitness返回的是误差之和s
现在权值为1,fitness返回return 1./s
输出变为0.15974160043403576,倒数(误差)为6.2601100607661895,可见selRoulette的时候要小心啊
2015.3.29更新:
我意识到,官方文档说明如果个体里边是np的array的话,应该在交叉的时候注意Inheriting from Numpy:http://deap.readthedocs.org/en/master/tutorials/advanced/numpy.html
因为np的数组的slice其实是同一个对象的不同view,因此,必须得用.copy()
>>> import numpy
>>> a = numpy.array([1,2,3,4])
>>> b = numpy.array([5,6,7,8])
>>> a[1:3], b[1:3] = b[1:3].copy(), a[1:3].copy()
>>> print(a)
[1 6 7 4]
>>> print(b)
[5 2 3 8]
对于view的解释,详见 http://wiki.scipy.org/Tentative_NumPy_Tutorial#head-4c1d53fe504adc97baf27b65513b4b97586a4fc5
将相应部分的代码变为
def cxOnePoint(ind1, ind2):
    """Crossover for two-centre individuals: swap the element at index 1.

    .copy() is required because the genes are numpy arrays — without it
    the two individuals would share views of the same memory.
    """
    size = len(ind1)  # NOTE(review): unused; kept for symmetry with deap's cxOnePoint
    cxpoint = 1
    ind1[cxpoint],ind2[cxpoint]=ind2[cxpoint].copy(),ind1[cxpoint].copy()
toolbox.register("mate", cxOnePoint)
因为我现在只有两个聚类中心,即每一个个体其实只有俩元素,每一个都是np的array,因此我就直接交换后一个array,即后一个类的聚类中心
总体有三个个体
结果[(0.30239333776854554,)],1/0.30239333776854554=3.306951162943307,很接近paper中的2.225498了
2015.3.29继续更新:
将变异和paper中的一样
def mute(ind):
    """Paper-style mutation attempt: v -> v*(1 +/- 2*delta), delta ~ U(0,1).

    NOTE(review): `d = d*(...)` only rebinds the loop variable; the
    individual `ind` itself is never modified, so this mutation is a
    no-op apart from the print.
    """
    for d in ind:
        delta=random.random()
        if delta<=0.5:
            d=d*(1+2*delta)
        else :
            d=d*(1-2*delta)
    print 'I m in mutation!'
toolbox.register("mutate", mute)
结果还是3.306951162943307
现在将遗传算法部分整理如下
# --- GA configuration (deap, 2 clusters, 2-D data) ---
N=2#num of dimensions
K=2#num of clusters
data=data  # no-op; keeps `data` as the module-level data set name
toolbox=base.Toolbox()
# maximising fitness now (weight +1); evaluate returns 1/error
creator.create('maxFit',base.Fitness,weights=(1,))
# an individual is a plain list of K centre arrays
creator.create('Individual',list,fitness=creator.maxFit)
def initCenter():
    # seed a centre with one randomly drawn data point (flattened to 1-D)
    return data[random.sample(range(len(data)),1),:].ravel()
toolbox.register('individual',tools.initRepeat,creator.Individual,initCenter,K)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
def form_clusters(x):
    """One k-means-style step: assign data to the K centres in x, recompute centres.

    x: deap individual holding K cluster centres.  Uses module globals
    data, K, toolbox.  Returns (new_centers, clusters).
    """
    # the chromosome consists of K cluster centres; clone to avoid aliasing
    centers=toolbox.clone(x)
    # create empty clusters
    clusters=[[] for i in range(K)]
    # distance matrix, shape (len(data), K): every point vs every centre
    # (data must be an ndarray)
    clustMatrix=np.zeros((len(data),K))
    for i,d in enumerate(data):
        for j,c in enumerate(centers):
            clustMatrix[i,j]=np.linalg.norm(d-c)
    # index of the nearest centre for every data point (argmin per row)
    closestIndex=np.argmin(clustMatrix,axis=1)
    # assign each point to its nearest centre
    for i,d in enumerate(data):
        clusters[closestIndex[i]].append(d)
    # recompute centres as the mean of the assigned points
    new_centers=[np.average(np.array(clusters[i]),axis=0) for i in range(K)]
    # an empty cluster gives a NaN mean — keep the previous centre instead
    for i,c in enumerate(new_centers):
        if np.isnan(c).any():
            new_centers[i]=centers[i]
    return new_centers,clusters
def evaluate(x):
    """deap fitness: 1 / (total within-cluster distance); maximised (weight +1).

    Side effect: the recomputed centres are written back into the
    individual (the KGA refinement step).
    """
    # re-cluster the data around the individual's centres
    centers,clusters=form_clusters(x)
    # Write the recomputed centres back into the individual.  The original
    # `x = toolbox.clone(centers)` only rebound the local name and never
    # modified the individual, so the refinement step was a no-op.
    x[:] = centers
    # Total of per-point distances to their centre.  The original
    # np.linalg.norm(matrix) computed the Frobenius norm, not the paper's
    # metric M = sum of ||point - centre||; axis=1 gives one Euclidean
    # distance per point.
    s=0
    for i in range(K):
        if clusters[i]!=[]:
            s=s+np.linalg.norm(np.array(clusters[i])-np.array(centers[i]),axis=1).sum()
    return 1./s,
toolbox.register("evaluate", evaluate)
def cxOnePoint(ind1, ind2):
    """Custom one-point crossover: with K=2 centres, swap the second centre.

    .copy() is required because the genes are numpy arrays; plain
    assignment would leave both individuals sharing the same memory.
    """
    size = len(ind1)  # NOTE(review): unused; kept for symmetry with deap's cxOnePoint
    cxpoint = 1
    ind1[cxpoint],ind2[cxpoint]=ind2[cxpoint].copy(),ind1[cxpoint].copy()
toolbox.register("mate", cxOnePoint)
def mute(ind):
    """Whole-centre mutation: scale each centre by (1 +/- 2*delta), delta ~ U(0,1).

    The original `for d in ind: d = d*(...)` only rebound the loop
    variable, leaving the individual unchanged; assigning through
    ind[i] actually mutates it.
    """
    for i in range(len(ind)):
        delta = random.random()
        if delta <= 0.5:
            ind[i] = ind[i] * (1 + 2 * delta)
        else:
            ind[i] = ind[i] * (1 - 2 * delta)
    print('I m in mutation!')
toolbox.register("mutate", mute)
# roulette-wheel selection requires a positive, maximised fitness
toolbox.register("select", tools.selRoulette)
def test():
    """Run the hand-rolled generational GA loop; return the final population."""
    pop=toolbox.population(n=4)
    # crossover probability, mutation probability, number of generations
    CXPB, MUTPB, NGEN = 0.8, 0.001, 100
    # evaluate the initial population (Python 2: map returns a list)
    fitness=map(toolbox.evaluate,pop)
    for p,f in zip(pop,fitness):
        p.fitness.values=f
    for g in range(NGEN):
        # select, then clone so offspring are independent copies
        offspring=map(toolbox.clone,toolbox.select(pop,len(pop)))
        # mate consecutive pairs in place; invalidate their fitness
        for child1,child2 in zip(offspring[::2],offspring[1::2]):
            if random.random() < CXPB:
                toolbox.mate(child1,child2)
                del child1.fitness.values
                del child2.fitness.values
        for mutant in offspring:
            if random.random() < MUTPB:
                toolbox.mutate(mutant)
                del mutant.fitness.values
        # re-evaluate only the individuals whose fitness was invalidated
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        fitnesses = map(toolbox.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit
        # full generational replacement
        pop[:] = offspring
    return pop
pop=test()  # run the GA
# report the final fitness of each individual (Python 2: map returns a list)
map(toolbox.evaluate,pop)
2015.3.30更新:3个聚类中心的情况,高斯分布产生的二维数据
变化点:
- 由于染色体中的每一个元素其实都是一个两元素的array,因此我也不把他们放到list里了,干脆都放到np的array里边
creator.create('Individual',np.ndarray,fitness=creator.maxFit)
- 交叉:
def cxOnePoint(ind1, ind2):
    """Swap the tails of two ndarray individuals at a random crossover point."""
    size = len(ind1)
    # random.randint(a, b) is inclusive on both ends: 1 <= cxpoint <= size-1
    cxpoint = random.randint(1,size-1)
    # .copy() is mandatory: numpy slices are views of the parent array
    ind1[cxpoint:],ind2[cxpoint:]=ind2[cxpoint:].copy(),ind1[cxpoint:].copy()
toolbox.register("mate", cxOnePoint)
数据部分:
# IPython magic: render matplotlib figures inline in the notebook
%matplotlib inline
from scipy.stats import multivariate_normal
import matplotlib.pyplot as plt
import numpy as np
from deap import base,creator,tools
import random
# Three 2-D Gaussians with unit covariance, unbalanced sizes (25/35/10)
mean1 = [10,10]
cov1= [[1,0],[0,1]]
mean2 = [15,15]
cov2= [[1,0],[0,1]]
mean3 = [20,20]
cov3= [[1,0],[0,1]]
nm1=multivariate_normal(mean1,cov1)
nm2=multivariate_normal(mean2,cov2)
nm3=multivariate_normal(mean3,cov3)
data1_1=nm1.rvs(25)
data1_2=nm2.rvs(35)
data1_3=nm3.rvs(10)
# stack all samples row-wise into one (70, 2) data set
data=np.r_[data1_1,data1_2,data1_3]
plt.scatter(data1_1[:,0],data1_1[:,1],c='r')
plt.scatter(data1_2[:,0],data1_2[:,1],c='b')
plt.scatter(data1_3[:,0],data1_3[:,1],c='y')
遗传算法部分
# --- GA configuration (deap, 3 clusters, 2-D data) ---
N=2#num of dimensions
K=3#num of clusters
data=data  # no-op; keeps `data` as the module-level data set name
toolbox=base.Toolbox()
# maximising fitness; evaluate returns 1/error
creator.create('maxFit',base.Fitness,weights=(1,))
# individuals now subclass np.ndarray: shape (K, N), one row per centre
creator.create('Individual',np.ndarray,fitness=creator.maxFit)
def initCenter():
    # seed a centre with one randomly drawn data point (flattened to 1-D)
    return data[random.sample(range(len(data)),1),:].ravel()
toolbox.register('individual',tools.initRepeat,creator.Individual,initCenter,K)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
def form_clusters(x):
    """One k-means-style step: assign data to the K centres in x, recompute centres.

    x: deap individual (ndarray, one row per centre).  Uses module globals
    data, K, toolbox.  Returns (new_centers, clusters).
    """
    # the chromosome consists of K cluster centres; clone to avoid aliasing
    centers=toolbox.clone(x)
    # create empty clusters
    clusters=[[] for i in range(K)]
    # distance matrix, shape (len(data), K): every point vs every centre
    # (data must be an ndarray)
    clustMatrix=np.zeros((len(data),K))
    for i,d in enumerate(data):
        for j,c in enumerate(centers):
            clustMatrix[i,j]=np.linalg.norm(d-c)
    # index of the nearest centre for every data point (argmin per row)
    closestIndex=np.argmin(clustMatrix,axis=1)
    # assign each point to its nearest centre
    for i,d in enumerate(data):
        clusters[closestIndex[i]].append(d)
    # recompute centres as the mean of the assigned points
    new_centers=[np.average(np.array(clusters[i]),axis=0) for i in range(K)]
    # an empty cluster gives a NaN mean — keep the previous centre instead
    for i,c in enumerate(new_centers):
        if np.isnan(c).any():
            new_centers[i]=centers[i]
    return new_centers,clusters
def evaluate(x):
    """deap fitness: 1 / (total within-cluster distance); maximised (weight +1).

    Side effect: the recomputed centres are written back into the
    individual (the KGA refinement step).
    """
    # re-cluster the data around the individual's centres
    centers,clusters=form_clusters(x)
    # Write the recomputed centres back into the individual.  The original
    # `x = toolbox.clone(centers)` only rebound the local name and never
    # modified the individual, so the refinement step was a no-op.
    x[:] = centers
    # Total of per-point distances to their centre.  The original
    # np.linalg.norm(matrix) computed the Frobenius norm, not the paper's
    # metric M = sum of ||point - centre||; axis=1 gives one Euclidean
    # distance per point.
    s=0
    for i in range(K):
        if clusters[i]!=[]:
            s=s+np.linalg.norm(np.array(clusters[i])-np.array(centers[i]),axis=1).sum()
    return 1./s,
toolbox.register("evaluate", evaluate)
def cxOnePoint(ind1, ind2):
    """One-point crossover on ndarray individuals: swap tails at a random point.

    .copy() is mandatory: numpy slices are views, so swapping without
    copying would corrupt both individuals.
    """
    size = len(ind1)
    # random.randint(a, b) is inclusive on both ends: 1 <= cxpoint <= size-1
    cxpoint = random.randint(1,size-1)
    ind1[cxpoint:],ind2[cxpoint:]=ind2[cxpoint:].copy(),ind1[cxpoint:].copy()
toolbox.register("mate", cxOnePoint)
def mute(ind):
    """Whole-centre mutation: scale each centre by (1 +/- 2*delta), delta ~ U(0,1).

    The original `for d in ind: d = d*(...)` only rebound the loop
    variable, leaving the individual unchanged; assigning through
    ind[i] writes into the individual's array and actually mutates it.
    """
    for i in range(len(ind)):
        delta = random.random()
        if delta <= 0.5:
            ind[i] = ind[i] * (1 + 2 * delta)
        else:
            ind[i] = ind[i] * (1 - 2 * delta)
    print('I m in mutation!')
toolbox.register("mutate", mute)
# roulette-wheel selection requires a positive, maximised fitness
toolbox.register("select", tools.selRoulette)
def test():
    """Run the hand-rolled generational GA loop; return the final population."""
    pop=toolbox.population(n=10)
    # crossover probability, mutation probability, number of generations
    CXPB, MUTPB, NGEN = 0.8, 0.001, 100
    # evaluate the initial population (Python 2: map returns a list)
    fitness=map(toolbox.evaluate,pop)
    for p,f in zip(pop,fitness):
        p.fitness.values=f
    for g in range(NGEN):
        # select, then clone so offspring are independent copies
        offspring=map(toolbox.clone,toolbox.select(pop,len(pop)))
        # mate consecutive pairs in place; invalidate their fitness
        for child1,child2 in zip(offspring[::2],offspring[1::2]):
            if random.random() < CXPB:
                toolbox.mate(child1,child2)
                del child1.fitness.values
                del child2.fitness.values
        for mutant in offspring:
            if random.random() < MUTPB:
                toolbox.mutate(mutant)
                del mutant.fitness.values
        # re-evaluate only the individuals whose fitness was invalidated
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        fitnesses = map(toolbox.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit
        # full generational replacement
        pop[:] = offspring
    return pop
pop=test()  # run the GA
# best fitness is 1/error, so invert it to report the clustering error
1./np.array(max(map(toolbox.evaluate,pop)))
结果
array([ 21.98945422])
2015.3.30更新:3个聚类中心的情况,iris数据集产生的4维数据
数据部分
# IPython magic: render matplotlib figures inline in the notebook
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from deap import base,creator,tools
import random
from sklearn import datasets
# iris: 150 samples, 4 features, 3 classes — the benchmark used in the paper
iris = datasets.load_iris()
遗传算法部分
由于和上一小节2015.3.30更新:3个聚类中心的情况一样,因此只改动了一行代码
data=iris.data
此时总体仍为10个
pop=toolbox.population(n=10)
结果array([ 15.21805375])
总体为5时为array([ 19.4343661])
低于paper中的数据97.10077
初步猜测性能变好的原因是:paper中的变异部分:
If the value at a gene position is v, after mutation it becomes v(1 ± 2·delta)
也就是对于染色体中的每一个坐标不管是x或y,都是单独变异的,而我的是一个聚类中心一起变异的!
原算法:
def mute(ind):
    """Original variant: one delta per centre, the whole centre scaled at once.

    NOTE(review): `d = d*(...)` only rebinds the loop variable — the
    individual is never actually changed, so this mutation is a no-op.
    """
    for d in ind:
        delta=random.random()
        if delta<=0.5:
            d=d*(1+2*delta)
        else :
            d=d*(1-2*delta)
    print 'I m in mutation!'
和paper一致后
def mute(ind):
    """Paper-faithful variant: every coordinate mutated with its own delta.

    NOTE(review): `x = x*(...)` only rebinds the inner loop variable —
    the individual is never actually changed, so this mutation is a
    no-op apart from the print.
    """
    for d in ind:
        for x in d:
            delta=random.random()
            if delta<=0.5:
                x=x*(1+2*delta)
            else :
                x=x*(1-2*delta)
    print 'I m in mutation!'
toolbox.register("mutate", mute)
实测两种方法下均变异了两次,聚类中心’整个‘变异,就是第一种情况结果array([ 15.71670705])
第二种情况,分别变异结果:array([ 15.47171319])
我汗,果然还是paper中的方法牛,我再增大一下变异概率试试,都从0.001变为0.1
第一种array([ 15.07933766])
第二种array([ 15.3066149])
此时第一种又略好了!
最终版代码:
# --- final version: deap GA, 3 clusters, 4-D iris data ---
N=4#num of dimensions (note: not actually used below)
K=3#num of clusters
data=iris.data
toolbox=base.Toolbox()
# maximising fitness; evaluate returns 1/error
creator.create('maxFit',base.Fitness,weights=(1,))
# individuals subclass np.ndarray: shape (K, N), one row per centre
creator.create('Individual',np.ndarray,fitness=creator.maxFit)
def initCenter():
    # seed a centre with one randomly drawn data point (flattened to 1-D)
    return data[random.sample(range(len(data)),1),:].ravel()
toolbox.register('individual',tools.initRepeat,creator.Individual,initCenter,K)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
def form_clusters(x):
    """Perform one k-means-style refinement step for individual *x*.

    The individual holds K cluster centres (rows).  Every point in the
    module-level ``data`` array is assigned to its nearest centre, after
    which each centre is replaced by the mean of its members.  A centre
    whose cluster came out empty keeps its previous position.

    Returns (new_centers, clusters).
    """
    centers = toolbox.clone(x)
    # pairwise point/centre distances: rows = data points, columns = centres
    dist = np.zeros((len(data), K))
    for row, point in enumerate(data):
        for col, centre in enumerate(centers):
            dist[row, col] = np.linalg.norm(point - centre)
    # nearest centre per data point
    nearest = np.argmin(dist, axis=1)
    # bucket the points by their nearest centre
    clusters = [[] for _ in range(K)]
    for row, point in enumerate(data):
        clusters[nearest[row]].append(point)
    # recompute every centre as the mean of its bucket
    new_centers = [np.average(np.array(bucket), axis=0) for bucket in clusters]
    # an empty bucket produces a NaN mean -> retain the old centre
    for idx, centre in enumerate(new_centers):
        if np.isnan(centre).any():
            new_centers[idx] = centers[idx]
    return new_centers, clusters
def evaluate(x):
    """deap fitness: 1 / (total within-cluster distance); maximised (weight +1).

    Side effect: the recomputed centres are written back into the
    individual (the KGA refinement step).
    """
    # re-cluster the data around the individual's centres
    centers,clusters=form_clusters(x)
    # Write the recomputed centres back into the individual.  The original
    # `x = toolbox.clone(centers)` only rebound the local name and never
    # modified the individual, so the refinement step was a no-op.
    x[:] = centers
    # Total of per-point distances to their centre.  The original
    # np.linalg.norm(matrix) computed the Frobenius norm, not the paper's
    # metric M = sum of ||point - centre||; axis=1 gives one Euclidean
    # distance per point (this explains results far below the paper's
    # reported 97.10077 for iris).
    s=0
    for i in range(K):
        if clusters[i]!=[]:
            s=s+np.linalg.norm(np.array(clusters[i])-np.array(centers[i]),axis=1).sum()
    return 1./s,
# wire the fitness function into the toolbox
toolbox.register("evaluate", evaluate)
def cxOnePoint(ind1, ind2):
    """Single-point crossover for ndarray individuals.

    Swaps the tails ind1[cxpoint:] / ind2[cxpoint:] in place.  An explicit
    copy is required because numpy slices are views into the parent array.
    """
    size = len(ind1)
    cxpoint = random.randint(1, size - 1)  # randint is inclusive on both ends
    tail = ind1[cxpoint:].copy()
    ind1[cxpoint:] = ind2[cxpoint:]
    ind2[cxpoint:] = tail
# use the view-safe custom crossover instead of deap's generic one
toolbox.register("mate", cxOnePoint)
def mute(ind):
    """Paper-style mutation: each gene v becomes v*(1 +/- 2*delta), delta ~ U(0,1).

    Every coordinate of every centre is mutated independently.  The
    original `for x in d: x = x*(...)` only rebound the inner loop
    variable, so the individual was never actually changed; writing
    through d[j] mutates the row view and therefore the individual.
    """
    for d in ind:  # d is a row view into the (K, N) individual
        for j in range(len(d)):
            delta = random.random()
            if delta <= 0.5:
                d[j] = d[j] * (1 + 2 * delta)
            else:
                d[j] = d[j] * (1 - 2 * delta)
    print('I m in mutation!')
toolbox.register("mutate", mute)
# roulette-wheel selection requires a positive, maximised fitness
toolbox.register("select", tools.selRoulette)
def test():
    """Run the hand-rolled generational GA loop; return the final population."""
    pop=toolbox.population(n=10)
    # crossover probability, mutation probability, number of generations
    CXPB, MUTPB, NGEN = 0.8, 0.001, 100
    # evaluate the initial population (Python 2: map returns a list)
    fitness=map(toolbox.evaluate,pop)
    for p,f in zip(pop,fitness):
        p.fitness.values=f
    for g in range(NGEN):
        # select, then clone so offspring are independent copies
        offspring=map(toolbox.clone,toolbox.select(pop,len(pop)))
        # mate consecutive pairs in place; invalidate their fitness
        for child1,child2 in zip(offspring[::2],offspring[1::2]):
            if random.random() < CXPB:
                toolbox.mate(child1,child2)
                del child1.fitness.values
                del child2.fitness.values
        for mutant in offspring:
            if random.random() < MUTPB:
                toolbox.mutate(mutant)
                del mutant.fitness.values
        # re-evaluate only the individuals whose fitness was invalidated
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        fitnesses = map(toolbox.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit
        # full generational replacement
        pop[:] = offspring
    return pop
pop=test()  # run the GA
# best fitness is 1/error, so invert it to report the clustering error
1./np.array(max(map(toolbox.evaluate,pop)))