#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 27 10:51:45 2019
@author: youxinlin
"""
import copy
import math
import random
import time
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# Upper bound of the random integers drawn when initialising the membership matrix U.
MAX = 10000.0
# Convergence tolerance: clustering stops once no membership changes by more than this.
Epsilon = 1e-7
def import_data_format_iris(file):
    """
    Load a comma-separated data file into feature rows and class labels.

    Every non-blank line is split on commas. All fields except the last are
    converted to ``float`` and form one feature row; the last field is kept
    as an unconverted string label. If your data is already in memory you do
    not need this function.

    Args:
        file: Path of the input file (passed through ``str`` before opening).

    Returns:
        A ``(data, cluster_location)`` tuple. ``data`` is a list of lists of
        floats and ``cluster_location`` is the list of label strings, both in
        file order.

    Raises:
        ValueError: If a feature field cannot be parsed as a float.
    """
    data = []
    cluster_location = []
    with open(str(file), 'r') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines (typically a trailing newline at the end of the
                # file) carry no sample and must not be parsed.
                continue
            # Take the label as "the last field" directly instead of relying
            # on a loop index that is undefined for single-field lines.
            *features, label = stripped.split(",")
            data.append([float(value) for value in features])
            cluster_location.append(label)
    print("加载数据完毕")
    return data, cluster_location
def randomize_data(data):
    """
    Shuffle the samples and keep a record of the permutation applied.

    Returns a ``(new_data, order)`` tuple where ``new_data[k]`` is
    ``data[order[k]]``, so ``order`` can later be used to undo the shuffle.
    """
    order = list(range(len(data)))
    random.shuffle(order)
    new_data = [data[source] for source in order]
    return new_data, order
def de_randomise_data(data, order):
    """
    Put shuffled rows back into their original positions.

    ``order`` is the permutation list returned by ``randomize_data``: the row
    at position ``k`` of ``data`` is written to slot ``order[k]`` of the
    result. Slots that ``order`` never names stay as empty lists.
    """
    restored = [[] for _ in range(len(data))]
    for position, original_slot in enumerate(order):
        restored[original_slot] = data[position]
    return restored
def print_matrix(list):
    """
    Print a matrix one row per line.
    """
    # The parameter name shadows the builtin but is kept for caller compatibility.
    for row in list:
        print(row)
def initialize_U(data, cluster_number):
    """
    Build a random membership matrix U whose rows each sum to 1.

    One row is produced per sample in ``data`` and one column per cluster.
    Each entry starts as a random integer in ``[1, int(MAX)]`` (``MAX`` is the
    module-level constant) and is then divided by its row total.
    """
    upper_bound = int(MAX)
    U = []
    for _ in range(len(data)):
        draws = [random.randint(1, upper_bound) for _ in range(cluster_number)]
        # Accumulate as a float, left to right, before normalising the row.
        row_total = 0.0
        for value in draws:
            row_total += value
        U.append([value / row_total for value in draws])
    return U
def distance(point, center):
    """
    Return the Euclidean distance between two points given as sequences.

    Returns -1 when the two sequences do not have the same length.
    """
    if len(point) != len(center):
        return -1
    squared_total = 0.0
    for coordinate, center_coordinate in zip(point, center):
        squared_total += abs(coordinate - center_coordinate) ** 2
    return math.sqrt(squared_total)
def end_conditon(U, U_old):
    """
    Termination test: True once U has stopped changing between iterations.

    Returns False as soon as any entry of ``U`` differs from the matching
    entry of ``U_old`` by more than the module-level ``Epsilon``. The column
    count is taken from the first row of ``U``.
    """
    return not any(
        abs(U[row][col] - U_old[row][col]) > Epsilon
        for row in range(len(U))
        for col in range(len(U[0]))
    )
def normalise_U(U):
    """
    Harden the membership matrix in place once clustering has finished.

    In every row, each entry equal to the row maximum becomes 1 and every
    other entry becomes 0 (ties therefore yield several 1s). The column count
    is taken from the first row. The same ``U`` object is returned.
    """
    for row in U:
        peak = max(row)
        for col in range(len(U[0])):
            row[col] = 1 if row[col] == peak else 0
    return U
# m的最佳取值范围为[1.5,2.5]
def fuzzy(data, cluster_number, m):
    """
    Run fuzzy C-means clustering and return the hardened membership matrix U.

    Starting from a random membership matrix, the function alternates between
    recomputing the cluster centres and recomputing the memberships until
    ``end_conditon`` reports that U has stopped changing. The result is then
    passed through ``normalise_U``.

    Args:
        data: Sequence of samples, each a sequence of numeric features of the
            same length.
        cluster_number: Number of clusters (at least 1).
        m: Fuzzifier (membership exponent); must be greater than 1.

    Returns:
        The membership matrix as a list of rows (one per sample, one column
        per cluster) after ``normalise_U``. An empty list for empty ``data``.

    Raises:
        ValueError: If ``m <= 1`` or ``cluster_number < 1``.
    """
    if m <= 1:
        # The membership update uses the exponent 2 / (m - 1).
        raise ValueError("m must be greater than 1")
    if cluster_number < 1:
        raise ValueError("cluster_number must be at least 1")
    sample_count = len(data)
    if sample_count == 0:
        return []
    feature_count = len(data[0])
    exponent = 2 / (m - 1)

    # Initialise the membership matrix U.
    U = initialize_U(data, cluster_number)
    # Update U until it converges.
    while True:
        # Keep a copy so the termination condition can be checked afterwards.
        U_old = copy.deepcopy(U)

        # Compute the cluster centres as membership-weighted means.
        C = []
        for j in range(cluster_number):
            # The weights depend only on the sample and cluster, so compute
            # them once per cluster rather than once per feature.
            weights = [U[k][j] ** m for k in range(sample_count)]
            weight_total = sum(weights)
            C.append([
                sum(weights[k] * data[k][i] for k in range(sample_count)) / weight_total
                for i in range(feature_count)
            ])

        # Update U from the sample-to-centre distances.
        for i in range(sample_count):
            distances = [distance(data[i], C[j]) for j in range(cluster_number)]
            coincident = [j for j, d in enumerate(distances) if d == 0]
            if coincident:
                # The sample sits exactly on one or more centres, so the ratio
                # formula would divide by zero. Give those centres the whole
                # membership, shared equally, and every other centre none.
                share = 1.0 / len(coincident)
                for j in range(cluster_number):
                    U[i][j] = share if j in coincident else 0.0
                continue
            for j in range(cluster_number):
                ratio_sum = 0.0
                for k in range(cluster_number):
                    ratio_sum += (distances[j] / distances[k]) ** exponent
                U[i][j] = 1 / ratio_sum

        if end_conditon(U, U_old):
            print("结束聚类")
            break
    print("标准化 U")
    U = normalise_U(U)
    return U
# def checker_iris(final_location):
# """
# 和真实的聚类结果进行校验比对
# """
# right = 0.0
# for k in range(0, 2):
# checker = [0, 0, 0]
# for i in range(0, 50):
# for j in range(0, len(final_location[0])):
# if final_location[i + (50 * k)][j] == 1: # i+(50*k)表示 j表示第j类
# checker[j] += 1 # checker分别统计每一类分类正确的个数
# right += max(checker) # 累加分类正确的个数
# print('分类正确的个数是:', right)
# answer = right / 150 * 100
# return "准确率:" + str(answer) + "%"
#
# 计算每一簇的数据量
def tongji(final_location, cluster_number):
    """
    Count how many samples are assigned to each cluster.

    ``final_location`` is a hardened membership matrix; entry ``j`` of the
    returned list is the number of rows whose column ``j`` equals 1.
    """
    return [
        sum(1 for row in final_location if row[cluster] == 1)
        for cluster in range(cluster_number)
    ]
# 得到每个数据属于聚类中的哪一个种类
def getClusters(membership_mat, data):
    """
    Return, for every sample, the index of its strongest cluster.

    ``data`` is used only for its length. When several clusters share the
    maximum membership, the highest column index wins.
    """
    return [
        max(enumerate(membership_mat[sample]), key=lambda pair: (pair[1], pair[0]))[0]
        for sample in range(len(data))
    ]
# 把数据属于哪一个种类归为正常(0)异常(1)
def suibian(real_label, cluster_number):
    """
    Collapse cluster ids into binary labels in place: normal (0) or abnormal (1).

    A cluster whose size exceeds ``int(len(real_label) / (2 * cluster_number))``
    is treated as normal and its members are relabelled 0; members of every
    other cluster are relabelled 1. Labels outside ``range(cluster_number)``
    are left unchanged.

    Args:
        real_label: Mutable sequence of cluster ids; it is modified in place.
        cluster_number: Number of clusters (at least 1).

    Returns:
        The same ``real_label`` object, now holding binary labels.

    Raises:
        ValueError: If ``cluster_number < 1``.
    """
    if cluster_number < 1:
        raise ValueError("cluster_number must be at least 1")
    sample_count = len(real_label)
    threshold = int(sample_count / (2 * cluster_number))

    # Count every cluster from the untouched labels BEFORE rewriting anything.
    # The new values 0 and 1 are also valid cluster ids, so counting while
    # relabelling would mix already-relabelled samples into clusters 0 and 1.
    a = [
        sum(1 for label in real_label if label == cluster)
        for cluster in range(cluster_number)
    ]
    binary_of = {
        cluster: (0 if size > threshold else 1)
        for cluster, size in enumerate(a)
    }
    for index in range(sample_count):
        current = real_label[index]
        real_label[index] = binary_of.get(current, current)

    # Diagnostic output: number of normal samples, per-cluster sizes, the
    # final labels and the unrounded threshold.
    b = [sum(1 for label in real_label if label == 0)]
    print(b)
    print(a)
    print(real_label)
    print(len(real_label) / (2 * cluster_number))
    return real_label
def accuracy(cluster_labels, class_labels, data):
    """
    Compute accuracy, precision and recall (in percent) for binary labels.

    Predicted labels are compared as the integers 0/1 and true labels as the
    strings '0'/'1'. Because a clustering does not say which cluster is the
    positive class, every metric is reported twice: index 0 treats label 1 as
    positive, index 1 treats label 0 as positive.

    Args:
        cluster_labels: Predicted labels (0 or 1), one per sample.
        class_labels: True labels ('0' or '1'), one per sample.
        data: The samples; only its length is used, as the number of
            positions to compare.

    Returns:
        ``(accuracy, precision, recall)``, each a two-element list of
        percentages. A metric whose denominator is zero is reported as 0.0.
    """
    def ratio(numerator, denominator):
        # A metric is undefined when nothing falls into its denominator (for
        # example precision when the class is never predicted). Report 0.0
        # instead of raising ZeroDivisionError.
        return float(numerator) / denominator if denominator else 0.0

    # Confusion counts with label 1 as the positive class.
    tp = tn = fp = fn = 0
    for i in range(len(data)):
        predicted, actual = cluster_labels[i], class_labels[i]
        if predicted == 1 and actual == '1':
            tp += 1
        elif predicted == 0 and actual == '0':
            tn += 1
        elif predicted == 1 and actual == '0':
            fp += 1
        elif predicted == 0 and actual == '1':
            fn += 1

    # With label 0 as the positive class the counts simply swap roles:
    # tp <-> tn and fp <-> fn, so no second pass over the data is needed.
    total = tp + tn + fp + fn
    overall = ratio(tp + tn, total) * 100
    accuracy_pct = [overall, overall]
    precision_pct = [ratio(tp, tp + fp) * 100, ratio(tn, tn + fn) * 100]
    recall_pct = [ratio(tp, tp + fn) * 100, ratio(tn, tn + fp) * 100]
    return accuracy_pct, precision_pct, recall_pct
if __name__ == '__main__':
    # Settings shared by every step of this run.
    n_clusters = 10
    fuzzifier = 2

    # Load the samples and the true labels from the CSV file.
    samples, cluster_location = import_data_format_iris("data1--fcm(1).csv")
    # Shuffle the samples, remembering the permutation so it can be undone.
    samples, order = randomize_data(samples)
    # Standardise the features before clustering.
    scaler = StandardScaler()
    scaled_samples = scaler.fit_transform(samples)

    start = time.time()
    # Run fuzzy C-means on the standardised samples.
    final_location = fuzzy(scaled_samples, n_clusters, fuzzifier)
    # Restore the original sample order.
    final_location = de_randomise_data(final_location, order)

    # Cluster index per sample, then collapsed to binary labels in place.
    real_label = getClusters(final_location, scaled_samples)
    print(real_label)
    zuizhong_label = suibian(real_label, n_clusters)

    # Evaluate against the labels read from the file.
    a, p, r = accuracy(zuizhong_label, cluster_location, samples)
    new_data = tongji(final_location, n_clusters)
    print(new_data)
    print("Accuracy = " + str(a))
    print("Precision = " + str(p))
    print("Recall = " + str(r))
    print("用时:{0}".format(time.time() - start))