中国城市聚类###

# -*- coding: utf-8 -*-
kmeans算法
"""
Created on Thu May 18 22:55:45 2017

@author: sfzyk
"""
import numpy as np
#import sklearn as skl
from sklearn.cluster import KMeans
import os 
# Switch the working directory so that the relative data-file name passed to
# loaddata() below resolves.
# NOTE(review): this is a hard-coded absolute Windows path; it must be adjusted
# (or removed) when the script is run from another machine or directory.
os.chdir(r"D:\mechine_learning\mooc_data")
def loaddata(file, encoding=None):
    """Read a comma-separated text file of named numeric rows.

    Each non-blank line is expected to look like ``name,v1,v2,...``: the
    first field is kept as a string and every remaining field is converted
    with ``float``.

    Args:
        file: Path of the text file to read.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform default, which is what a plain ``open(file)``
            uses.

    Returns:
        A ``(city_name, city_data)`` tuple: a list of the first fields and a
        parallel list of ``list[float]`` rows.

    Raises:
        ValueError: If a value field cannot be parsed as a float.
        OSError: If the file cannot be opened.
    """
    city_name = []
    city_data = []
    # The context manager guarantees the handle is closed, even if a row
    # fails to parse.
    with open(file, encoding=encoding) as fr:
        for line in fr:
            # Skip blank lines (e.g. a trailing empty line) instead of
            # recording a nameless city with an empty feature row.
            if not line.strip():
                continue
            # Drop the line terminator so a single-column line does not keep
            # "\n" inside the name.
            name, *values = line.rstrip("\r\n").split(",")
            city_name.append(name)
            city_data.append([float(value) for value in values])
    return city_name, city_data
# Load the row names and their numeric feature rows from the data file.
city_name, city_data = loaddata("31省市居民家庭消费水平-city.txt")

# Fit k-means with 10 clusters; label[k] is the cluster index assigned to
# the k-th row of city_data.
km = KMeans(n_clusters=10)
label = km.fit_predict(city_data)

# One scalar per cluster: the sum of that cluster centre's coordinates.
expenses = np.sum(km.cluster_centers_, axis=1)

# Every entry has the layout [centre_sum, name, name, ...].
city_cluster = [[centre_sum] for centre_sum in expenses]
for name, cluster_idx in zip(city_name, label):
    city_cluster[cluster_idx].append(name)

# Order the clusters by their centre sum (ascending) before printing.
city_cluster.sort(key=lambda members: members[0])

for members in city_cluster:
    print(members)

学生上网数据聚类###

DBSCAN算法

# -*- coding: utf-8 -*-
"""
Created on Mon May 22 16:24:53 2017

@author: sfzyk
"""
import numpy as np
import sklearn as skl
from sklearn import metrics
import matplotlib.pyplot as plt

 
# mac2id maps a MAC address to the index of its record in onlinetimes;
# onlinetimes holds one (start_hour, online_time) tuple per MAC address.
mac2id = dict()
onlinetimes = []
# The explicit encoding is required here: a "# -*- coding -*-" header only
# declares the encoding of the source file itself, not of files opened at
# run time. The context manager closes the file once it has been read.
with open("学生月上网时间分布-TestData.txt", encoding='utf-8') as f:
    for line in f:
        # Ignore blank lines, which would otherwise raise IndexError below.
        if not line.strip():
            continue
        fields = line.split(',')
        mac = fields[2]
        onlinetime = int(fields[6])
        # Field 4 is split on a space and then on ':'; only the leading
        # integer of the second part (presumably the hour of a
        # "<date> HH:MM..." timestamp) is kept.
        starttime = int(fields[4].split(' ')[1].split(':')[0])
        if mac not in mac2id:
            mac2id[mac] = len(onlinetimes)
            onlinetimes.append((starttime, onlinetime))
        else:
            # Overwrite with a plain tuple. Wrapping it in a list (as before)
            # made this entry a different shape from first-time entries and
            # broke the conversion to a regular (n, 2) array.
            onlinetimes[mac2id[mac]] = (starttime, onlinetime)
# One row per MAC address: column 0 is the start hour, column 1 the online time.
real_X = np.array(onlinetimes).reshape((-1, 2))

# Import the estimator from its submodule explicitly: "import sklearn" alone
# does not guarantee that sklearn.cluster is available as an attribute.
from sklearn.cluster import DBSCAN

# Cluster on the first column only (the start hour); the 0:1 slice keeps X
# two-dimensional with shape (n_samples, 1).
X = real_X[:, 0:1]
dbscan = DBSCAN(eps=0.03, min_samples=20).fit(X)
labels = dbscan.labels_

# Samples labelled -1 are counted as noise.
ratio = np.count_nonzero(labels == -1) / len(labels)
print("noise ratio %f"%ratio)

# The noise label, when present, is not counted as a cluster.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print("Estimated number of clusters:%d "%n_clusters_)

# silhouette_score is only defined for 2 <= n_labels <= n_samples - 1 and
# raises ValueError otherwise (e.g. when every sample is noise), so guard it.
n_labels = len(set(labels))
if 2 <= n_labels <= len(labels) - 1:
    print("Silhouette coefficient:%0.3f"%metrics.silhouette_score(X,labels))
else:
    print("Silhouette coefficient: undefined for %d distinct label(s)" % n_labels)

for i in range(n_clusters_):
    print("Clusters ",i,":")
    # flatten() turns the (k, 1) column into a flat sequence of k values.
    print(list(X[labels==i].flatten()))

# Histogram of the start hours using 24 bins.
plt.hist(X, 24)
# Display the figure when run as a plain script; without this call the
# histogram is built but never shown outside an interactive/inline backend.
plt.show()
    

这里的 Silhouette coefficient 即轮廓系数,用于评价聚类效果的好坏。