import xlrd
import xlwt
from datetime import date,datetime
import os
from sklearn.cluster import KMeans
import collections
import pandas
import numpy
import re
def opp(ggv):
    """Read one column of values from a workbook.

    Opens the workbook at path ``ggv`` with xlrd, looks up the sheet named
    '准确概率计算逻辑-存在竞品对比部分' and returns the values of column
    index 7 as a list, with the first row dropped (presumably a header row;
    confirm against the actual workbooks).

    Raises whatever xlrd raises when the file cannot be opened or the sheet
    does not exist.
    """
    sheet = xlrd.open_workbook(ggv).sheet_by_name(
        u'准确概率计算逻辑-存在竞品对比部分'
    )
    # Slice off row 0 so only the rows after the first one are returned.
    return sheet.col_values(7)[1:]
def k_means(pp):
    """Return the centre of the most populated of three k-means clusters.

    ``pp`` is any iterable of numeric values (presumably prices; confirm
    against the caller). The values are clustered in one dimension into
    three groups, and the centre of the cluster with the most members is
    returned, truncated to ``int``.

    Returns 0 when ``pp`` holds three or fewer distinct values, in which
    case no clustering is attempted.
    """
    pv = list(pp)
    # Guard: the original threshold is kept - strictly more than three
    # distinct values are required before clustering.
    if len(set(pv)) <= 3:
        return 0

    # KMeans expects a 2-D (n_samples, n_features) array: one column here.
    samples = numpy.array([pv]).T
    estimator = KMeans(n_clusters=3)
    estimator.fit(samples)
    labels = estimator.labels_
    centroids = estimator.cluster_centers_

    # Count members per cluster *by label value*, so position k in `counts`
    # corresponds to row k of `centroids`. Counting with a Counter and then
    # taking the position of the maximum follows first-seen label order
    # instead, which can select the wrong centroid.
    counts = numpy.bincount(labels, minlength=len(centroids))
    dominant_label = int(numpy.argmax(counts))
    return int(centroids[dominant_label][0])
"""
changzhou=pandas.read_excel('cz.xlsx')
data=changzhou['房价网均价']
"""
# Batch driver: run k_means over the column returned by opp() for every
# workbook in the current working directory and write the positive results
# to 'bbcc.xls'.

# Characters removed from a file name to obtain the city label. Written as a
# raw string so the backslashes reach the regex engine without relying on
# Python's handling of invalid string escapes; the pattern itself is
# unchanged.
# NOTE(review): the class contains the full-width stop '。' but not the ASCII
# '.', so the dot before the file extension stays in the label - confirm
# whether that is intended.
_name_noise = re.compile(r"[A-Za-z0-9\!\%\[\]\,\。]")

lis = os.listdir()
bb = []
for i in lis:
    # Skip 'jjj.py' (excluded by the original script, presumably its own
    # file name), the report this script writes (it has no input sheet, so
    # a second run would fail on it), and anything that is not a regular
    # file.
    if i in ('jjj.py', 'bbcc.xls') or not os.path.isfile(i):
        continue
    out = k_means(opp(i))
    aa = [_name_noise.sub("", i), out]
    bb.append(aa)
    print(aa)

bbcc = pandas.DataFrame(bb, columns=['city', 'price'])
# k_means returns 0 when it did not cluster; keep only positive prices.
bbcc = bbcc[bbcc['price'] > 0]
bbcc.to_excel('bbcc.xls')