"""
Created on Thu Aug 24 15:14:07 2017
@author: Administrator
"""
import pymongo
from pymongo import MongoClient
import numpy as np
import pandas as pd
from pandas import DataFrame,Series
from numpy import row_stack,column_stack
from dateutil.parser import parse
from matplotlib.pylab import date2num
import random
# Connection used for the `seaweed` collection of the `fangjia` database.
# NOTE(review): host and port look like redacted placeholders in this copy;
# fill in the real values before running.
client1 = MongoClient('xxxxxxxx',xxxxx)
db1 = client1['fangjia']
seaweed1 = db1['seaweed']
# City and region used as the "city"/"region" filters of both MongoDB queries.
cy_rg = ["上海", "闵行"]

# Community names matched against "name" (seaweed) and "district_name"
# (seawater). Each name appears once: the list is also iterated for the
# per-community outlier pass, where a repeated entry would be processed twice.
# NOTE(review): the original listed "蓝色港湾" twice - confirm that the second
# occurrence was not meant to be a different community.
dirtic_list = [
    "蓝色港湾",
    "平吉一村",
    "新时代富嘉花园",
    "新时代花园",
]
# Fetch the "district" records of the configured city/region whose name is in
# dirtic_list, keeping only the fields listed in the projection.
query1 = {
    "status": 0,
    "cat": "district",
    "city": cy_rg[0],
    "region": cy_rg[1],
    "name": {"$in": dirtic_list},
}
fields1 = {"lat2": 1, "lng2": 1, "city": 1, "region": 1, "cat": 1, "name": 1}

# Materialise the whole cursor, one dict per matching document.
lct = list(seaweed1.find(query1, fields1))
lf = DataFrame(lct)

# `le` is the same object as `lf`; re-index it by community name so that
# coordinates can be looked up by name.
le = lf
le.index = le['name']

# Longitude/latitude table indexed by community name.
lr = le[['lng2', 'lat2']]
# Connection used for the `seawater` collection of the `fangjia` database.
# NOTE(review): host and port look like redacted placeholders in this copy.
client = MongoClient('xxxxxxxxx',xxxxxxx)
db = client['fangjia']
seawater = db['seawater']

# NOTE(review): the result of this call is discarded.
seawater.find_one()

# "sell" records of the selected communities with p_date greater than 20170608.
query = {
    "city": cy_rg[0],
    "cat": "sell",
    "region": cy_rg[1],
    "district_name": {"$in": dirtic_list},
    "p_date": {"$gt": 20170608},
}

lt = seawater.count(query)
print(lt)

# NOTE(review): the limit is one less than the number of matches, so one
# matching document is left out - confirm this is intended.
pos = list(seawater.find(query).limit(lt - 1))
data = DataFrame(pos)

# Keep a copy of the raw query result.
data.to_excel('data.xls')
# Columns of the raw listing data that are used by the rest of the script.
choose_class = [
    'total_price', 'area', 'height', 'room',
    'direction', 'hall', 'toilet', 'fitment', 'district_name', 'p_date',
]

# Work on an independent copy: assigning new columns to a slice of `data`
# triggers SettingWithCopyWarning and is not guaranteed to take effect.
dc = data[choose_class].copy()

# Attach each listing's community coordinates by looking the community name up
# in `lr`. This replaces a row-by-row double loop that wrote through chained
# indexing (dc['lng2'][i] = ...), which pandas does not guarantee to update
# the frame.
# If `lr` holds several rows for one name, the last one wins, exactly as the
# original loop behaved.
coords = lr[~lr.index.duplicated(keep='last')]
matched = dc['district_name'].isin(coords.index)

# Listings whose community is missing from `lr` keep the original placeholder
# values (0 for lng2, 1 for lat2).
dc['lng2'] = dc['district_name'].map(coords['lng2']).where(matched, 0)
dc['lat2'] = dc['district_name'].map(coords['lat2']).where(matched, 1)
# From here on 'total_price' holds the price per unit of area.
mean_price = dc['total_price'] / dc['area']
dc['total_price'] = mean_price

# Convert p_date (integers such as 20170608) into matplotlib date numbers.
# The original loop started at position 1 and therefore left the first row
# unconverted; it also wrote element by element into a Series taken from the
# frame. Every row is converted here and the column is assigned in one step.
h = dc['p_date'].map(lambda raw: date2num(parse(str(int(raw)))))
dc['p_date'] = h

dc.to_excel('dc.xls')
def _direction_code(value):
    """Return the numeric code of an orientation description.

    0 when the text contains '南', 1 when it contains '透' (and not '南'),
    2 otherwise. Non-string values (e.g. NaN) are converted with str() first,
    so they fall into the last category.
    """
    text = str(value)
    if '南' in text:
        return 0
    if '透' in text:
        return 1
    return 2


# Encode the whole column in one assignment. The original wrote each cell
# through chained indexing (dc['direction'][i] = ...), which pandas does not
# guarantee to modify the frame and which leaves the column with object dtype.
dc['direction'] = dc['direction'].map(_direction_code)
def _fitment_code(value):
    """Return 0 when the decoration text contains '豪' or '精', else 1.

    Non-string values (e.g. NaN) are converted with str() first and therefore
    map to 1.
    """
    text = str(value)
    # The original condition was `('豪' or '精') in text`, which evaluates to
    # `'豪' in text` and silently ignores '精'. Test both characters.
    if '豪' in text or '精' in text:
        return 0
    return 1


# Encode the whole column in one assignment instead of writing each cell
# through chained indexing (dc['fitment'][i] = ...).
dc['fitment'] = dc['fitment'].map(_fitment_code)
# Fill missing values of the count-like columns with the column mean.
fill_values = {
    column: dc[column].mean()
    for column in ('height', 'room', 'toilet', 'hall')
}
dc = dc.fillna(fill_values)

# Modelling table: no community name, and without the row labelled 0.
ds = dc.drop('district_name', axis=1).drop([0], axis=0)
# Collect the labels of listings whose unit price lies more than 1.645
# population standard deviations away from the mean of their own community.
lll = []
for j in dirtic_list:
    prices = dc.loc[dc['district_name'] == j, 'total_price']
    if prices.empty:
        # No listing for this community: nothing to compare against.
        continue
    # NumPy mean/std on the raw values (std with ddof=0), as before.
    hmean = prices.values.mean()
    hstd = prices.values.std()
    outside = (prices < hmean - 1.645 * hstd) | (prices > hmean + 1.645 * hstd)
    lll.extend(prices[outside].index)

# The statistics are computed on `dc`, which still contains the row labelled 0
# that `ds` has already dropped. If that row is an outlier its label is not in
# `ds`, and a plain drop() would raise KeyError - ignore absent labels.
data_all = ds.drop(lll, axis=0, errors='ignore')
data_all.to_excel('data_all.xls')
# Hold out 8% of the cleaned rows as the test set.
listall = list(data_all.index)
kk = int(0.08 * len(listall))

# Draw kk distinct positions in [0, len(listall)). The original used
# random.randint(1, len(listall)), whose upper bound is inclusive: it could
# produce an out-of-range position (IndexError), could never select the first
# row, and could select the same row several times.
list_index = random.sample(range(len(listall)), kk)
test_label = [listall[x] for x in list_index]

data_train = data_all.drop(test_label, axis=0)
# Min-max scale the training data column by column. The same data_min/data_max
# are reused later to scale the test set and to undo the scaling.
data_max = data_train.max()
data_min = data_train.min()
# NOTE(review): the extra 0.2 in the denominator presumably guards against a
# zero range and keeps scaled values below 1 - confirm.
data_train1 = (data_train - data_min) / (data_max - data_min + 0.2)

# Column 0 is the target (unit price); columns 1..10 are the ten features.
# `.values` replaces DataFrame.as_matrix(), which has been removed from pandas.
x_train = data_train1.iloc[:, 1:11].values
y_train = data_train1.iloc[:, 0:1].values
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation

# (input width, output width, activation) of each fully connected stage:
# ten input features, six ReLU stages, one sigmoid output.
layer_specs = [
    (10, 48, 'relu'),
    (48, 100, 'relu'),
    (100, 50, 'relu'),
    (50, 36, 'relu'),
    (36, 12, 'relu'),
    (12, 12, 'relu'),
    (12, 1, 'sigmoid'),
]

model = Sequential()
for n_in, n_out, activation_name in layer_specs:
    # NOTE(review): `output_dim` (and `nb_epoch` below) look like the argument
    # names of an older Keras release - verify against the installed version.
    model.add(Dense(input_dim = n_in, output_dim = n_out))
    model.add(Activation(activation_name))

model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x_train, y_train, nb_epoch = 300, batch_size = 2)

# Only the weights are stored, not the architecture.
model.save_weights('net.model')
# Held-out rows, selected by label. `.loc` replaces the removed `.ix` indexer
# (the labels come from data_all.index, so the lookup is label-based).
test = data_all.loc[test_label, :]
test.to_excel('test.xls')

# Scale the test rows with the statistics of the training set.
data_test = (test - data_min) / (data_max - data_min + 0.2)
# `.values` replaces DataFrame.as_matrix(), which has been removed from pandas.
x_test = data_test.iloc[:, 1:11].values
y_test = data_test.iloc[:, 0:1].values

r = model.predict(x_test)

# Undo the scaling. `r` has a single column and is broadcast against the
# per-column statistics; only column 0 (the unit price) is meaningful and kept.
rt = r * (data_max.values - data_min.values + 0.2) + data_min.values
p = rt[:, 0:1].flatten()
# Compare every prediction with the mean unit price of its community and fall
# back to that mean when the prediction is off by more than 19% either way.
jk = dc.drop([0], axis=0)
cx = list(test.index)
p_dmean_ratio = []
for position, label in enumerate(cx):
    community = jk.loc[label, 'district_name']
    same_community = jk[jk['district_name'] == community]
    dmean = same_community['total_price'].values.mean()

    pmn = p[position] / dmean
    # The ratio is recorded before any correction is applied.
    p_dmean_ratio.append(pmn)
    if pmn < 0.81 or pmn > 1.19:
        p[position] = dmean
# Column vectors: prediction, true unit price, absolute percentage error and
# deviation (in percent) of the raw prediction from the community mean.
predict = np.array([p]).T
# `.values` replaces DataFrame.as_matrix(), which has been removed from pandas.
realvalue = test.iloc[:, 0:1].values
error = abs((predict - realvalue) / realvalue) * 100
pro = (np.array([p_dmean_ratio]).T - 1) * 100

gek = column_stack((predict, realvalue, error, pro))
geek = DataFrame(
    gek,
    columns=['predict', 'realvalue', 'error', 'p_dmean_ratio'],
    index=test.index,
)
test_and_geek = pd.concat([test, geek], axis=1)

# Report columns; 'total_price' is shown as 'mean_price' because it holds the
# price per unit of area at this point.
output_label = ['total_price', 'area', 'height', 'room', 'direction', 'hall',
                'toilet', 'fitment', 'p_date', 'predict', 'realvalue', 'error',
                'p_dmean_ratio']
tg = test_and_geek[output_label]
output_label1 = ['mean_price', 'area', 'height', 'room', 'direction', 'hall',
                 'toilet', 'fitment', 'p_date', 'predict', 'realvalue', 'error',
                 'p_dmean_ratio']
tg.columns = output_label1
tg.to_excel('tg.xls')
print(tg)
print('平均计算误差:','%.2f'%error.mean(),'%')