利用神经网络预测小区房价

          数据来源于公司的 MongoDB 数据库;由于公司保密需要,端口不能给出,大家看下思路即可。利用四个小区的历史交易数据训练模型,输出房价预测。

import pymongo
from pymongo import MongoClient
import numpy as np
import pandas as  pd
from pandas import DataFrame,Series
from numpy import row_stack,column_stack


client = MongoClient('192.168.xx.xx',2xxxx)
db = client.fangjia
seawater = db.seawater
seawater.find_one()

#["dancing","swimming"]
query = {"city":"上海","cat":"sell","region":"浦东",
         "district_name":{"$in":["康桥半岛二期","康桥半岛五期",
                                 "绿洲清水湾","中邦城市"]},"p_date":{"$gt":20170508}}


lt= seawater.count(query)
print(lt)
pos = list()

for s in seawater.find(query).limit(lt-1):
    pos.append(s)

data=DataFrame(pos)

data.to_excel('data.xls')

choose_class=['total_price','area','height','room',
             'direction','hall','toilet','fitment','district_name','p_date'
             ]

dc=data[choose_class]

dc.to_excel('dc.xls')



'''
lo=list(range(dc.shape[0]))

la=list(range(dc.shape[0]))




k2=[121.5886,31.148452]    #康桥半岛二期经纬度

k5=[121.589463,31.139917]  #康桥半岛五期经纬度

lw=[121.586066,31.154501]  #绿洲清水湾经纬度

klk=[121.58401,31.157145]  #中邦城市期经纬度

'''

for i in dc['district_name'].index :
    if dc['district_name'][i]=='康桥半岛二期':
        dc['district_name'][i]=0


    elif dc['district_name'][i]=='康桥半岛五期':
        dc['district_name'][i]=1

    elif dc['district_name'][i]=='绿洲清水湾':
        dc['district_name'][i]=2

    elif dc['district_name'][i]=='中邦城市':
         dc['district_name'][i] =3

'''      
for i in dc['district_name'].index :
    if dc['district_name'][i]=='康桥半岛二期':
        dc['district_name'][i]=0


    elif dc['district_name'][i]=='康桥半岛五期':
        dc['district_name'][i]=1

    elif dc['district_name'][i]=='绿洲清水湾':
        dc['district_name'][i]=2

    elif dc['district_name'][i]=='康桥绿洲康城1':
        dc['district_name'][i] =3
'''
'''
dc.to_excel('dc.xls')  


for i in dc['direction'].index:
    if ('' in dc['direction'][i]) or ('西' in dc['direction'][i]):
        dc['direction'][i]=0
    else:

        dc['direction'][i]=1


for i in dc['fitment'].index:
    if ('' in dc['fitment'][i]==True) or ('' in dc['fitment'][i]==True):
        dc['fitment'][i]=0
    elif ('' in dc['fitment'][i]==True) :


        dc['direction'][i]=1   

    else :


        dc['direction'][i]=2 

'''

uy=dc.values



for i in range(uy.shape[0]):
    if (uy[i][4]=='南') or (uy[i][4]=='南北'):
        uy[i][4]=1
    else:

        uy[i][4]=0

for i in range(uy.shape[0]):
    if (uy[i][7]=='精装修') or (uy[i][7]=='中装修'):
        uy[i][7]=1
    else:

        uy[i][7]=0



uu=DataFrame(uy)

uu1 = uu.fillna({2:18,3:3,5:2,6:2,7:1})




data_train = uu1.drop([0],axis=0)



data_max = data_train.max()
data_min = data_train.min()

data_train1 = (data_train-data_min)/(data_max-data_min+0.2) #数据标准化

knife=int(0.95*(data_train.shape[0]))#用于切割数据80%用于训练,20%用于计算



x_train = data_train1.iloc[0:knife,1:9].as_matrix() #训练样本标签列
y_train = data_train1.iloc[0:knife,0:1].as_matrix() #训练样本特征



from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation

model = Sequential() #建立模型
model.add(Dense(input_dim = 8, output_dim = 48)) #添加输入层、隐藏层的连接
model.add(Activation('tanh')) #以Relu函数为激活函数

model.add(Dense(input_dim = 100, output_dim = 100)) #添加隐藏层、隐藏层的连接
model.add(Activation('relu')) #以Relu函数为激活函数
model.add(Dropout(0.2))

model.add(Dense(input_dim = 100, output_dim = 50)) #添加隐藏层、隐藏层的连接
model.add(Activation('relu')) #以Relu函数为激活函数
model.add(Dropout(0.2))
model.add(Dense(input_dim = 50, output_dim = 36)) #添加隐藏层、隐藏层的连接
model.add(Activation('relu')) #以Relu函数为激活函数

model.add(Dense(input_dim = 36, output_dim = 12)) #添加隐藏层、隐藏层的连接
model.add(Activation('relu')) #以Relu函数为激活函数
model.add(Dense(input_dim = 12, output_dim = 12)) #添加隐藏层、隐藏层的连接
model.add(Activation('relu')) #以Relu函数为激活函数


model.add(Dense(input_dim = 12, output_dim = 1)) #添加隐藏层、输出层的连接
model.add(Activation('tanh')) #以sigmoid函数为激活函数
#编译模型,损失函数为binary_crossentropy,用adam法求解
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x_train, y_train, nb_epoch = 300, batch_size = 5) #训练模型

model.save_weights('net.model') #保存模型参数


x_test = data_train1.iloc[knife:,1:9].as_matrix() #训练样本标签列
y_test = data_train1.iloc[knife:,0:1].as_matrix() #训练样本特征


r = pd.DataFrame(model.predict(x_test))
rt=r*(data_max-data_min+0.2)+data_min 
#print(rt.round(2))




predict=rt.values[:,0:1]

realvalue= data_train.values[knife:,0:1]

error=abs((predict-realvalue)/realvalue)*100

geek=column_stack((predict,realvalue,error))



DataFrame(geek).to_excel('geek.xls')

print(geek)

print('平均计算误差:','%.2f'%error.mean(),'%')

          输出的是小区均价,并对时间做了平滑处理(把日期转换为一组连续数值)。随机从数据集中取出一条数据进行验证(训练集不包含该条数据),计算结果非常好,误差几乎为 0。在这一点上,神经网络明显优于经典机器学习算法,包括 xgboost。

# -*- coding: utf-8 -*-
"""
Created on Thu Aug 24 15:14:07 2017

@author: Administrator
"""

import pymongo
from pymongo import MongoClient
import numpy as np
import pandas as  pd
from pandas import DataFrame,Series
from numpy import row_stack,column_stack
from dateutil.parser import parse
from matplotlib.pylab import date2num
import random

#从公司的数据库中导入数据
client = MongoClient('192.168.xx.xx',2xxxx)
db = client.fangjia
seawater = db.seawater
seawater.find_one()

# 索引数据库里的数据
query = {"city":"上海","cat":"sell","region":"松江",
         "district_name":{"$in":["绿洲比华利花园","沿海丽水馨庭","雅仕轩","上海康城"]},
         "p_date":{"$gt":20170508}}


lt= seawater.count(query)
print(lt)
pos = list()
#数据转化为数组,数组的元素为字典
for s in seawater.find(query).limit(lt-1):
    pos.append(s)

#将数据转化为  DataFrame
data=DataFrame(pos)

data.to_excel('data.xls')

#需要提取的特征
choose_class=['total_price','area','height','room',
             'direction','hall','toilet','fitment','district_name','p_date'
             ]

dc=data[choose_class]
#将'total_price' 转化为均价,并把均价赋值给'total_price'
mean_price=dc['total_price']/dc['area']

dc['total_price']=mean_price #将'total_price' 转化为均价

#这段代码用于把时间转化成一个连续的数,至于是否有效有待观察
####################
h=dc['p_date']
for i in range(1,len(h)):   
    a=int(h[i])    
    b=str(a)    
    c=parse(b)        
    e = date2num(c)    
    h[i]=e 

dc['p_date']=h
################### 
dc.to_excel('dc.xls')



'''
lo=list(range(dc.shape[0]))

la=list(range(dc.shape[0]))




k2=[121.5886,31.148452]    #康桥半岛二期经纬度

k5=[121.589463,31.139917]  #康桥半岛五期经纬度

lw=[121.586066,31.154501]  #绿洲清水湾经纬度

klk=[121.58401,31.157145]  #中邦城市期经纬度

'''

for i in dc['district_name'].index :
    if dc['district_name'][i]=='绿洲比华利花园':
        dc['district_name'][i]=0


    elif dc['district_name'][i]=='沿海丽水馨庭':
        dc['district_name'][i]=1

    elif dc['district_name'][i]=='雅仕轩':
        dc['district_name'][i]=2

    elif dc['district_name'][i]=='上海康城':
         dc['district_name'][i] =3

'''      
for i in dc['district_name'].index :
    if dc['district_name'][i]=='康桥半岛二期':
        dc['district_name'][i]=0


    elif dc['district_name'][i]=='康桥半岛五期':
        dc['district_name'][i]=1

    elif dc['district_name'][i]=='绿洲清水湾':
        dc['district_name'][i]=2

    elif dc['district_name'][i]=='康桥绿洲康城1期':
        dc['district_name'][i] =3
'''
'''
dc.to_excel('dc.xls')  


for i in dc['direction'].index:
    if ('东' in dc['direction'][i]) or ('西' in dc['direction'][i]):
        dc['direction'][i]=0
    else:

        dc['direction'][i]=1


for i in dc['fitment'].index:
    if ('豪' in dc['fitment'][i]==True) or ('精' in dc['fitment'][i]==True):
        dc['fitment'][i]=0
    elif ('毛' in dc['fitment'][i]==True) :


        dc['direction'][i]=1   

    else :


        dc['direction'][i]=2 

'''

uy=dc.values



for i in range(uy.shape[0]):
    if (uy[i][4]=='南') or (uy[i][4]=='南北'):
        uy[i][4]=1
    else:

        uy[i][4]=0

for i in range(uy.shape[0]):
    if (uy[i][7]=='精装修') or (uy[i][7]=='中装修'):
        uy[i][7]=1
    else:

        uy[i][7]=0



uu=DataFrame(uy)

uu1 = uu.fillna({2:18,3:3,5:2,6:2,7:1})




data_all = uu1.drop([0],axis=0)

sample_number=data_all.shape[0]

kk=int(0.05 *sample_number)

test_label=[random.randint(0,sample_number) for _ in range(kk)]







data_train= data_all.drop(test_label,axis=0)
#data_train.to_excel('data_train.xls')
data_max = data_train.max()
data_min = data_train.min()

data_train1 = (data_train-data_min)/(data_max-data_min+0.2) #数据标准化

#knife=int(0.95*(data_train.shape[0]))#用于切割数据80%用于训练,20%用于计算

x_train = data_train1.iloc[:,1:10].as_matrix() #训练样本标签列
y_train = data_train1.iloc[:,0:1].as_matrix() #训练样本特征



from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation

model = Sequential() #建立模型
model.add(Dense(input_dim = 9, output_dim = 48)) #添加输入层、隐藏层的连接
model.add(Activation('tanh')) #以Relu函数为激活函数

model.add(Dense(input_dim = 100, output_dim = 100)) #添加隐藏层、隐藏层的连接
model.add(Activation('relu')) #以Relu函数为激活函数
model.add(Dropout(0.2))

model.add(Dense(input_dim = 100, output_dim = 50)) #添加隐藏层、隐藏层的连接
model.add(Activation('relu')) #以Relu函数为激活函数
model.add(Dropout(0.2))
model.add(Dense(input_dim = 50, output_dim = 36)) #添加隐藏层、隐藏层的连接
model.add(Activation('relu')) #以Relu函数为激活函数

model.add(Dense(input_dim = 36, output_dim = 12)) #添加隐藏层、隐藏层的连接
model.add(Activation('relu')) #以Relu函数为激活函数
model.add(Dense(input_dim = 12, output_dim = 12)) #添加隐藏层、隐藏层的连接
model.add(Activation('relu')) #以Relu函数为激活函数


model.add(Dense(input_dim = 12, output_dim = 1)) #添加隐藏层、输出层的连接
model.add(Activation('tanh')) #以sigmoid函数为激活函数
#编译模型,损失函数为binary_crossentropy,用adam法求解
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x_train, y_train, nb_epoch = 200, batch_size = 3) #训练模型

model.save_weights('net.model') #保存模型参数


test=data_all.ix[test_label,:]

#test_max = test.max()
#test_min = test.min()
data_test = (test-data_min)/(data_max-data_min+0.2) 

x_test = data_test.iloc[:,1:10].as_matrix()
y_test = data_test.iloc[:,0:1].as_matrix()




#x_test = data_train1.iloc[knife:,1:9].as_matrix() #训练样本标签列
#y_test = data_train1.iloc[knife:,0:1].as_matrix() #训练样本特征


r = pd.DataFrame(model.predict(x_test))
rt=r*(data_max-data_min+0.2)+data_min
#print(rt.round(2))




predict=rt.values[:,0:1]

realvalue= test.iloc[:,0:1].as_matrix()

error=abs((predict-realvalue)/realvalue)*100

geek=column_stack((predict,realvalue,error))



DataFrame(geek).to_excel('geek.xls')

print(geek)

print('平均计算误差:','%.2f'%error.mean(),'%')

改进版:缺失值用均值填充,并引入小区经纬度特征(2017.8.30)

# -*- coding: utf-8 -*-
"""
Created on Thu Aug 24 15:14:07 2017

@author: Administrator
"""

import pymongo
from pymongo import MongoClient
import numpy as np
import pandas as  pd
from pandas import DataFrame,Series
from numpy import row_stack,column_stack
from dateutil.parser import parse
from matplotlib.pylab import date2num
import random

#导入经度和纬度

#从公司的数据库中导入数据
client1 = MongoClient('192.168.0.136',xxx)
db1 = client1.fangjia
seaweed1 = db1.seaweed

#print(seaweed.find_one({"city":"上海","region":"浦东","name":"康桥半岛二期"},{"lat2":1,"lng2":1}))

'''
print(seaweed.find_one({"city":"上海","region":"浦东",
                        "name":{"$in":["康桥半岛二期","康桥半岛三期","绿洲清水湾","中邦城市"]}}
                        ,{"lat2":1,"lng2":1}))

'''
query1 = {"status":0,"cat":"district","city":"上海","region":"浦东", "name":{"$in":["康桥半岛二期","康桥半岛三期","绿洲清水湾","中邦城市"]}}
fields1 = {"lat2":1,"lng2":1, "city":1,"region":1,"cat":1,"name":1}

lct= list()
for s in seaweed.find(query1, fields1):
    lct.append(s)

lf=DataFrame(lct)

le=lf    

le.index=le['name'] 

lr=le[['lng2','lat2']]









#从公司的数据库中导入数据
client = MongoClient('192.168.10.88',2xxxx)
db = client.fangjia
seawater = db.seawater
seawater.find_one()

# 索引数据库里的数据
query = {"city":"上海","cat":"sell","region":"浦东",
         "district_name":{"$in":["康桥半岛二期","康桥半岛三期","绿洲清水湾","中邦城市"]},


         "p_date":{"$gt":20160508}}


lt= seawater.count(query)
print(lt)
pos = list()
#数据转化为数组,数组的元素为字典
for s in seawater.find(query).limit(lt-1):
    pos.append(s)

#将数据转化为  DataFrame
data=DataFrame(pos)

data.to_excel('data.xls')

#需要提取的特征
choose_class=['total_price','area','height','room',
             'direction','hall','toilet','fitment','district_name','p_date'
             ]

dc=data[choose_class]




dc['lng2']=0
dc['lat2']=1


'''
for i in range(dc.shape[0]):

    bn=dc['district_name']

    p=bn[i]

    dc['lng2'][i]=lo['lng2'][p]

'''

for i in range(dc.shape[0]):

    if dc['district_name'][i]==lr.index[0]:

        dc['lng2'][i]=lr['lng2'][0]
        dc['lat2'][i]=lr['lat2'][0]

    elif dc['district_name'][i]==lr.index[1]:
        dc['lng2'][i]=lr['lng2'][1]
        dc['lat2'][i]=lr['lat2'][1]

    elif dc['district_name'][i]==lr.index[2]:
        dc['lng2'][i]=lr['lng2'][2]
        dc['lat2'][i]=lr['lat2'][2]

    elif dc['district_name'][i]==lr.index[3]:
        dc['lng2'][i]=lr['lng2'][3]
        dc['lat2'][i]=lr['lat2'][3]


#将'total_price' 转化为均价,并把均价赋值给'total_price'
mean_price=dc['total_price']/dc['area']

dc['total_price']=mean_price #将'total_price' 转化为均价

#这段代码用于把时间转化成一个连续的数,至于是否有效有待观察
####################
h=dc['p_date']
for i in range(1,len(h)):   
    a=int(h[i])    
    b=str(a)    
    c=parse(b)        
    e = date2num(c)    
    h[i]=e 

dc['p_date']=h
################### 
dc.to_excel('dc.xls')




'''
#给每个小区赋予一个标签
for i in dc['district_name'].index :
    if dc['district_name'][i]=='康桥半岛二期':
        dc['district_name'][i]=0


    elif dc['district_name'][i]=='康桥半岛三期':
        dc['district_name'][i]=1

    elif dc['district_name'][i]=='绿洲清水湾':
        dc['district_name'][i]=2

    elif dc['district_name'][i]=='中邦城市':
         dc['district_name'][i] =3

'''

for i in dc['direction'].index:

    if ('南' in str(dc['direction'][i])) :
        dc['direction'][i]=0
    else:

        dc['direction'][i]=1


for i in dc['fitment'].index:
    if ('豪' or '精') in str(dc['fitment'][i]) :
        dc['fitment'][i]=0

    else :
        dc['fitment'][i]=1




dc=dc.fillna({'height':dc['height'].mean(),
              'room':dc['room'].mean(),
              'toilet':dc['toilet'].mean(),
              'hall':dc['hall'].mean(),
              })

ds=dc.drop('district_name',axis=1)



data_all = ds.drop([0],axis=0)

sample_number=data_all.shape[0]

kk=int(0.05 *sample_number)

test_label=[random.randint(1,sample_number) for _ in range(kk)]







data_train= data_all.drop(test_label,axis=0)
#data_train.to_excel('data_train.xls')
data_max = data_train.max()
data_min = data_train.min()

data_train1 = (data_train-data_min)/(data_max-data_min+0.2) #数据标准化

#knife=int(0.95*(data_train.shape[0]))#用于切割数据80%用于训练,20%用于计算

x_train = data_train1.iloc[:,1:11].as_matrix() #训练样本标签列
y_train = data_train1.iloc[:,0:1].as_matrix() #训练样本特征



from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation

model = Sequential() #建立模型
model.add(Dense(input_dim = 10, output_dim = 48)) #添加输入层、隐藏层的连接
model.add(Activation('relu')) #以Relu函数为激活函数

model.add(Dense(input_dim = 48, output_dim = 100)) #添加隐藏层、隐藏层的连接
model.add(Activation('relu')) #以Relu函数为激活函数


model.add(Dense(input_dim = 100, output_dim = 50)) #添加隐藏层、隐藏层的连接
model.add(Activation('relu')) #以Relu函数为激活函数

model.add(Dense(input_dim = 50, output_dim = 36)) #添加隐藏层、隐藏层的连接
model.add(Activation('relu')) #以Relu函数为激活函数

model.add(Dense(input_dim = 36, output_dim = 12)) #添加隐藏层、隐藏层的连接
model.add(Activation('relu')) #以Relu函数为激活函数
model.add(Dense(input_dim = 12, output_dim = 12)) #添加隐藏层、隐藏层的连接
model.add(Activation('relu')) #以Relu函数为激活函数


model.add(Dense(input_dim = 12, output_dim = 1)) #添加隐藏层、输出层的连接
model.add(Activation('sigmoid')) #以sigmoid函数为激活函数
#编译模型,损失函数为binary_crossentropy,用adam法求解
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x_train, y_train, nb_epoch = 300, batch_size = 2) #训练模型

model.save_weights('net.model') #保存模型参数


test=data_all.ix[test_label,:]

#test_max = test.max()
#test_min = test.min()
data_test = (test-data_min)/(data_max-data_min+0.2) 

x_test = data_test.iloc[:,1:11].as_matrix()
y_test = data_test.iloc[:,0:1].as_matrix()




#x_test = data_train1.iloc[knife:,1:9].as_matrix() #训练样本标签列
#y_test = data_train1.iloc[knife:,0:1].as_matrix() #训练样本特征


r = (model.predict(x_test))
rt=r*(data_max.values-data_min.values+0.2)+data_min.values
#print(rt.round(2))




predict=rt[:,0:1]

realvalue= test.iloc[:,0:1].as_matrix()

error=abs((predict-realvalue)/realvalue)*100

geek=column_stack((predict,realvalue,error))



DataFrame(geek).to_excel('geek.xls')

print(geek)

print('平均计算误差:','%.2f'%error.mean(),'%')









































posted @ 2022-08-19 22:59  luoganttcc  阅读(100)  评论(0编辑  收藏  举报