李宏毅深度学习第二次作业 Logistic regression 预测年薪超过50W

 1 import pandas as pd
 2 import numpy as np
 3 '''
 4 整体和PM2.5差不多
 5 参考博客:https://www.cnblogs.com/HL-space/p/10785225.html
 6 https://www.cnblogs.com/tingtin/p/12321465.html
 7 '''
 8 epsilon = 1e-5
 9 def train(x_train,y_train,epoch):
10     num =x_train.shape[0]#row
11     feat= x_train.shape[1]#col
12     bias  = 0
13     w = np.ones(feat)
14     lr =1
15     reg_rate=0.001
16     b_sum=0
17     w_sum = np.zeros(feat)
18 
19     for i in range(epoch):
20         b_ =0
21         w_ = np.zeros(feat)
22         for j in range(num):
23             y = w.dot(x_train[j,:])+bias
24             sig = 1/(1+np.exp(-y))
25             b_ += (-1)*(y_train[j]-sig)
26             for k in range(feat):
27                 w_[k] += (-1)*(y_train[j]-sig)*x_train[j,k]+2*reg_rate*w[k]#加入正则化
28         b_/=num
29         w_/=num
30 
31         b_sum+=b_**2
32         w_sum+=w_**2
33 
34 
35         bias-=lr/b_sum**0.5*b_
36         w-=lr/w_sum**0.5*w_
37 
38 
39         if i%3==0:
40             loss = 0
41             acc  =0
42             result = np.zeros(num)
43             for j in range(num):
44                 y = w.dot(x_train[j,:])+bias
45                 sig =1/(1+np.exp(-y))
46                 if sig >=0.5:#大于0.5认为年薪>50W
47                     result[j] =1
48                 else:
49                     result[j] = 0
50                 if result[j] ==y_train[j]:
51                     acc+=1.0
52                #log(x) x接近0可能溢出,那么+1e-5
53                 loss+=(-1)*(y_train[j]*np.log(sig+epsilon) +(1-y_train[j]*np.log(1-sig+epsilon)))#1-sig后面也要加1e-5
54             print('after {} epochs, the loss on train data is:'.format(i), loss / num)
55             print('after {} epochs,the acc on train data is:'.format(i), acc / num)
56 
57 
58     return w,bias
59 
60 
61 
62 
def val(x_val, y_val, w, bias):
    """Return the classification accuracy of a logistic model on a data set.

    A sample is predicted as class 1 when ``sigmoid(w . x + bias) >= 0.5``
    and as class 0 otherwise; the prediction counts as correct when it
    equals the corresponding label.

    Args:
        x_val: 2-D array-like of shape (samples, features), convertible to
            float.
        y_val: 1-D array-like of labels with at least one entry per sample;
            only the first ``samples`` entries are used.
        w: weight vector with one entry per feature.
        bias: scalar bias.

    Returns:
        Fraction of correctly classified samples, as a float in [0, 1].

    Raises:
        ValueError: if ``y_val`` has fewer entries than ``x_val`` has rows.
        ZeroDivisionError: if ``x_val`` contains no samples.
    """
    x = np.asarray(x_val, dtype=float)
    num = x.shape[0]
    labels = np.asarray(y_val, dtype=float)[:num]
    if labels.shape[0] != num:
        raise ValueError('y_val must provide one label per row of x_val')

    logits = x.dot(np.asarray(w, dtype=float)) + bias
    # Clip the logits so np.exp cannot overflow for extreme values.
    sig = 1 / (1 + np.exp(-np.clip(logits, -500, 500)))
    result = (sig >= 0.5).astype(float)
    hits = int(np.count_nonzero(result == labels))
    return hits / num
77 
def main(csv_path='train.csv', n_train=3500, n_val=500, epoch=30):
    """Train the logistic-regression model on a CSV file and report accuracy.

    The first CSV column is dropped, the last column is used as the label
    and everything in between as features.  The last two feature columns
    are each divided by their own mean.  The first ``n_train`` rows are
    used for training and the following ``n_val`` rows for validation.

    Args:
        csv_path: path of the CSV file to load.
        n_train: number of leading rows used for training.
        n_val: number of rows after the training rows used for validation.
        epoch: number of training epochs passed to ``train``.
    """
    cs = pd.read_csv(csv_path)
    cs = cs.fillna(0)  # replace missing values with 0

    array = np.array(cs)

    # Second column through the second-to-last column are the features.
    # Cast to float: in-place true division on an integer array raises a
    # casting error, and the cast also copies the data so `array` is not
    # modified by the scaling below.
    x = array[:, 1:-1].astype(float)
    for col in (-1, -2):
        col_mean = np.mean(x[:, col])
        # Leave the column unscaled rather than filling it with NaN/inf
        # when its mean is zero.
        if col_mean != 0:
            x[:, col] /= col_mean

    y = array[:, -1].astype(float)  # last column holds the labels
    split_end = n_train + n_val
    x_train, x_val = x[0:n_train, :], x[n_train:split_end, :]
    y_train, y_val = y[0:n_train], y[n_train:split_end]
    w, b = train(x_train, y_train, epoch)
    acc = val(x_val, y_val, w, b)
    print('The acc on test data is: ', acc)


if __name__ == '__main__':
    main()

 

 

 

 

 

数据集下载

链接: https://pan.baidu.com/s/10v3I-nCi9yM8Mc0IJRBmaA 提取码: hyje 

posted on 2020-02-18 18:46  cltt  阅读(542)  评论(0)  编辑  收藏  举报

导航