Machine Learning Notes 013: Kaggle hands-on code
import hashlib
import os
import tarfile
import zipfile
import requests

#@save
DATA_HUB = dict()
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'


def download(name, cache_dir=os.path.join('..', 'data')):  #@save
    """Download a file from DATA_HUB and return the local file name."""
    assert name in DATA_HUB, f"{name} does not exist in {DATA_HUB}"
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            while True:
                data = f.read(1048576)  # hash in 1 MiB chunks
                if not data:
                    break
                sha1.update(data)
        if sha1.hexdigest() == sha1_hash:
            return fname  # cache hit
    print(f'Downloading {fname} from {url}...')
    r = requests.get(url, stream=True, verify=True)
    with open(fname, 'wb') as f:
        f.write(r.content)
    return fname


def download_extract(name, folder=None):  #@save
    """Download and extract a zip/tar file."""
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        fp = zipfile.ZipFile(fname, 'r')
    elif ext in ('.tar', '.gz'):
        fp = tarfile.open(fname, 'r')
    else:
        assert False, 'Only zip/tar files can be extracted'
    fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir


def download_all():  #@save
    """Download all files in DATA_HUB."""
    for name in DATA_HUB:
        download(name)


import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l


DATA_HUB['kaggle_house_train'] = (  #@save
    DATA_URL + 'kaggle_house_pred_train.csv',
    '585e9cc93e70b39160e7921475f9bcd7d31219ce')

DATA_HUB['kaggle_house_test'] = (  #@save
    DATA_URL + 'kaggle_house_pred_test.csv',
    'fa19780a7b011d9b009e8bff8e99922a8ee2eb90')

train_data = pd.read_csv(download('kaggle_house_train'))
test_data = pd.read_csv(download('kaggle_house_test'))

print(train_data.shape)
print(test_data.shape)

print(train_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]])

# Drop the first column (the Id, which carries no predictive information).
# train_data also has one column that test_data lacks -- the SalePrice
# label -- so drop that here as well.
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

print(all_features.iloc[0:4, [0, 1, 2, 3, 4, 5, 6, 7, -3, -2, -1]])
print('_' * 90)

# Filter out the numeric features
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
print(numeric_features.values)

# Standardize each numeric column: subtract its mean and divide by its
# standard deviation, so all features live on one scale with mean 0 and
# variance 1 (a toy demo of these two steps follows this listing).
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / x.std())
# After standardization every mean is 0, so missing values can be set to 0
all_features[numeric_features] = all_features[numeric_features].fillna(0)

# dummy_na=True means "unknown" (NaN) also counts as a category
all_features = pd.get_dummies(all_features, dummy_na=True)
# Feature count after one-hot encoding:
print(all_features.shape)

# Extract the NumPy arrays from pandas and convert them to tensors.
# Note: recent pandas versions emit bool dummy columns, which makes
# .values an object array, so cast to float32 explicitly.
n_train = train_data.shape[0]
train_features = torch.tensor(all_features[:n_train].values.astype(np.float32))
test_features = torch.tensor(all_features[n_train:].values.astype(np.float32))
train_labels = torch.tensor(train_data.SalePrice.values.reshape(-1, 1),
                            dtype=torch.float32)

# Time to train!
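To see concretely what the standardization and get_dummies steps above do, here is a minimal toy demo; the column names Area and Street are invented for illustration and are not taken from the actual dataset:

import numpy as np
import pandas as pd

# Toy frame: one numeric and one categorical column, each with a missing value
toy = pd.DataFrame({'Area': [60.0, 80.0, np.nan],
                    'Street': ['Pave', np.nan, 'Pave']})

num_cols = toy.dtypes[toy.dtypes != 'object'].index
toy[num_cols] = toy[num_cols].apply(lambda x: (x - x.mean()) / x.std())
toy[num_cols] = toy[num_cols].fillna(0)  # 0 is exactly the new mean

print(pd.get_dummies(toy, dummy_na=True))
# Area is now mean-0/std-1 (-0.707, 0.707, 0.000); Street expands into
# Street_Pave and Street_nan indicator columns, so "missing" becomes a
# feature of its own instead of a hole in the data.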
loss = nn.MSELoss()
in_features = train_features.shape[1]

def get_net():
    net = nn.Sequential(nn.Linear(in_features, 1))
    return net

# For house prices we care about the relative error (y - y_hat) / y rather
# than the absolute error, so this function takes the log of the predictions
# and the labels and then computes an ordinary RMSE on the log values.
def log_rmse(net, features, labels):
    # To stabilize the value when taking the log, clip predictions
    # smaller than 1 up to 1.
    # clamp: torch.clamp(input, min, max, out=None) -> Tensor
    # Unlike sigmoid, which squashes every value into an interval, clamp is
    # a box: only values outside the box are pulled back to its edges, and
    # values inside are left untouched (see the short demo after this listing).
    clipped_preds = torch.clamp(net(features), 1, float('inf'))
    rmse = torch.sqrt(loss(torch.log(clipped_preds),
                           torch.log(labels)))
    return rmse.item()

def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []
    train_iter = d2l.load_array((train_features, train_labels), batch_size)
    # Adam is smoother than plain SGD here and less sensitive to the
    # choice of learning rate
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=learning_rate,
                                 weight_decay=weight_decay)
    for epoch in range(num_epochs):
        for X, y in train_iter:
            optimizer.zero_grad()
            l = loss(net(X), y)
            l.backward()
            optimizer.step()
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls

# K-fold cross-validation: fold i is held out for validation and the other
# k-1 folds form the training set (sanity-checked in a snippet after this
# listing)
def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = X.shape[0] // k  # // is integer division
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)  # a reusable index range
        X_part, y_part = X[idx, :], y[idx]  # same as X[j*fold_size:(j+1)*fold_size, :]
        if j == i:  # fold i becomes the validation set
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = torch.cat([X_train, X_part], 0)
            y_train = torch.cat([y_train, y_part], 0)
    return X_train, y_train, X_valid, y_valid

def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
           batch_size):
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        if i == 0:
            d2l.plot(list(range(1, num_epochs + 1)), [train_ls, valid_ls],
                     xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],
                     legend=['train', 'valid'], yscale='log')
            d2l.plt.show()
        print(f'fold {i + 1}, train log rmse {float(train_ls[-1]):f}, '
              f'valid log rmse {float(valid_ls[-1]):f}')
    return train_l_sum / k, valid_l_sum / k

k, num_epochs, lr, weight_decay, batch_size = 50, 100, 5, 0, 64
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr,
                          weight_decay, batch_size)
print(f'{k}-fold validation: avg train log rmse: {float(train_l):f}, '
      f'avg valid log rmse: {float(valid_l):f}')
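The comment in log_rmse contrasts clamp with sigmoid; the difference is easy to see on a tiny tensor (my own throwaway example, not part of the original script):

import torch

t = torch.tensor([-2.0, 0.5, 3.0])
print(torch.clamp(t, 1, float('inf')))  # tensor([1., 1., 3.]) -- only the
                                        # out-of-range values are moved
print(torch.sigmoid(t))                 # every value squashed into (0, 1)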
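And a quick sanity check of get_k_fold_data (again my own snippet, assuming the definitions above are in scope): with 10 samples and k=5, every validation fold should hold 2 rows, every training set 8, and fold i should cover rows i*2 to i*2+2.

import torch

X = torch.arange(20, dtype=torch.float32).reshape(10, 2)
y = torch.arange(10, dtype=torch.float32).reshape(-1, 1)
X_tr, y_tr, X_va, y_va = get_k_fold_data(5, 2, X, y)
print(X_tr.shape, X_va.shape)  # torch.Size([8, 2]) torch.Size([2, 2])
print(y_va.squeeze())          # tensor([4., 5.]) -- rows 4:6 form fold 2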