Pytorch实现二分类问题
In [1]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
In [2]:
data = pd.read_csv('./HR.csv')
data.head(10)
satisfaction_level | last_evaluation | number_project | average_montly_hours | time_spend_company | Work_accident | left | promotion_last_5years | part | salary | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 1 | 0 | sales | low |
1 | 0.80 | 0.86 | 5 | 262 | 6 | 0 | 1 | 0 | sales | medium |
2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 1 | 0 | sales | medium |
3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 1 | 0 | sales | low |
4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 1 | 0 | sales | low |
5 | 0.41 | 0.50 | 2 | 153 | 3 | 0 | 1 | 0 | sales | low |
6 | 0.10 | 0.77 | 6 | 247 | 4 | 0 | 1 | 0 | sales | low |
7 | 0.92 | 0.85 | 5 | 259 | 5 | 0 | 1 | 0 | sales | low |
8 | 0.89 | 1.00 | 5 | 224 | 5 | 0 | 1 | 0 | sales | low |
9 | 0.42 | 0.53 | 2 | 142 | 3 | 0 | 1 | 0 | sales | low |
In [3]:
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 14999 entries, 0 to 14998 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 satisfaction_level 14999 non-null float64 1 last_evaluation 14999 non-null float64 2 number_project 14999 non-null int64 3 average_montly_hours 14999 non-null int64 4 time_spend_company 14999 non-null int64 5 Work_accident 14999 non-null int64 6 left 14999 non-null int64 7 promotion_last_5years 14999 non-null int64 8 part 14999 non-null object 9 salary 14999 non-null object dtypes: float64(2), int64(6), object(2) memory usage: 1.1+ MB
In [4]:
data.part.unique()
Out[4]:
array(['sales', 'accounting', 'hr', 'technical', 'support', 'management', 'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)
In [5]:
data = data.join(pd.get_dummies(data.part)).join(pd.get_dummies(data.salary))
data.drop(columns=['part', 'salary'], inplace=True)
data
satisfaction_level | last_evaluation | number_project | average_montly_hours | time_spend_company | Work_accident | left | promotion_last_5years | IT | RandD | ... | hr | management | marketing | product_mng | sales | support | technical | high | low | medium | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 1 | 0 | False | False | ... | False | False | False | False | True | False | False | False | True | False |
1 | 0.80 | 0.86 | 5 | 262 | 6 | 0 | 1 | 0 | False | False | ... | False | False | False | False | True | False | False | False | False | True |
2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 1 | 0 | False | False | ... | False | False | False | False | True | False | False | False | False | True |
3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 1 | 0 | False | False | ... | False | False | False | False | True | False | False | False | True | False |
4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 1 | 0 | False | False | ... | False | False | False | False | True | False | False | False | True | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
14994 | 0.40 | 0.57 | 2 | 151 | 3 | 0 | 1 | 0 | False | False | ... | False | False | False | False | False | True | False | False | True | False |
14995 | 0.37 | 0.48 | 2 | 160 | 3 | 0 | 1 | 0 | False | False | ... | False | False | False | False | False | True | False | False | True | False |
14996 | 0.37 | 0.53 | 2 | 143 | 3 | 0 | 1 | 0 | False | False | ... | False | False | False | False | False | True | False | False | True | False |
14997 | 0.11 | 0.96 | 6 | 280 | 4 | 0 | 1 | 0 | False | False | ... | False | False | False | False | False | True | False | False | True | False |
14998 | 0.37 | 0.52 | 2 | 158 | 3 | 0 | 1 | 0 | False | False | ... | False | False | False | False | False | True | False | False | True | False |
14999 rows × 21 columns
In [6]:
data.left.value_counts()
Out[6]:
left 0 11428 1 3571 Name: count, dtype: int64
In [7]:
Y_data = data.left.values.reshape(-1, 1)
Y = torch.from_numpy(Y_data).type(torch.FloatTensor)
In [8]:
data = data.drop(columns='left')
X_data = data.values.astype(float)
X = torch.from_numpy(X_data).type(torch.FloatTensor)
In [9]:
class HRModel(nn.Module):
def __init__(self):
super().__init__()
self.lin_1 = nn.Linear(20, 64)
self.lin_2 = nn.Linear(64, 64)
self.lin_3 = nn.Linear(64, 1)
self.activate = nn.SELU()
self.sigmoid = nn.Sigmoid()
def forward(self, input):
x = self.lin_1(input)
x = self.activate(x)
x = self.lin_2(x)
x = self.activate(x)
x = self.lin_3(x)
return self.sigmoid(x)
In [10]:
lr = 0.001
model = HRModel()
opt = torch.optim.Adam(model.parameters(), lr=lr)
batch_size = 64
steps = len(data) // batch_size
epochs = 501
loss_fn = nn.BCELoss()
In [11]:
train_x, test_x, train_y, test_y = train_test_split(X_data, Y_data)
train_x = torch.from_numpy(train_x).type(torch.FloatTensor)
test_x = torch.from_numpy(test_x).type(torch.FloatTensor)
train_y = torch.from_numpy(train_y).type(torch.FloatTensor)
test_y = torch.from_numpy(test_y).type(torch.FloatTensor)
train_ds = TensorDataset(train_x, train_y)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_ds = TensorDataset(test_x, test_y)
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=True)
In [12]:
def accuracy(out, yb):
return ((out.data.numpy()>0.5)==yb.numpy()).mean()
In [13]:
%%time
for epoch in range(epochs):
model.train()
for xb, yb in train_dl:
pred = model(xb)
loss = loss_fn(pred, yb)
opt.zero_grad()
loss.backward()
opt.step()
if epoch%50 == 0:
model.eval()
with torch.no_grad():
valid_loss = sum([ loss_fn(model(x), y) for x, y in test_dl ])
acc_mean = np.mean([accuracy(model(x), y) for x, y in test_dl])
print('训练次数:', epoch, ' 损失:', valid_loss/len(test_dl), ' 准确率:', acc_mean)
训练次数: 0 损失: tensor(0.5407) 准确率: 0.7669770294380017 训练次数: 50 损失: tensor(0.1423) 准确率: 0.9626588983050848 训练次数: 100 损失: tensor(0.1460) 准确率: 0.9528462310437109 训练次数: 150 损失: tensor(0.1282) 准确率: 0.9607911462979483 训练次数: 200 损失: tensor(0.1150) 准确率: 0.9667986173059768 训练次数: 250 损失: tensor(0.1369) 准确率: 0.9555084745762712 训练次数: 300 损失: tensor(0.1175) 准确率: 0.96671498661909 训练次数: 350 损失: tensor(0.1191) 准确率: 0.9638018510258698 训练次数: 400 损失: tensor(0.1176) 准确率: 0.964861173059768 训练次数: 450 损失: tensor(0.1346) 准确率: 0.9592161016949152 训练次数: 500 损失: tensor(0.1246) 准确率: 0.9661016949152542 CPU times: total: 1min 48s Wall time: 1min 38s
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 分享4款.NET开源、免费、实用的商城系统
· 全程不用写代码,我用AI程序员写了一个飞机大战
· Obsidian + DeepSeek:免费 AI 助力你的知识管理,让你的笔记飞起来!
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了