Pytorch实现二分类问题

In [1]:

import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

In [2]:

data = pd.read_csv('./HR.csv')
data.head(10)

	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	left	part	salary
0	0.38	0.53	2	157	3	1	sales	low
1	0.80	0.86	5	262	6	1	sales	medium
2	0.11	0.88	7	272	4	1	sales	medium
3	0.72	0.87	5	223	5	1	sales	low
4	0.37	0.52	2	159	3	1	sales	low
5	0.41	0.50	2	153	3	1	sales	low
6	0.10	0.77	6	247	4	1	sales	low
7	0.92	0.85	5	259	5	1	sales	low
8	0.89	1.00	5	224	5	1	sales	low
9	0.42	0.53	2	142	3	1	sales	low

In [3]:

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   part                   14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB

In [4]:

data.part.unique()

Out[4]:

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)

In [5]:

data = data.join(pd.get_dummies(data.part)).join(pd.get_dummies(data.salary))
data.drop(columns=['part', 'salary'], inplace=True)
data

	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	Work_accident	left	promotion_last_5years	IT	RandD	...	hr	management	marketing	product_mng	sales	support	technical	high	low	medium
0	0.38	0.53	2	157	3	0	1	0	False	False	...	False	False	False	False	True	False	False	False	True	False
1	0.80	0.86	5	262	6	0	1	0	False	False	...	False	False	False	False	True	False	False	False	False	True
2	0.11	0.88	7	272	4	0	1	0	False	False	...	False	False	False	False	True	False	False	False	False	True
3	0.72	0.87	5	223	5	0	1	0	False	False	...	False	False	False	False	True	False	False	False	True	False
4	0.37	0.52	2	159	3	0	1	0	False	False	...	False	False	False	False	True	False	False	False	True	False
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
14994	0.40	0.57	2	151	3	0	1	0	False	False	...	False	False	False	False	False	True	False	False	True	False
14995	0.37	0.48	2	160	3	0	1	0	False	False	...	False	False	False	False	False	True	False	False	True	False
14996	0.37	0.53	2	143	3	0	1	0	False	False	...	False	False	False	False	False	True	False	False	True	False
14997	0.11	0.96	6	280	4	0	1	0	False	False	...	False	False	False	False	False	True	False	False	True	False
14998	0.37	0.52	2	158	3	0	1	0	False	False	...	False	False	False	False	False	True	False	False	True	False

14999 rows × 21 columns

In [6]:

data.left.value_counts()

Out[6]:

left
0    11428
1     3571
Name: count, dtype: int64

In [7]:

Y_data = data.left.values.reshape(-1, 1)
Y = torch.from_numpy(Y_data).type(torch.FloatTensor)

In [8]:

data = data.drop(columns='left')
X_data = data.values.astype(float)
X = torch.from_numpy(X_data).type(torch.FloatTensor)

In [9]:

class HRModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.lin_1 = nn.Linear(20, 64)
        self.lin_2 = nn.Linear(64, 64)
        self.lin_3 = nn.Linear(64, 1)
        self.activate = nn.SELU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, input):
        x = self.lin_1(input)
        x = self.activate(x)
        x = self.lin_2(x)
        x = self.activate(x)
        x = self.lin_3(x)
        return self.sigmoid(x)

In [10]:

lr = 0.001
model = HRModel()
opt = torch.optim.Adam(model.parameters(), lr=lr)
batch_size = 64
steps = len(data) // batch_size
epochs = 501
loss_fn = nn.BCELoss()

In [11]:

train_x, test_x, train_y, test_y = train_test_split(X_data, Y_data)
train_x = torch.from_numpy(train_x).type(torch.FloatTensor)
test_x = torch.from_numpy(test_x).type(torch.FloatTensor)
train_y = torch.from_numpy(train_y).type(torch.FloatTensor)
test_y = torch.from_numpy(test_y).type(torch.FloatTensor)

train_ds = TensorDataset(train_x, train_y)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_ds = TensorDataset(test_x, test_y)
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=True)

In [12]:

def accuracy(out, yb):
    return ((out.data.numpy()>0.5)==yb.numpy()).mean()

In [13]:

%%time
for epoch in range(epochs):
    model.train()
    for xb, yb in train_dl:
        pred = model(xb)
        loss = loss_fn(pred, yb)
        opt.zero_grad()
        loss.backward()
        opt.step()

    if epoch%50 == 0:
        model.eval()
        with torch.no_grad():
            valid_loss = sum([ loss_fn(model(x), y) for x, y in test_dl ])
            acc_mean = np.mean([accuracy(model(x), y) for x, y in test_dl])
        print('训练次数:', epoch, ' 损失:', valid_loss/len(test_dl), ' 准确率:', acc_mean)

训练次数: 0  损失: tensor(0.5407)  准确率: 0.7669770294380017
训练次数: 50  损失: tensor(0.1423)  准确率: 0.9626588983050848
训练次数: 100  损失: tensor(0.1460)  准确率: 0.9528462310437109
训练次数: 150  损失: tensor(0.1282)  准确率: 0.9607911462979483
训练次数: 200  损失: tensor(0.1150)  准确率: 0.9667986173059768
训练次数: 250  损失: tensor(0.1369)  准确率: 0.9555084745762712
训练次数: 300  损失: tensor(0.1175)  准确率: 0.96671498661909
训练次数: 350  损失: tensor(0.1191)  准确率: 0.9638018510258698
训练次数: 400  损失: tensor(0.1176)  准确率: 0.964861173059768
训练次数: 450  损失: tensor(0.1346)  准确率: 0.9592161016949152
训练次数: 500  损失: tensor(0.1246)  准确率: 0.9661016949152542
CPU times: total: 1min 48s
Wall time: 1min 38s