基于支持向量机的分类模型和回归模型案例
'''
Case study: event prediction -- load event.txt and predict whether a special
event occurs in a given time slot.

Steps:
1. Preprocess the data:
   1) Read the file into the ``data`` array and delete the column at index 1.
   2) Encode every column: LabelEncoder for discrete values, the custom
      DigitEncoder for numeric strings. Every encoder is kept for later use.
   3) Assemble the data set and split it into training and test sets.
2. Train an SVM classifier.
3. Predict on the test set.
4. Predict events for hand-written samples.
5. Plot (this listing contains no plotting code).
'''
import numpy as np
import matplotlib.pyplot as mp
import sklearn.preprocessing as sp
import sklearn.model_selection as sm
import sklearn.svm as svm
import warnings

warnings.filterwarnings('ignore')


class DigitEncoder:
    '''Custom encoder for columns of numeric strings.

    Exposes the same three methods of LabelEncoder that this script calls,
    so both encoder kinds can be stored and used interchangeably.
    '''

    def fit_transform(self, y):
        # Nothing to fit: the numeric string is its own code.
        return y.astype('i4')

    def transform(self, y):
        return y.astype('i4')

    def inverse_transform(self, y):
        return y.astype('str')


data = []
with open('./ml_data/event.txt', 'r') as f:
    for line in f:
        # Strip only the newline. Slicing with line[:-1] would cut the last
        # character of the final record when the file lacks a trailing newline.
        line = line.rstrip('\n')
        if not line:
            # A blank line would add a one-field row and make the array ragged.
            continue
        data.append(line.split(','))
data = np.array(data)
# Delete the second column.
data = np.delete(data, 1, axis=1)
print(data.shape)

# Build the input set and the output set, one encoder per column.
encoders, x, y = [], [], []
data = data.T
last_column = len(data) - 1
for index, column in enumerate(data):
    # The first value decides whether the column is treated as numeric.
    if column[0].isdigit():
        encoder = DigitEncoder()
    else:
        encoder = sp.LabelEncoder()
    if index < last_column:
        x.append(encoder.fit_transform(column))
    else:
        # The last column is the target.
        y = encoder.fit_transform(column)
    encoders.append(encoder)
x = np.array(x).T

# Split into training and test sets.
train_x, test_x, train_y, test_y = sm.train_test_split(
    x, y, test_size=0.25, random_state=7)

# Cross-validate on the training set, then fit on all of it.
model = svm.SVC(kernel='rbf', class_weight='balanced')
scores = sm.cross_val_score(
    model, train_x, train_y, cv=5, scoring='f1_weighted')
print('交叉验证平均得分:', scores.mean())
model.fit(train_x, train_y)

# Evaluate on the held-out test set: fraction of exact matches.
pred_test_y = model.predict(test_x)
print('预测精度:', (test_y == pred_test_y).sum() / test_y.size)

# Hand-written samples to predict.
data = [['Tuesday', '13:30:00', '21', '23'],
        ['Thursday', '13:30:00', '21', '23']]
# Encode each sample column with the encoder fitted on the matching training
# column. zip stops before the last encoder, which belongs to the target.
data = np.array(data).T
test_x = [encoder.transform(column) for encoder, column in zip(encoders, data)]
test_x = np.array(test_x).T
pred_test_y = model.predict(test_x)
# Map the predicted class codes back to the original labels.
pred_test_y = encoders[-1].inverse_transform(pred_test_y)
print('预测结果为: ', pred_test_y)

# Output of the script above:
(5040, 5) 交叉验证平均得分: 0.9458699461165295 预测精度: 0.9476190476190476 预测结果为: ['noevent' 'noevent']
'''
Case study: traffic flow prediction (regression).

Steps:
1. Preprocess the data:
   1) Read the file into the ``data`` array (all columns are kept).
   2) Encode every column: LabelEncoder for discrete values, the custom
      DigitEncoder for numeric strings. Every encoder is kept for later use.
   3) Assemble the data set and split it into training and test sets.
2. Train an SVM regressor.
3. Predict on the test set.
4. Predict traffic flow for hand-written samples.
5. Plot (this listing contains no plotting code).
'''
import numpy as np
import matplotlib.pyplot as mp
import sklearn.preprocessing as sp
import sklearn.model_selection as sm
import sklearn.svm as svm
import warnings
import sklearn.metrics as mm

warnings.filterwarnings('ignore')


class DigitEncoder:
    '''Custom encoder for columns of numeric strings.

    Exposes the same three methods of LabelEncoder that this script calls,
    so both encoder kinds can be stored and used interchangeably.
    '''

    def fit_transform(self, y):
        # Nothing to fit: the numeric string is its own code.
        return y.astype('i4')

    def transform(self, y):
        return y.astype('i4')

    def inverse_transform(self, y):
        return y.astype('str')


data = []
with open('./ml_data/traffic.txt', 'r') as f:
    for line in f:
        # Strip only the newline. Slicing with line[:-1] would cut the last
        # character of the final record when the file lacks a trailing newline.
        line = line.rstrip('\n')
        if not line:
            # A blank line would add a one-field row and make the array ragged.
            continue
        data.append(line.split(','))
data = np.array(data)

# Build the input set and the output set, one encoder per column.
encoders, x, y = [], [], []
data = data.T
last_column = len(data) - 1
for index, column in enumerate(data):
    # The first value decides whether the column is treated as numeric.
    if column[0].isdigit():
        encoder = DigitEncoder()
    else:
        encoder = sp.LabelEncoder()
    if index < last_column:
        x.append(encoder.fit_transform(column))
    else:
        # The last column is the regression target.
        y = encoder.fit_transform(column)
    encoders.append(encoder)
x = np.array(x).T

# Split into training and test sets.
train_x, test_x, train_y, test_y = sm.train_test_split(
    x, y, test_size=0.25, random_state=7)

# Support vector regression. epsilon is the width of the tube around the
# prediction inside which training errors are not penalised.
model = svm.SVR(kernel='rbf', C=10, epsilon=0.2)
model.fit(train_x, train_y)

# Evaluate on the held-out test set with the R^2 score.
pred_test_y = model.predict(test_x)
score = mm.r2_score(test_y, pred_test_y)
print('r2得分: ', score)

# Hand-written samples to predict.
data = [['Tuesday', '13:30', 'San Francisco', 'yes'],
        ['Thursday', '13:30', 'San Francisco', 'no']]
# Encode each sample column with the encoder fitted on the matching training
# column. zip stops before the last encoder, which belongs to the target.
data = np.array(data).T
test_x = [encoder.transform(column) for encoder, column in zip(encoders, data)]
test_x = np.array(test_x).T
pred_test_y = model.predict(test_x)
# NOTE(review): the recorded output shows stringified floats, so the target
# encoder is presumably a DigitEncoder -- confirm against traffic.txt.
pred_test_y = encoders[-1].inverse_transform(pred_test_y)
print('预测结果为: ', pred_test_y)

# Output of the script above:
#   r2得分: 0.6484595603352119
#   预测结果为: ['24.111978908657576' '23.61904092888905']