Machine learning: logistic regression
import numpy as np


def load_dataset():
    """Load the 2-D demo data set from testSet.txt.

    Each line of the file is "x1 x2 label"; a constant 1.0 is prepended to
    every sample as the bias feature.

    Returns:
        tuple[list, list]: (data_list, label_list) where each data row is
        [1.0, x1, x2] and each label is an int (0 or 1).
    """
    data_list = []
    label_list = []
    with open('testSet.txt', 'r') as f:
        for line in f.readlines():
            line_arr = line.strip().split()
            data_list.append([1.0, float(line_arr[0]), float(line_arr[1])])
            label_list.append(int(line_arr[2]))
    return data_list, label_list


def sigmoid(x):
    """Logistic function 1 / (1 + e^-x); applies element-wise to arrays."""
    return 1.0 / (1.0 + np.exp(-x))


def grad_ascent(data_list, label_list):
    """Train logistic-regression weights with full-batch gradient ascent.

    Args:
        data_list: list of feature rows (bias term included).
        label_list: list of 0/1 labels, one per row.

    Returns:
        np.matrix: (n, 1) weight column vector.
    """
    alpha = 0.001      # learning rate
    max_cycles = 500   # fixed number of full-batch updates
    data_mat = np.matrix(data_list)
    label_mat = np.matrix(label_list).transpose()
    m, n = data_mat.shape
    weight = np.ones((n, 1))
    for _ in range(max_cycles):
        h = sigmoid(data_mat * weight)   # (m, 1) predicted probabilities
        error = label_mat - h            # gradient of log-likelihood w.r.t. scores
        weight = weight + alpha * data_mat.transpose() * error
    return weight


def plot_best_fit(weight):
    """Scatter-plot the data set and draw the decision boundary.

    The boundary is where w0 + w1*x1 + w2*x2 = 0, i.e.
    x2 = (-w0 - w1*x1) / w2.

    Args:
        weight: (3, 1) weight matrix from one of the training functions.
    """
    # Imported lazily so the rest of the module works without a display/matplotlib.
    from matplotlib import pyplot as plt

    data_list, label_list = load_dataset()
    data_arr = np.array(data_list)
    n = data_arr.shape[0]
    x_cord1 = []
    y_cord1 = []
    x_cord2 = []
    y_cord2 = []
    for i in range(n):
        if label_list[i] == 1:
            x_cord1.append(data_arr[i, 1])
            y_cord1.append(data_arr[i, 2])
        else:
            x_cord2.append(data_arr[i, 1])
            y_cord2.append(data_arr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(x_cord1, y_cord1, s=10, c='red', marker='s')
    ax.scatter(x_cord2, y_cord2, s=10, c='green')
    x = np.arange(-3.0, 3.0, 0.1)
    y = (-weight[0, 0] - weight[1, 0] * x) / weight[2, 0]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()


def random_grad_ascent(data_list, label_list):
    """Train weights with stochastic gradient ascent (one pass, one sample per step).

    The previous version multiplied the WHOLE data matrix by the weights on
    every iteration, which is batch gradient ascent (just `grad_ascent` with
    fewer cycles) — nothing stochastic. Each of the m updates now uses a
    single sample, as the algorithm's name implies.

    Args:
        data_list: list of feature rows (bias term included).
        label_list: list of 0/1 labels.

    Returns:
        np.matrix: (n, 1) weight column vector.
    """
    data_mat = np.matrix(data_list)
    label_mat = np.matrix(label_list).transpose()
    m, n = data_mat.shape
    alpha = 0.01
    weight = np.ones((n, 1))
    for i in range(m):
        h = sigmoid(data_mat[i] * weight)   # (1, 1) prediction for sample i
        error = label_mat[i] - h
        weight = weight + alpha * data_mat[i].transpose() * error
    return weight


def random_grad_ascent1(data_list, label_list, num=150):
    """Improved stochastic gradient ascent: decaying alpha, random sample order.

    Fixes over the previous version:
    - `data_index` is now a list (a `range` does not support `del` in
      Python 3, which is why the removal line had been commented out and
      samples were drawn WITH replacement).
    - One shared sample index is used for both the features and the label;
      previously `label_mat` was indexed by the raw random position while
      `data_mat` went through the index list, which would mismatch
      data and labels once elements are removed.

    Args:
        data_list: list of feature rows.
        label_list: list of 0/1 labels.
        num: number of passes over the data set.

    Returns:
        np.matrix: (n, 1) weight column vector.
    """
    data_mat = np.matrix(data_list)
    label_mat = np.matrix(label_list).transpose()
    m, n = data_mat.shape
    weight = np.ones((n, 1))
    for i in range(num):
        data_index = list(range(m))  # list so chosen samples can be removed
        for j in range(m):
            # Decaying learning rate; the +0.01 floor keeps it from vanishing.
            alpha = 4 / (1.0 + i + j) + 0.01
            rand_pos = int(np.random.uniform(0, len(data_index)))
            sample = data_index[rand_pos]
            h = sigmoid(data_mat[sample] * weight)
            error = label_mat[sample] - h
            weight = weight + alpha * data_mat[sample].transpose() * error
            del data_index[rand_pos]  # sample without replacement within a pass
    return weight


def classify_vector(x, weight):
    """Classify one feature vector with trained weights.

    Args:
        x: 1-D feature array (same length as weight).
        weight: (n, 1) weight column from a training function.

    Returns:
        float: 1.0 if the predicted probability exceeds 0.5, else 0.0.
    """
    # np.sum reduces to a true scalar; builtin sum() over a (1,1) np.matrix
    # would return a row matrix instead.
    prob = sigmoid(np.sum(x * weight))
    return 1.0 if prob > 0.5 else 0.0


def colic_test():
    """Train on horseColicTraining.txt, evaluate on horseColicTest.txt.

    Each tab-separated line holds 21 features followed by the label in
    column 21.

    Returns:
        float: error rate on the test file.
    """
    with open('horseColicTraining.txt', 'r') as f:
        train_set = []
        train_label = []
        for line in f.readlines():
            line_arr1 = line.strip().split("\t")
            line_arr2 = [float(_) for _ in line_arr1[:21]]
            train_set.append(line_arr2)
            train_label.append(float(line_arr1[21]))
    train_weight = random_grad_ascent1(data_list=train_set, label_list=train_label)
    error_count = 0
    num_test_vec = 0.0
    with open('horseColicTest.txt', 'r') as f:
        for line in f.readlines():
            num_test_vec += 1.0
            line_arr1 = line.strip().split("\t")
            line_arr2 = [float(_) for _ in line_arr1[:21]]
            if int(classify_vector(np.array(line_arr2), train_weight)) != int(line_arr1[21]):
                error_count += 1
    error_rate = error_count / num_test_vec
    return error_rate


def multi_test():
    """Run colic_test() several times and report the average error rate."""
    num_test = 10
    error_sum = 0.0
    for i in range(num_test):
        error_sum += colic_test()
    print(f"num_test={num_test}, error_sum={error_sum},error_rate={error_sum / num_test}")


if __name__ == '__main__':
    data_list, label_list = load_dataset()
    weight1 = grad_ascent(data_list, label_list)
    plot_best_fit(weight1)
    weight2 = random_grad_ascent(data_list, label_list)
    plot_best_fit(weight2)
    weight3 = random_grad_ascent1(data_list, label_list, num=150)
    plot_best_fit(weight3)
    multi_test()
Other logistic-regression examples, and implementations built on mainstream machine-learning frameworks, are available at:
https://gitee.com/navysummer/machine-learning/tree/master/logistic