机器学习之logistic回归

import numpy as np
from matplotlib import pyplot as plt


def load_dataset():
    """Load the 2-D demo dataset from testSet.txt.

    Each line holds 'x1 x2 label' separated by whitespace. A constant
    1.0 is prepended to every feature row to act as the bias term.

    Returns:
        (features, labels): list of [1.0, x1, x2] rows and list of int labels.
    """
    features, labels = [], []
    with open('testSet.txt', 'r') as fh:
        for raw in fh:
            parts = raw.split()
            features.append([1.0, float(parts[0]), float(parts[1])])
            labels.append(int(parts[2]))
    return features, labels


def sigmoid(x):
    """Numerically safe logistic function 1 / (1 + e^(-x)).

    Works on scalars, ndarrays and np.matrix inputs alike. The argument
    is clipped to +/-709 (the largest magnitude np.exp accepts without
    overflowing a float64), so extreme inputs yield 0.0/1.0 instead of
    emitting an overflow RuntimeWarning as the unclipped form does.
    """
    return 1.0 / (1.0 + np.exp(-np.clip(x, -709, 709)))


def grad_ascent(data_list, label_list):
    """Batch gradient ascent for logistic-regression weights.

    Runs a fixed number of full-batch updates with a constant step size.

    Args:
        data_list: list of feature rows (bias term included as column 0).
        label_list: list of 0/1 class labels, one per row.

    Returns:
        np.matrix of shape (n_features, 1) with the learned weights.
    """
    STEP = 0.001       # constant learning rate
    N_ITER = 500       # fixed iteration budget
    x = np.matrix(data_list)
    y = np.matrix(label_list).transpose()
    n_features = x.shape[1]
    w = np.ones((n_features, 1))
    for _ in range(N_ITER):
        # Gradient of the log-likelihood: X^T (y - sigmoid(Xw)).
        residual = y - sigmoid(x * w)
        w = w + STEP * x.transpose() * residual
    return w


def plot_best_fit(weight):
    """Scatter the two classes of testSet.txt and draw the fitted boundary.

    The decision boundary is where w0 + w1*x + w2*y = 0, so the line is
    y = (-w0 - w1*x) / w2.

    Args:
        weight: 2-D (3, 1) weight array/matrix indexable as weight[i, 0].
    """
    data_list, label_list = load_dataset()
    points = np.array(data_list)
    pos = [i for i, lab in enumerate(label_list) if lab == 1]
    neg = [i for i, lab in enumerate(label_list) if lab != 1]
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(points[pos, 1], points[pos, 2], s=10, c='red', marker='s')
    ax.scatter(points[neg, 1], points[neg, 2], s=10, c='green')
    xs = np.arange(-3.0, 3.0, 0.1)
    ys = (-weight[0, 0] - weight[1, 0] * xs) / weight[2, 0]
    ax.plot(xs, ys)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()


def random_grad_ascent(data_list, label_list):
    """Stochastic gradient ascent: one weight update per training sample.

    Fix: the previous version computed a *full-batch* gradient m times,
    which is neither stochastic nor cheaper than grad_ascent. The
    canonical SGD pass updates the weights from one sample at a time.

    Args:
        data_list: list of feature rows (bias term included as column 0).
        label_list: list of 0/1 class labels, one per row.

    Returns:
        np.matrix of shape (n_features, 1) with the learned weights.
    """
    data_mat = np.matrix(data_list)
    label_mat = np.matrix(label_list).transpose()
    m, n = data_mat.shape
    alpha = 0.01
    weight = np.ones((n, 1))
    for i in range(m):
        # Scalar prediction and error for sample i only.
        h = sigmoid(data_mat[i] * weight)
        error = label_mat[i] - h
        weight = weight + alpha * data_mat[i].transpose() * error
    return weight


def random_grad_ascent1(data_list, label_list, num=150):
    """Improved stochastic gradient ascent with decaying step size.

    Each of the `num` passes visits every sample exactly once in random
    order (sampling without replacement), and the learning rate decays
    as 4/(1+i+j) + 0.01 so it shrinks over time but never reaches zero.

    Fixes over the previous version:
      * data_index was a range, so the intended `del` (sampling without
        replacement) could not work and every draw was with replacement;
      * the label was fetched with the raw random position instead of
        the selected sample index, mismatching sample and label once
        deletion is active.

    Args:
        data_list: list of feature rows (bias term included as column 0).
        label_list: list of 0/1 class labels, one per row.
        num: number of full passes over the data (default 150).

    Returns:
        np.matrix of shape (n_features, 1) with the learned weights.
    """
    data_mat = np.matrix(data_list)
    label_mat = np.matrix(label_list).transpose()
    m, n = data_mat.shape
    weight = np.ones((n, 1))
    for i in range(num):
        data_index = list(range(m))  # list so visited indices can be removed
        for j in range(m):
            alpha = 4 / (1.0 + i + j) + 0.01
            rand_pos = int(np.random.uniform(0, len(data_index)))
            sample = data_index[rand_pos]  # same index for data AND label
            h = sigmoid(data_mat[sample] * weight)
            error = label_mat[sample] - h
            weight = weight + alpha * data_mat[sample].transpose() * error
            del data_index[rand_pos]  # without replacement within a pass
    return weight


def classify_vector(x, weight):
    """Classify one feature vector with the learned weights.

    Args:
        x: 1-D feature array (bias term included).
        weight: weight vector compatible with x for elementwise product.

    Returns:
        1.0 if the predicted probability exceeds 0.5, else 0.0.
    """
    score = sum(x * weight)
    if sigmoid(score) > 0.5:
        return 1.0
    return 0.0


def _parse_colic_line(line):
    """Split one tab-separated horse-colic record into (features, label)."""
    fields = line.strip().split("\t")
    # Columns 0-20 are features; column 21 is the class label.
    return [float(v) for v in fields[:21]], float(fields[21])


def colic_test():
    """Train on horseColicTraining.txt and evaluate on horseColicTest.txt.

    Trains logistic-regression weights via random_grad_ascent1 and
    measures the misclassification rate on the held-out test file.

    Returns:
        The error rate as a float (0.0 if the test file is empty, which
        previously raised ZeroDivisionError).
    """
    train_set = []
    train_label = []
    with open('horseColicTraining.txt', 'r') as f:
        for line in f:
            features, label = _parse_colic_line(line)
            train_set.append(features)
            train_label.append(label)
    train_weight = random_grad_ascent1(data_list=train_set, label_list=train_label)
    error_count = 0
    num_test_vec = 0
    with open('horseColicTest.txt', 'r') as f:
        for line in f:
            num_test_vec += 1
            features, label = _parse_colic_line(line)
            # Parse the label through float first: int("1.0") would raise,
            # int(float("1.0")) does not.
            if int(classify_vector(np.array(features), train_weight)) != int(label):
                error_count += 1
    if num_test_vec == 0:
        return 0.0  # avoid division by zero on an empty test file
    return error_count / num_test_vec


def multi_test():
    """Run colic_test ten times and print the averaged error rate.

    random_grad_ascent1 shuffles samples, so each run yields a slightly
    different error rate; averaging gives a steadier estimate.
    """
    num_test = 10
    error_sum = sum(colic_test() for _ in range(num_test))
    print(f"num_test={num_test}, error_sum={error_sum},error_rate={error_sum / num_test}")


if __name__ == '__main__':
    # Fit the toy 2-D dataset with each trainer and plot its boundary,
    # then run the averaged horse-colic benchmark.
    data_list, label_list = load_dataset()
    for trainer in (grad_ascent, random_grad_ascent, random_grad_ascent1):
        plot_best_fit(trainer(data_list, label_list))
    multi_test()

其他logistic示例或者基于主流机器学习框架实现的logistic代码地址:

https://gitee.com/navysummer/machine-learning/tree/master/logistic

  

posted @ 2024-06-30 23:52  NAVYSUMMER  阅读(7)  评论(0编辑  收藏  举报
交流群 编程书籍