【作业二】林轩田机器学习技法

作业的内容主要是实现AdaBoost-Stump的算法。

作业真是给人一种扮猪吃老虎的感觉,AdaBoost-Stump原理看似很简单,但是却埋了思维陷阱。

可以通过Q12~Q18的python源码如下(训练数据是train.dat, 测试数据是test.dat)

#encoding=utf8
import sys
import numpy as np
import math
from random import *

def read_input_data(path):
    x = []
    y = []
    for line in open(path).readlines():
        items = line.strip().split(' ')
        tmp_x = []
        for i in range(0,len(items)-1): tmp_x.append(float(items[i]))
        x.append(tmp_x)
        y.append(float(items[-1]))
    return np.array(x),np.array(y)

##
# x : input x
# y : input y
# u : weighted vector from last AdaBoost iterator
def calculate_weighted_Ein(x,y,u):
    # calculate median of interval & negative infinite & positive infinite
    thetas = np.array( [float("-inf")]+[ (x[i]+x[i+1])/2 for i in range(0, x.shape[0]-1) ]+[float("inf")] )
    Ein = sum(u) # initial Ein as all wrong
    sign = 1
    target_theta = 0.0
    # positive and negative rays
    for theta in thetas:
        y_positive = np.where(x>theta,1,-1)
        y_negative = np.where(x<theta,1,-1)
        # difference between conditional stump and AdaBoost-stump
        weighted_error_positive = sum((y_positive!=y)*u)
        weighted_error_negative = sum((y_negative!=y)*u)
        if weighted_error_positive>weighted_error_negative:
            if Ein>weighted_error_negative:
                Ein = weighted_error_negative
                sign = -1
                target_theta = theta
        else:
            if Ein>weighted_error_positive:
                Ein = weighted_error_positive
                sign = 1
                target_theta = theta
    # two corner cases
    if target_theta==float("inf"):
        target_theta = 1.0
    if target_theta==float("-inf"):
        target_theta = -1.0
    # calculate scaling factor
    scalingFactor = 0.5
    errorRate = 0
    if sign==1:
        errorRate = 1.0*sum((np.where(x>target_theta,1,-1)!=y)*u)/sum(u)
        scalingFactor = math.sqrt( (1-errorRate)/errorRate )
        # update weight 
        u = scalingFactor*(np.where(x>target_theta,1,-1)!=y)*u + (np.where(x>target_theta,1,-1)==y)*u/scalingFactor
    else:
        errorRate = 1.0*sum((np.where(x<target_theta,1,-1)!=y)*u)/sum(u)
        scalingFactor = math.sqrt( (1-errorRate)/errorRate )
        # update weight 
        u = scalingFactor*(np.where(x<target_theta,1,-1)!=y)*u + (np.where(x<target_theta,1,-1)==y)*u/scalingFactor
    alpha = math.log(scalingFactor,math.e)
    # print errorRate
    return errorRate, u, alpha, target_theta, sign

def main():
    x,y = read_input_data("train.dat")
    sorted_index = []
    for i in range(0, x.shape[1]): sorted_index.append(np.argsort(x[:,i]))
    # each feature dimension has its own sample weigted vector
    u = np.ones(x.shape[0])/x.shape[0]
    u_next = u
    T = 300
    alpha = np.ones(T)
    theta = np.ones(T)
    sign = np.ones(T)
    index = np.zeros(T)
    # Q16 mini error rate
    mini_error = 1
    for t in range(0, T):
        # best parameter in iteration t
        alpha_t = 1
        theta_t = 1
        sign_t = 1
        index_t = 1
        Eu = float("inf")
        # pick best feature dimension and corresponding parameters
        for i in range(0,x.shape[1]):
            xi = x[sorted_index[i],i]
            yi = y[sorted_index[i]]
            E, ui, a, th, s = calculate_weighted_Ein(xi, yi, u[sorted_index[i]])
            # print "E:"+str(E)
            if Eu>E:
                if mini_error>E: mini_error = E
                Eu = E
                alpha_t = a
                theta_t = th
                sign_t = s
                index_t = i
                u_next = ui
        alpha[t] = alpha_t
        theta[t] = theta_t
        sign[t] = sign_t
        index[t] = index_t
        # update u corresponding to the best i
        u[sorted_index[index_t]] = u_next
        # Q12 Ein(g1)  Q14 Ut(2)
        if t==0:
            Ein = 0
            if sign[t]==1:
                Ein = 1.0*sum(np.where(x[:,index_t]>theta_t,1,-1)!=y)/x.shape[0]
            else:
                Ein = 1.0*sum(np.where(x[:,index_t]<theta_t,1,-1)!=y)/x.shape[0]
            print "Ein1:"+str(Ein)
            print "Ut2:"+str(sum(u_next))
        # Q15
        if t==T-1:
            print "UT:"+str(sum(u_next))
    # Q13 Ein(G)
    predict_y = np.zeros(x.shape[0])
    for t in range(0,T):
        if sign[t]==1:
            predict_y = predict_y + alpha[t]*np.where(x[:,index[t]]>theta[t],1,-1)
        else:
            predict_y = predict_y + alpha[t]*np.where(x[:,index[t]]<theta[t],1,-1)
    EinG = 1.0*sum(np.where(predict_y>0,1,-1)!=y)/x.shape[0]
    print "EinG:"+str(EinG)

    # Q15
    print "mini error rate:"+str(mini_error)

    # Q17 Eoutg1 Q18 EoutG
    test_x,test_y = read_input_data("test.dat")
    predict_y = np.zeros(test_x.shape[0])
    # Q17
    if sign[0]==1:
        predict_y = np.where(test_x[:,index[0]]>theta[0],1,-1)
    else:
        predict_y = np.where(test_x[:,index[0]]<theta[0],1,-1)
    Eoutg1 = sum(predict_y!=test_y)
    print "Eout1:"+str(1.0*Eoutg1/test_x.shape[0])
    # Q18
    for t in range(0,T):
        if sign[t]==1:
            predict_y = predict_y + np.where(test_x[:,index[t]]>theta[t],1,-1)*alpha[t]
        else:
            predict_y = predict_y + np.where(test_x[:,index[t]]<theta[t],1,-1)*alpha[t]
    Eout = sum(np.where(predict_y>0,1,-1)!=test_y)
    print "Eout:"+str(Eout*1.0/test_x.shape[0])


if __name__ == '__main__':
    main()

代码的效率不高,但对于了解AdaBoost-Stump原理来说还可以。

要想正确implement这个算法,需要搞清楚如下几点:

1)每一轮挑出来一个维度的feature以及相应的theta sign alpha

比如第一轮迭代后,最后的feature是x1;第二轮迭代后,发现在此轮条件下,最优的feature变成了x2了...

2)每轮迭代完成后,每个sample的weight即u都会更新,最大的坑就在这里:

比如,这一轮迭代的是xi,那么传入函数的样本已经按照xi这个维度进行排序了,自然返回的u也是与排序后的xi一一对应的;接着问题来了,如果下一轮迭代的是xi+1,那么传入函数的样本就会按照xi+1这个维度进行排序了,这时如果把上一轮的u原封不动的传入就跳坑里了(原因是,此时的u是根据上一轮xi这个维度排序后与sample一一对应的,而这一轮sample是按照xi+1这个维度进行排序的,上一轮的u与这一轮的sample就对应不上了,错位了)。

其中2)更容易陷进去,自己纠结了小半天才看出来(已在code中用红字标出来),自己的解决办法是,把每一轮选出来的ui保存下来,以及xi排序后的下标也保存下来,更新u。这样在下一轮迭代(每轮迭代都遍历x的每个维度)中,u的每个分量就真的与原始样本x的每个sample一一对应了,不会序号错乱了。

posted on 2015-08-01 17:22  承续缘  阅读(1272)  评论(0编辑  收藏  举报

导航