【作业一】林轩田机器学习技术

关注的是编程题目Q15~Q20

这里需要借用libsvm的python包,需要以下几个处理步骤:

(1)到libsvm官网(http://www.csie.ntu.edu.tw/~cjlin/libsvm/)下载.gz包,解压缩到一个目录

(2)在解压的根目录下执行make命令

(3)再到解压缩根目录下的python文件夹中执行make命令

(4)把根目录下的libsvm.so.2,python文件夹下的svm.py和svmutil.py,三个文件提取出来,存放到一个新的文件夹(我的文件夹命名为libsvm)下面

(5)为了让libsvm目录被识别为一个包,需要建立名为__init__.py的空文件

(6)修改svm.py的Line 19为 “libsvm = CDLL(path.join(dirname, 'libsvm.so.2'))”

(7)如果要在新建的.py文件中使用libsvm的包,需要加入如下的语句(sys.path.append中的路径需替换为你建立libsvm目录的路径):

      import sys
      sys.path.append('/Users/xiabofei/Documents/libsvm')
      from svmutil import *

整体代码如下(读数据共用一个函数,其余的按照每个Question的要求,各自写成一块)

#encoding=utf8
import sys
sys.path.append('/Users/xiabofei/Documents/libsvm')
from svmutil import *
import numpy as np
import math
from random import *

# read raw data from a local file and relabel it as target_class-vs-rest;
# the features are returned unscaled unless scale=True is passed
def read_input_data(path, target_class, scale=False):
    """Load a whitespace-separated data file as a one-vs-rest binary problem.

    Every non-blank line is read as ``label feature_1 ... feature_k``, with
    all fields parsed as floats. Blank lines are skipped.

    Args:
        path: Path of the text file to read.
        target_class: Label value mapped to +1; every other label becomes -1.
        scale: When True, min-max scale each feature column to [0, 1] in
            place. Defaults to False, which returns the raw feature values.

    Returns:
        A tuple ``(x, y, x_size)``: ``x`` is a list of feature rows (lists of
        floats), ``y`` the matching list of +1/-1 labels, and ``x_size`` the
        number of feature columns in the first data row (0 when the file
        holds no data rows).

    Raises:
        ValueError: If a field is not numeric, or if ``scale`` is True and
            the rows do not all have ``x_size`` features.
    """
    x = []
    y = []
    # a single pass under ``with`` closes the file even if parsing fails
    with open(path) as f:
        for line in f:
            items = line.split()
            if not items:
                continue
            y.append(1 if float(items[0]) == target_class else -1)
            x.append([float(item) for item in items[1:]])
    x_size = len(x[0]) if x else 0

    if scale and x:
        if any(len(row) != x_size for row in x):
            raise ValueError("cannot scale rows with differing feature counts")
        # min and max are computed independently per column, so a value that
        # is both the smallest and the largest seen so far counts for both
        columns = list(zip(*x))
        lows = [min(col) for col in columns]
        highs = [max(col) for col in columns]
        for row in x:
            for i, val in enumerate(row):
                span = highs[i] - lows[i]
                # a constant column has zero span; map it to 0.0 rather than
                # dividing by zero
                row[i] = (val - lows[i]) / span if span else 0.0
    return x, y, x_size


if __name__ == '__main__':
    # Driver for the homework questions. Only the Q20 section below is live;
    # the triple-quoted strings are earlier experiments kept disabled as
    # plain string expressions (they have no effect at runtime).
    '''
    print "success import"
    y,x = svm_read_problem('heart_scale')
    model = svm_train(y,x,'-c 4')
    print "success train"
    p_label, p_acc, p_val = svm_predict(y, x, model)
    print "success predict"
    '''
    ## Q15~Q17
    '''
    fw = open('result','w')
    # read raw data & reset labels according to the problem
    max_sum_of_alhpha = float("-inf")
    for target_class in range(0,10,2):
        x,y,x_size = read_input_data('train.dat',target_class)
        problem = svm_problem(y,x)
        # set SVM parameter
        params = svm_parameter('-c 0.01 -t 1 -g 1 -r 1')
        model = svm_train(problem, params)
        svm_save_model('model',model)
        # get W
        f = open('model')
        W = [0 for i in range(x_size)];
        sum_of_alpha = 0
        if_reach_SV = False
        for line in f.readlines():
            if line.strip()=="SV":
                if_reach_SV = True
                continue
            if if_reach_SV:
                items = line.strip().split(' ')
                alphan_yn = float(items[0])
                sum_of_alpha = sum_of_alpha + abs(alphan_yn)
                for i in range(x_size):
                    W[i] = W[i] + alphan_yn*float(items[i+1].split(':')[1].strip())
        fw.writelines(str(sum_of_alpha)+'\n')
        max_sum_of_alhpha = sum_of_alpha if sum_of_alpha>max_sum_of_alhpha else max_sum_of_alhpha
        f.close()
        #test_x,test_y,test_x_size = read_input_data('test.dat',target_class)
        #if x_size!= test_x_size: sys.exit(-1)
        p_label, p_acc, p_val = svm_predict(y, x, model)
        fw.writelines("class:"+str(target_class)+";Ein:"+str(1-p_acc[0]/100.0)+'\n')
    fw.writelines(str(max_sum_of_alhpha)+'\n')
    fw.close()
    '''
    '''
    ## Q18
    fw = open('result','w')
    c = 0.001
    while c<=10:
        x,y,x_size = read_input_data('train.dat',0)
        problem = svm_problem(y,x)
        params = svm_parameter("-c "+str(c)+" -t 2 -g 100")
        model = svm_train(problem, params)
        SV = model.get_SV()
        print SV
        svm_save_model('model',model)
        test_x,test_y,test_x_size = read_input_data('test.dat',0)
        if x_size!= test_x_size: sys.exit(-1)
        # calculate Eout
        p_label, p_acc, p_val = svm_predict(test_y, test_x, model)
        fw.writelines("Eout:"+str(1-p_acc[0]/100.0)+'\n')
        # calculate sum of 
        p_label, p_acc, p_val = svm_predict(test_y, test_x, model)
        c = c*10
    fw.close()
    '''
    '''
    ## Q19
    fw = open('result','w')
    gamma = 1
    x,y,x_size = read_input_data('train.dat',0)
    test_x,test_y,test_x_size = read_input_data('test.dat',0)
    if test_x_size!=x_size: sys.exit(-1)
    while gamma<=10000:
        problem = svm_problem(y,x)
        params = svm_parameter("-c 0.1 -t 2 -g "+str(gamma))
        model = svm_train(problem, params)
        p_label, p_acc, p_val = svm_predict(test_y, test_x, model)
        fw.writelines("gamma:"+str(gamma)+"\tEout:"+str(1-p_acc[0]/100.0)+'\n')
        gamma = gamma * 10
    fw.close()
    '''
    ## Q20
    # Repeat T random train/validation splits of train.dat (class 0 vs rest).
    # For each split, train one model per gamma in {1, 10, ..., 10000} and
    # count how often each gamma reaches the lowest validation error.
    T = 50
    test_size = 1000
    gamma_minEvalTimes = dict()
    x,y,x_size = read_input_data('train.dat',0)
    # the validation rows are drawn without replacement, so at least one row
    # must be left over for training
    if test_size >= len(x):
        sys.exit("train.dat needs more than "+str(test_size)+" examples")
    # ``with`` closes the result file even if training or prediction raises
    with open('result','w') as fw:
        for t in range(T):
            # prepare train and test data: pick test_size distinct row
            # indexes for validation, everything else is used for training
            test_indexs = np.random.permutation(len(x))[:test_size]
            held_out = set(test_indexs.tolist())
            train_x = []
            train_y = []
            for j in range(len(x)):
                if j not in held_out:
                    train_x.append(x[j])
                    train_y.append(y[j])
            test_x = [x[j] for j in test_indexs]
            test_y = [y[j] for j in test_indexs]
            problem = svm_problem(train_y,train_x)
            # find which gamma performs best on Eval; the strict comparison
            # keeps the smallest gamma when several tie
            min_Eval = float("inf")
            min_gamma = -1
            gamma = 1
            while gamma<=10000:
                params = svm_parameter("-c 0.1 -t 2 -g "+str(gamma))
                model = svm_train(problem, params)
                p_label, p_acc, p_val = svm_predict(test_y, test_x, model)
                # p_acc[0] is treated as an accuracy percentage here
                Eval = 1-p_acc[0]/100.0
                fw.write("gamma:"+str(gamma)+"\t Eval:"+str(Eval)+'\n')
                if min_Eval>Eval:
                    min_Eval = Eval
                    min_gamma = gamma
                gamma = gamma * 10
            # update each gamma's best perform times
            gamma_minEvalTimes[min_gamma] = gamma_minEvalTimes.get(min_gamma, 0) + 1
        # one summary line per gamma, in ascending gamma order
        for k in sorted(gamma_minEvalTimes):
            fw.write("gamma:"+str(k)+"\ttimes:"+str(gamma_minEvalTimes[k])+'\n')
    '''
    print W
    w_1_F = 0
    for i in range(x_size):
        w_1_F = w_1_F + math.pow(W[i],2)
    print math.sqrt(w_1_F)
    '''

    '''
    print len(x)
    print len(y)
    for i in range(len(x)):
        out_str = str(y[i])
        for j in range(len(x[i])):
            out_str = out_str + '\t' + str(x[i][j])
        print out_str
    '''

 

posted on 2015-07-07 22:38  承续缘  阅读(1578)  评论(2)  编辑  收藏  举报

导航