【作业一】林轩田机器学习技术
关注的是编程题目Q15~Q20
这里需要借用libsvm的python包,需要以下几个处理步骤:
(1)到libsvm官网(http://www.csie.ntu.edu.tw/~cjlin/libsvm/)下载.gz包,解压缩到一个目录
(2)在解压后的根目录下执行make命令
(3)再到解压后根目录下的python文件夹中执行make命令
(4)把根目录下的libsvm.so.2,python文件夹下的svm.py和svmutil.py,三个文件提取出来,存放到一个新的文件夹(我的文件夹命名为libsvm)下面
(5)为了让libsvm目录被识别为一个包,需要建立名为__init__.py的空文件
(6)修改svm.py的Line 19为 “libsvm = CDLL(path.join(dirname, 'libsvm.so.2'))”
(7)如果要在新建的.py文件中使用libsvm的包,需要加入如下的语句(蓝色字的部分为你建立libsvm目录的路径):
import sys
sys.path.append('/Users/xiabofei/Documents/libsvm')
from svmutil import *
整体代码如下(读数据共用一个函数,其余的按照每个Question的要求,各自写成一块)
#encoding=utf8
# libsvm experiments for programming questions Q15-Q20.
# One shared data loader plus one function per question; running the file as
# a script executes Q20 only.
import sys
sys.path.append('/Users/xiabofei/Documents/libsvm')
try:
    from svmutil import *
except ImportError:
    # Keep the module importable without the libsvm bindings so that
    # read_input_data() can be reused on its own; every experiment calls
    # _require_libsvm() before it trains anything.
    svm_train = None
import numpy as np
import math
from random import *

# Hyper-parameter grids swept by the experiments.
C_GRID = (0.001, 0.01, 0.1, 1.0, 10.0)
GAMMA_GRID = (1, 10, 100, 1000, 10000)


def _require_libsvm():
    """Raise ImportError if the libsvm bindings could not be imported."""
    if svm_train is None:
        raise ImportError(
            "libsvm Python bindings (svmutil) were not found on sys.path")


def _min_max_scale(rows, width):
    """Rescale each of the first *width* columns of *rows* to [0, 1] in place.

    A constant column is mapped to 0.0 instead of dividing by zero.
    """
    for row in rows:
        if len(row) != width:
            raise ValueError(
                "expected %d features per row, got %d" % (width, len(row)))
    for col in range(width):
        values = [row[col] for row in rows]
        low = min(values)
        span = max(values) - low
        for row in rows:
            row[col] = (row[col] - low) / span if span else 0.0


def read_input_data(path, target_class, scale=False):
    """Load a whitespace-separated data file as a one-vs-rest problem.

    Every non-blank line is read as ``label feature_1 ... feature_d``.

    Args:
        path: file to read.
        target_class: rows whose label equals this value get y = 1, all
            other rows get y = -1.
        scale: when True, min-max scale every feature column to [0, 1].
            Defaults to False, which returns the raw feature values.

    Returns:
        A tuple ``(x, y, x_size)``: ``x`` is a list of feature lists, ``y``
        the matching list of +1/-1 labels, and ``x_size`` the number of
        features on the first data row (0 if the file has no data rows).

    Raises:
        ValueError: a field is not a number, or ``scale`` is True and the
            rows do not all have ``x_size`` features.
    """
    x = []
    y = []
    with open(path) as data_file:
        for line in data_file:
            items = line.split()
            if not items:
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            y.append(1 if float(items[0]) == target_class else -1)
            x.append([float(item) for item in items[1:]])
    x_size = len(x[0]) if x else 0
    if scale:
        _min_max_scale(x, x_size)
    return x, y, x_size


def _load_train_test(train_path, test_path, target_class):
    """Load the train and test files and check they have the same width."""
    x, y, x_size = read_input_data(train_path, target_class)
    test_x, test_y, test_x_size = read_input_data(test_path, target_class)
    if x_size != test_x_size:
        raise ValueError(
            "train has %d features but test has %d" % (x_size, test_x_size))
    return x, y, test_x, test_y


def _weights_from_model_file(model_path, x_size):
    """Return ``(w, sum_abs_alpha)`` parsed from a saved libsvm model file.

    Every line after the ``SV`` marker is read as
    ``alpha_n*y_n index:value index:value ...``.  ``w`` accumulates
    ``alpha_n*y_n * x_n``, which is the primal weight vector only for a
    linear kernel.
    """
    w = [0.0] * x_size
    sum_abs_alpha = 0.0
    in_sv_section = False
    with open(model_path) as model_file:
        for line in model_file:
            items = line.split()
            if not in_sv_section:
                in_sv_section = items == ["SV"]
                continue
            if not items:
                continue
            coef = float(items[0])
            sum_abs_alpha += abs(coef)
            for item in items[1:]:
                index, value = item.split(':')
                # Use the stored index rather than the position so a support
                # vector with omitted (zero) features is still parsed.
                # NOTE(review): assumes 1-based feature indices - confirm
                # against the libsvm version in use.
                w[int(index) - 1] += coef * float(value)
    return w, sum_abs_alpha


def smoke_test(data_path='heart_scale'):
    """Train and predict on a libsvm sample file to check the installation."""
    _require_libsvm()
    print("success import")
    y, x = svm_read_problem(data_path)
    model = svm_train(y, x, '-c 4')
    print("success train")
    svm_predict(y, x, model)
    print("success predict")


def print_dataset(x, y):
    """Print the dataset sizes, then one tab-separated row per example."""
    print(len(x))
    print(len(y))
    for features, label in zip(x, y):
        print('\t'.join([str(label)] + [str(value) for value in features]))


def run_q15_to_q17(train_path='train.dat', result_path='result',
                   model_path='model'):
    """Q15-Q17: one-vs-rest SVMs for the even classes 0, 2, 4, 6, 8.

    For each class, writes sum(|alpha_n*y_n|) and Ein to *result_path* and
    prints w and its Euclidean norm; the largest sum is written last.
    """
    _require_libsvm()
    max_sum_of_alpha = float("-inf")
    with open(result_path, 'w') as fw:
        for target_class in range(0, 10, 2):
            x, y, x_size = read_input_data(train_path, target_class)
            problem = svm_problem(y, x)
            # NOTE(review): no '-d' is given, so libsvm's default polynomial
            # degree applies - confirm this is the degree the question wants.
            params = svm_parameter('-c 0.01 -t 1 -g 1 -r 1')
            model = svm_train(problem, params)
            # The coefficients are read back from the saved model file.
            svm_save_model(model_path, model)
            w, sum_of_alpha = _weights_from_model_file(model_path, x_size)
            fw.write(str(sum_of_alpha) + '\n')
            max_sum_of_alpha = max(max_sum_of_alpha, sum_of_alpha)
            _, p_acc, _ = svm_predict(y, x, model)
            fw.write("class:" + str(target_class)
                     + ";Ein:" + str(1 - p_acc[0] / 100.0) + '\n')
            print(w)
            print(math.sqrt(sum(w_i * w_i for w_i in w)))
        fw.write(str(max_sum_of_alpha) + '\n')


def run_q18(train_path='train.dat', test_path='test.dat',
            result_path='result', model_path='model'):
    """Q18: RBF kernel with gamma=100, class 0 vs rest, sweeping C.

    Prints the support vectors and writes Eout for each C in C_GRID.
    """
    _require_libsvm()
    # The data does not depend on C, so it is loaded once outside the sweep.
    x, y, test_x, test_y = _load_train_test(train_path, test_path, 0)
    problem = svm_problem(y, x)
    with open(result_path, 'w') as fw:
        for c in C_GRID:
            params = svm_parameter("-c " + str(c) + " -t 2 -g 100")
            model = svm_train(problem, params)
            print(model.get_SV())
            svm_save_model(model_path, model)
            _, p_acc, _ = svm_predict(test_y, test_x, model)
            fw.write("Eout:" + str(1 - p_acc[0] / 100.0) + '\n')


def run_q19(train_path='train.dat', test_path='test.dat',
            result_path='result'):
    """Q19: RBF kernel with C=0.1, class 0 vs rest, sweeping gamma.

    Writes Eout for each gamma in GAMMA_GRID.
    """
    _require_libsvm()
    x, y, test_x, test_y = _load_train_test(train_path, test_path, 0)
    problem = svm_problem(y, x)
    with open(result_path, 'w') as fw:
        for gamma in GAMMA_GRID:
            params = svm_parameter("-c 0.1 -t 2 -g " + str(gamma))
            model = svm_train(problem, params)
            _, p_acc, _ = svm_predict(test_y, test_x, model)
            fw.write("gamma:" + str(gamma)
                     + "\tEout:" + str(1 - p_acc[0] / 100.0) + '\n')


def run_q20(train_path='train.dat', result_path='result', rounds=50,
            val_size=1000):
    """Q20: pick gamma by validation error over repeated random splits.

    In each round, *val_size* distinct examples are held out for validation,
    an RBF SVM (C=0.1) is trained on the rest for every gamma in GAMMA_GRID,
    and the gamma with the lowest validation error wins the round (the
    smallest gamma wins ties).  Per-gamma errors and the final win counts
    are written to *result_path*.

    Raises:
        ValueError: *val_size* exceeds the number of training examples.
    """
    _require_libsvm()
    x, y, _ = read_input_data(train_path, 0)
    best_gamma_counts = {}
    with open(result_path, 'w') as fw:
        for _ in range(rounds):
            # Sample WITHOUT replacement so the validation set really holds
            # val_size distinct examples.
            val_indexes = np.random.choice(
                len(x), val_size, replace=False).tolist()
            held_out = set(val_indexes)  # O(1) membership tests
            train_x = [row for i, row in enumerate(x) if i not in held_out]
            train_y = [lab for i, lab in enumerate(y) if i not in held_out]
            val_x = [x[i] for i in val_indexes]
            val_y = [y[i] for i in val_indexes]
            problem = svm_problem(train_y, train_x)
            # Find the gamma with the lowest validation error this round.
            min_eval = float("inf")
            min_gamma = -1
            for gamma in GAMMA_GRID:
                params = svm_parameter("-c 0.1 -t 2 -g " + str(gamma))
                model = svm_train(problem, params)
                _, p_acc, _ = svm_predict(val_y, val_x, model)
                eval_error = 1 - p_acc[0] / 100.0
                fw.write("gamma:" + str(gamma)
                         + "\t Eval:" + str(eval_error) + '\n')
                if eval_error < min_eval:
                    min_eval = eval_error
                    min_gamma = gamma
            best_gamma_counts[min_gamma] = (
                best_gamma_counts.get(min_gamma, 0) + 1)
        for gamma, times in sorted(best_gamma_counts.items()):
            # One summary entry per line.
            fw.write("gamma:" + str(gamma) + "\ttimes:" + str(times) + '\n')


if __name__ == '__main__':
    # Only Q20 is enabled; call smoke_test(), run_q15_to_q17(), run_q18() or
    # run_q19() here instead to reproduce the other experiments.
    run_q20()