Support Vector Machines
#much of the text below is quoted from
#http://blog.csdn.net/Felomeng/archive/2009/04/09/4058669.aspx
#and various other places
from svm import *
#a problem with 3 classes
labels = [0, 1, 1, 2]
samples = [[0, 0], [0, 1], [1, 0], [1, 1]]
"""
you can picture it like this:
B-C
| |
A-B
"""
problem = svm_problem(labels, samples)
#the struct svm_problem formalizes the problem:
#struct svm_problem
#{
# int l;
# double *y;
# struct svm_node **x;
#};
#
#
#Here "l" is the number of training instances,
#"y" is an array holding their target values (integer class labels for classification, real values for regression), and
#"x" is an array of pointers, each pointing to one sparse training vector (i.e. an array of svm_node).
#
#For example, given the following training data:
#
#
#LABEL ATTR1 ATTR2 ATTR3 ATTR4 ATTR5
#----- ----- ----- ----- ----- -----
# 1 0 0.1 0.2 0 0
# 2 0 0.1 0.3 -1.2 0
# 1 0.4 0 0 0 0
# 2 0 0.1 0 1.4 0.5
# 3 -0.1 -0.2 0.1 1.1 0.1
#
#
#the svm_problem is laid out as follows:
#l = 5
#y -> 1 2 1 2 3
#x -> [ ] -> (2,0.1) (3,0.2) (-1,?)
# [ ] -> (2,0.1) (3,0.3) (4,-1.2) (-1,?)
# [ ] -> (1,0.4) (-1,?)
# [ ] -> (2,0.1) (4,1.4) (5,0.5) (-1,?)
# [ ] -> (1,-0.1) (2,-0.2) (3,0.1) (4,1.1) (5,0.1) (-1,?)
#
#where each (index, value) pair is stored in the struct "svm_node":
#struct svm_node{
# int index;
# double value;
#};
#
#An index of -1 marks the end of a vector. Note that indices must be in ascending order.
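#As a concrete sketch of the table above (an addition, not from the original
#script): the classic libsvm 2.x Python wrapper also accepts sparse vectors as
#{index: value} dicts, which map directly onto the svm_node layout. Assuming
#that wrapper, the five-instance example could be fed in like this:
sparse_labels = [1, 2, 1, 2, 3]
sparse_samples = [
    {2: 0.1, 3: 0.2},
    {2: 0.1, 3: 0.3, 4: -1.2},
    {1: 0.4},                                    #zero attributes are simply left out
    {2: 0.1, 4: 1.4, 5: 0.5},
    {1: -0.1, 2: -0.2, 3: 0.1, 4: 1.1, 5: 0.1},
]
#the wrapper appends the terminating index -1 node for us
sparse_problem = svm_problem(sparse_labels, sparse_samples)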
size = len(samples)
param = svm_parameter(C = 10, nr_weight = 2, weight_label = [1, 0], weight = [10, 1])
#The struct svm_parameter describes the parameters of an SVM:
#struct svm_parameter
#{
# int svm_type;
#
#svm_type can be one of C_SVC, NU_SVC, ONE_CLASS, EPSILON_SVR, or NU_SVR.
#C_SVC: C-SVM classification
#NU_SVC: nu-SVM classification
#ONE_CLASS: one-class SVM
#EPSILON_SVR: epsilon-SVM regression
#NU_SVR: nu-SVM regression
#
# int kernel_type;
#
#kernel_type can be one of LINEAR, POLY, RBF, SIGMOID, or PRECOMPUTED.
#LINEAR: u'*v
#POLY: (gamma*u'*v + coef0)^degree
#RBF: exp(-gamma*|u-v|^2)
#SIGMOID: tanh(gamma*u'*v + coef0)
#PRECOMPUTED: kernel values taken from the training-set file
#
# int degree; /* for poly */
# double gamma; /* for poly/rbf/sigmoid */
# double coef0; /* for poly/sigmoid */
#
# /* these are for training only */
# double cache_size; /* in MB */
#cache_size is the size of the kernel cache in megabytes.
#C is the cost of constraint violation (the penalty term).
#eps is the stopping criterion (typically we use 0.00001 for nu-SVC and 0.001 for the other models).
#nu is the parameter of nu-SVM, nu-SVR, and one-class SVM.
#p is the epsilon of the epsilon-insensitive loss function in epsilon-SVR.
#shrinking = 1 turns the shrinking heuristics on, 0 turns them off.
#probability = 1 builds a model with probability information, 0 does not.
#
# double eps; /* stopping criteria */
#
# double C; /* for C_SVC, EPSILON_SVR, and NU_SVR */
#
# int nr_weight; /* for C_SVC */
# int *weight_label; /* for C_SVC */
# double* weight; /* for C_SVC */
#nr_weight, weight_label, and weight are used to change the penalty for certain classes (set the weight to 1 for any class whose penalty you leave unchanged). This is especially useful when training with unbalanced input data or with asymmetric misclassification costs.
#
#nr_weight is the number of elements in the arrays weight_label and weight. Each weight[i] corresponds to weight_label[i], meaning the penalty of class weight_label[i] is scaled by the factor weight[i].
#
#If you do not want to change the penalty for any class, just set nr_weight to 0.
#
#Note: this knob can be used to trade off recall against precision.
#
# double nu; /* for NU_SVC, ONE_CLASS, and NU_SVR */
# double p; /* for EPSILON_SVR */
# int shrinking; /* use the shrinking heuristics */
# int probability; /* do probability estimates */
#};
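#To ground the kernel formulas above, here is a quick plain-Python sanity
#check (an added sketch, no libsvm needed; gamma, coef0, and degree are
#arbitrary illustrative values, and its output is not part of the program
#output recorded below):
import math

def dot(u, v):
    return sum(a * b for a, b in zip(u, v))

def linear_k(u, v):
    return dot(u, v)                                  #u'*v

def poly_k(u, v, gamma=0.5, coef0=1.0, degree=3):
    return (gamma * dot(u, v) + coef0) ** degree      #(gamma*u'*v + coef0)^degree

def rbf_k(u, v, gamma=0.5):
    return math.exp(-gamma * sum((a - b) ** 2 for a, b in zip(u, v)))  #exp(-gamma*|u-v|^2)

def sigmoid_k(u, v, gamma=0.5, coef0=0.0):
    return math.tanh(gamma * dot(u, v) + coef0)       #tanh(gamma*u'*v + coef0)

u, v = [1.0, 2.0], [0.5, -1.0]
for kn, kf in [('linear', linear_k), ('poly', poly_k), ('rbf', rbf_k), ('sigmoid', sigmoid_k)]:
    print "%-8s K(u, v) = %.6f" % (kn, kf(u, v))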
#
#
#
#*NOTE* Because svm_model contains pointers into svm_problem, do not free the memory of svm_problem while you still intend to use the svm_model produced by svm_train().
#
#*NOTE* To guard against bad parameters, svm_check_parameter() should be called before svm_train().
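#Pulling the knobs above together, a fully spelled-out parameter object might
#look like this (an added sketch; the keyword names mirror the svm_parameter
#struct fields and the values are purely illustrative, not recommendations).
#With the old wrapper, svm_check_parameter() is assumed to be called for us
#inside svm_model(), which raises ValueError on bad settings:
full_param = svm_parameter(
    svm_type = C_SVC,
    kernel_type = RBF,
    gamma = 0.5,            #for poly/rbf/sigmoid
    C = 10,                 #cost of constraint violation
    cache_size = 40,        #kernel cache, in MB
    eps = 0.001,            #stopping criterion
    shrinking = 1,          #use the shrinking heuristics
    probability = 0,        #no probability model
    nr_weight = 2,          #scale the penalty of class 1 by 10x relative to class 0
    weight_label = [1, 0],
    weight = [10, 1])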
#These are a few classic kernels.
#Some sources say that for high-dimensional data such as text, a plain linear kernel is both fast and accurate.
kernels = [LINEAR, POLY, RBF]
kname = ['linear', 'polynomial', 'rbf']
for name, k in zip(kname, kernels):
    print "---"*10, "training:", name, "---"*10
    param.kernel_type = k
    #this call builds and returns an SVM model from the given parameters and training data
    model = svm_model(problem, param)
    errors = 0
    print "=== classification ==="
    for i in range(size):
        thing = samples[i]
        #OK, this is the actual classification
        prediction = model.predict(thing)
        print "%s -> %s" % (thing, prediction)
        if labels[i] != prediction:
            errors = errors + 1
    print "kernel %s: error rate = %d / %d" % (name, errors, size)
    print ">>>"*10
print "---"*10, "decision values:", name, "---"*10
param = svm_parameter(kernel_type = RBF, C=10)
model = svm_model(problem, param)
print "%s" % (samples[1])
print "number of classes:", model.get_nr_class()
#Internally, libsvm first fits a model on the training set, which amounts to a separating hyperplane in the (high-dimensional) mapped feature space. A test point x is then scored with f(x) = w*x + b: if f(x) > 0 it is classified as positive, otherwise as negative. So every input gets an f(x) value. For a k-class problem libsvm trains k*(k-1)/2 one-vs-one binary classifiers, and predict_values returns one such decision value for each pair of classes.
d = model.predict_values(samples[1])
for i in model.get_labels():
    for j in model.get_labels():
        if j > i:
            print "{%d, %d} = %9.5f" % (i, j, d[i, j])
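#How do these pairwise values become a class prediction? A minimal one-vs-one
#voting sketch (an addition, assuming libsvm's convention that d[i, j] > 0 is
#a vote for class i, which is what svm_predict does internally):
def vote(m, x):
    d = m.predict_values(x)
    labs = m.get_labels()
    votes = dict((l, 0) for l in labs)
    for a in labs:
        for b in labs:
            if b > a:
                if d[a, b] > 0:
                    votes[a] += 1   #the (a vs b) classifier sides with a
                else:
                    votes[b] += 1
    #the class with the most votes wins
    return max(votes, key=votes.get)
#e.g. vote(model, samples[1]) gives 1 here, matching model.predict above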
print "---"*10, "probability estimates:", name, "---"*10
param = svm_parameter(kernel_type = RBF, C=10, probability = 1)
model = svm_model(problem, param)
pred_label, pred_probability = model.predict_probability(samples[1])
print "%s" % (samples[1])
print "predicted class: %d" % (pred_label)
for i in model.get_labels():
    print "class %d probability %f" % (i, pred_probability[i])
print "total probability = %s" % sum(pred_probability.itervalues())
#PRECOMPUTED: kernel values in training_set_file
#i.e. the kernel is read in precomputed, as from a training-set file
print "---"*10, "precomputed kernel:", name, "---"*10
#each row is [serial number, K(x_i, x_1), ..., K(x_i, x_4)]
samples = [[1, 0, 0, 0, 0], [2, 0, 1, 0, 1], [3, 0, 0, 1, 1], [4, 0, 1, 1, 2]]
problem = svm_problem(labels, samples)
param = svm_parameter(kernel_type = PRECOMPUTED, C = 10, nr_weight = 2, weight_label = [1, 0], weight = [10, 1])
model = svm_model(problem, param)
pred_label = model.predict(samples[0])
print "%s %s" % (samples[0], pred_label)
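#Where do those kernel rows come from? They are exactly the linear-kernel
#values of the original 2-D samples. An added cross-check (not part of the
#original run):
orig_samples = [[0, 0], [0, 1], [1, 0], [1, 1]]
precomputed = []
for n, xi in enumerate(orig_samples):
    #row n: serial number first, then K(x_i, x_j) = x_i . x_j for every j
    precomputed.append([n + 1] + [sum(a * b for a, b in zip(xi, xj)) for xj in orig_samples])
assert precomputed == [[1, 0, 0, 0, 0], [2, 0, 1, 0, 1], [3, 0, 0, 1, 1], [4, 0, 1, 1, 2]]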
"""
Classification with libsvm depends on many parameters besides kernel_type. You can tune them
with the easy.py tool shipped with libsvm; the meaning of each parameter is documented in the
README inside the package, and also in http://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf.
This introduction is also quite good: http://ntu.csie.org/~piaip/svm/svm_tutorial.html
There are also quite a few good articles at
http://huangbo929.blog.edu.cn/sort/?act=search&keyword=SVM
for example:
http://huangbo929.blog.edu.cn/2008/64686.html
An introduction to SVM string kernels (2008-07-03 09:38)
http://huangbo929.blog.edu.cn/2008/64689.html
How to use suffix-array-based string kernels with SVM (2008-07-03 10:07)
http://huangbo929.blog.edu.cn/2008/64688.html
The fastest SVM learning algorithm (2008-07-03 10:04)
http://huangbo929.blog.edu.cn/2008/64687.html
An introduction to tree-kernel SVMs (2008-07-03 09:41)
and so on.
"""
------------------------------------------------------------
Program output
------------------------------------------------------------
~/svm/libsvm_study/python $ python svm_test.py
------------------------------ training: linear ------------------------------
.....*..*
optimization finished, #iter = 22
obj = -3.999999, rho = -0.999349
nSV = 3, nBSV = 0
*
optimization finished, #iter = 1
nu = 0.100000
obj = -1.000000, rho = -1.000000
nSV = 2, nBSV = 0
*
optimization finished, #iter = 2
obj = -4.000000, rho = -3.000000
nSV = 3, nBSV = 0
Total nSV = 4
=== classification ===
[0, 0] -> 0.0
[0, 1] -> 1.0
[1, 0] -> 1.0
[1, 1] -> 2.0
kernel linear: error rate = 0 / 4
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
------------------------------ training: polynomial ------------------------------
*
optimization finished, #iter = 2
obj = -16.875000, rho = 0.375000
nSV = 3, nBSV = 1
*
optimization finished, #iter = 1
nu = 0.200000
obj = -2.000000, rho = -1.000000
nSV = 2, nBSV = 0
.*.*
optimization finished, #iter = 6
obj = -2.461538, rho = -1.153547
nSV = 3, nBSV = 0
Total nSV = 4
=== classification ===
[0, 0] -> 1.0
[0, 1] -> 1.0
[1, 0] -> 1.0
[1, 1] -> 2.0
kernel polynomial: error rate = 1 / 4
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
------------------------------ training: rbf ------------------------------
..*.*
optimization finished, #iter = 9
obj = -4.247381, rho = 0.671181
nSV = 3, nBSV = 0
*
optimization finished, #iter = 1
nu = 0.158198
obj = -1.581977, rho = 0.000000
nSV = 2, nBSV = 0
.*.*
optimization finished, #iter = 7
obj = -4.247381, rho = -0.671133
nSV = 3, nBSV = 0
Total nSV = 4
=== classification ===
[0, 0] -> 0.0
[0, 1] -> 1.0
[1, 0] -> 1.0
[1, 1] -> 2.0
kernel rbf: error rate = 0 / 4
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
------------------------------ decision values: rbf ------------------------------
..*.*
optimization finished, #iter = 9
nu = 0.283131
obj = -4.247381, rho = 0.671181
nSV = 3, nBSV = 0
*
optimization finished, #iter = 1
nu = 0.158198
obj = -1.581977, rho = 0.000000
nSV = 2, nBSV = 0
.*.*
optimization finished, #iter = 7
nu = 0.283090
obj = -4.247381, rho = -0.671133
nSV = 3, nBSV = 0
Total nSV = 4
[0, 1]
number of classes: 3
{0, 1} = -1.00013
{0, 2} = 0.00000
{1, 2} = 0.99936
------------------------------ probability estimates: rbf ------------------------------
.*.*
optimization finished, #iter = 7
nu = 0.283090
obj = -4.247381, rho = -0.671133
nSV = 3, nBSV = 0
Total nSV = 3
*
optimization finished, #iter = 1
nu = 0.254149
obj = -2.541494, rho = 0.000000
nSV = 2, nBSV = 0
Total nSV = 2
.*.*
optimization finished, #iter = 7
nu = 0.283090
obj = -4.247381, rho = -0.671133
nSV = 3, nBSV = 0
Total nSV = 3
*
optimization finished, #iter = 1
nu = 0.254149
obj = -2.541494, rho = 0.000000
nSV = 2, nBSV = 0
Total nSV = 2
..*.*
optimization finished, #iter = 9
nu = 0.283131
obj = -4.247381, rho = 0.671181
nSV = 3, nBSV = 0
*
optimization finished, #iter = 1
nu = 0.158198
obj = -1.581977, rho = 0.000000
nSV = 2, nBSV = 0
Total nSV = 2
*
optimization finished, #iter = 1
nu = 0.158198
obj = -1.581977, rho = 0.000000
nSV = 2, nBSV = 0
Total nSV = 2
*
optimization finished, #iter = 1
nu = 0.158198
obj = -1.581977, rho = 0.000000
nSV = 2, nBSV = 0
Total nSV = 2
*
optimization finished, #iter = 1
nu = 0.158198
obj = -1.581977, rho = 0.000000
nSV = 2, nBSV = 0
.*.*
optimization finished, #iter = 7
nu = 0.283090
obj = -4.247381, rho = -0.671133
nSV = 3, nBSV = 0
Total nSV = 3
*
optimization finished, #iter = 1
nu = 0.254149
obj = -2.541494, rho = 0.000000
nSV = 2, nBSV = 0
Total nSV = 2
.*.*
optimization finished, #iter = 7
nu = 0.283090
obj = -4.247381, rho = -0.671133
nSV = 3, nBSV = 0
Total nSV = 3
*
optimization finished, #iter = 1
nu = 0.254149
obj = -2.541494, rho = 0.000000
nSV = 2, nBSV = 0
Total nSV = 2
.*.*
optimization finished, #iter = 7
nu = 0.283090
obj = -4.247381, rho = -0.671133
nSV = 3, nBSV = 0
Total nSV = 4
[0, 1]
predicted class: 0
class 0 probability 0.400236
class 1 probability 0.199806
class 2 probability 0.399958
total probability = 1.0
------------------------------ precomputed kernel: rbf ------------------------------
.....*..*
optimization finished, #iter = 22
obj = -3.999999, rho = -0.999349
nSV = 3, nBSV = 0
*
optimization finished, #iter = 1
nu = 0.100000
obj = -1.000000, rho = -1.000000
nSV = 2, nBSV = 0
*
optimization finished, #iter = 2
obj = -4.000000, rho = -3.000000
nSV = 3, nBSV = 0
Total nSV = 4
[1, 0, 0, 0, 0] 0.0
zuroc@aragorn ~/svm/