感知机算法C++实现
2013-05-06 13:14 夜与周公 阅读(1450) 评论(0) 编辑 收藏 举报本节介绍感知机分类器的C++实现,其分类器实现框架在后续的一些算法实现中都会使用到。
1、类的设计
感知机类设计如下:
View Code
#pragma once

#include <iostream>
#include <fstream>
#include <iomanip>
#include <string>
#include <vector>
#include <set>
#include <map>
#include <algorithm>
#include <cmath>
#include <ctime>

using namespace std;  // NOTE(review): kept for compatibility with the existing .cpp files,
                      // although using-directives in headers are discouraged.

// Sparse representation of a feature vector: id_vec[i] holds the feature
// index and value_vec[i] the corresponding value (the two vectors are
// parallel and always the same length).
struct sparse_feat
{
    vector<int> id_vec;
    vector<float> value_vec;
};

// Multi-class perceptron classifier trained online with stochastic
// gradient descent.  The same framework (load/save model, online
// training loop) is reused by later classifiers; only the update rule
// and loss computation differ.
class Per
{
private:
    vector<sparse_feat> samp_feat_vec;  // training-set features (sparse)
    vector<int> samp_class_vec;         // training-set class labels
    int feat_set_size;                  // number of distinct features (incl. bias slot 0)
    int class_set_size;                 // number of distinct classes
    vector< vector<float> > omega;      // weight matrix: feat_set_size x class_set_size

public:
    Per();
    ~Per();
    // Persist / restore the weight matrix as whitespace-separated text.
    void save_model(string model_file);
    void load_model(string model_file);
    // Read the training file and derive feat_set_size / class_set_size.
    void load_training_file(string training_file);
    // Initialize all weights to 1/class_set_size.
    void init_omega();
    // Online SGD training loop (main entry point); returns 1 on completion.
    int train_online(int criter, int max_loop, double loss_thrd, float learn_rate, float lambda, int avg);
    // Per-class linear scores omega^T * x for one sample.
    vector<float> calc_score(sparse_feat &samp_feat);
    // Scores computed on the complemented feature values (1 - value).
    vector<float> calc_neg_score(sparse_feat &samp_feat);
    // Softmax-style conversion of scores to probabilities.
    vector<float> score_to_prb(vector<float> &score);
    // Arg-max class of a score vector.
    int score_to_class(vector<float> &score);
    // Classify a test file, write predictions, return accuracy.
    float classify_testing_file(string testing_file, string output_file, int output_format);

private:
    // Parse "label<TAB>id:value id:value ..." lines into sparse features.
    void read_samp_file(string samp_file, vector<sparse_feat> &samp_feat_vec, vector<int> &samp_class_vec);
    // Perceptron update rule for a single (misclassified) sample.
    void update_online_per(int samp_class, sparse_feat &samp_feat, float learn_rate);
    // Perceptron loss and accuracy over the whole training set.
    void calc_loss_per(double *loss, float *acc);
    float calc_acc(vector<int> &test_class_vec, vector<int> &pred_class_vec);
    // Build a "paired" feature vector with values mapped to 2 - value.
    sparse_feat product_pair_feature(const sparse_feat &samp_feat);
    vector<string> string_split(string terms_str, string spliting_tag);
};
本文采用稀疏表示的数据结构存储特征,用结构体sparse_feat表示特征,其中id_vec存储索引号的向量,value_vec存储特征值的向量,两者一一对应。目前主流的机器学习工具包均采用稀疏特征的存储方式。
2.实现代码:
View Code
1 /******************************************************************** 2 * Linear Discriminant Function Classifier V0.10 3 *********************************************************************/ 4 5 #include "Per.h" 6 7 Per::Per() 8 { 9 } 10 11 Per::~Per() 12 { 13 } 14 15 void Per::save_model(string model_file) 16 { 17 cout << "Saving model..." << endl; 18 ofstream fout(model_file.c_str()); 19 for (int k = 0; k < feat_set_size; k++) { 20 for (int j = 0; j < class_set_size; j++) { 21 fout << omega[k][j] << " "; 22 } 23 fout << endl; 24 } 25 fout.close(); 26 } 27 28 29 void Per::load_model(string model_file) 30 { 31 cout << "Loading model..." << endl; 32 omega.clear(); 33 ifstream fin(model_file.c_str()); 34 if(!fin) { 35 cerr << "Error opening file: " << model_file << endl; 36 exit(0); 37 } 38 string line_str; 39 while (getline(fin, line_str)) 40 { 41 vector<string> line_vec = string_split(line_str, " "); 42 vector<float> line_omega; 43 for (vector<string>::iterator it = line_vec.begin(); it != line_vec.end(); it++) 44 { 45 float weight = (float)atof(it->c_str()); 46 line_omega.push_back(weight); 47 } 48 omega.push_back(line_omega); 49 } 50 fin.close(); 51 feat_set_size = (int)omega.size(); 52 class_set_size = (int)omega[0].size(); 53 } 54 55 56 void Per::read_samp_file(string samp_file, vector<sparse_feat> &samp_feat_vec, vector<int> &samp_class_vec) { 57 ifstream fin(samp_file.c_str()); 58 if(!fin) 59 { 60 cerr << "Error opening file: " << samp_file << endl; 61 exit(0); 62 } 63 string line_str; 64 while (getline(fin, line_str)) 65 { 66 size_t class_pos = line_str.find_first_of("\t"); 67 int class_id = atoi(line_str.substr(0, class_pos).c_str()); 68 samp_class_vec.push_back(class_id); 69 string terms_str = line_str.substr(class_pos+1); 70 sparse_feat samp_feat; 71 samp_feat.id_vec.push_back(0); // bias 72 samp_feat.value_vec.push_back(1); // bias 73 if (terms_str != "") { 74 vector<string> fv_vec = string_split(terms_str, " "); 75 for (vector<string>::iterator it = 
fv_vec.begin(); it != fv_vec.end(); it++) 76 { 77 size_t feat_pos = it->find_first_of(":"); 78 int feat_id = atoi(it->substr(0, feat_pos).c_str()); 79 float feat_value = (float)atof(it->substr(feat_pos+1).c_str()); 80 samp_feat.id_vec.push_back(feat_id); 81 samp_feat.value_vec.push_back(feat_value); 82 } 83 } 84 samp_feat_vec.push_back(samp_feat); 85 } 86 fin.close(); 87 } 88 89 90 void Per::load_training_file(string training_file) 91 { 92 cout << "Loading training data..." << endl; 93 read_samp_file(training_file, samp_feat_vec, samp_class_vec); 94 feat_set_size = 0; 95 class_set_size = 0; 96 for (size_t i = 0; i < samp_class_vec.size(); i++) 97 { 98 if (samp_class_vec[i] > class_set_size) 99 { 100 class_set_size = samp_class_vec[i]; 101 } 102 if (samp_feat_vec[i].id_vec.back() > feat_set_size) { 103 feat_set_size = samp_feat_vec[i].id_vec.back(); 104 } 105 } 106 class_set_size += 1; 107 feat_set_size += 1; 108 } 109 110 void Per::init_omega() 111 { 112 //float int_value = 0.0; 113 float int_value = (float)1/class_set_size; 114 for (int i = 0; i < feat_set_size; i++) 115 { 116 vector<float> temp_vec(class_set_size, int_value); 117 omega.push_back(temp_vec); 118 } 119 } 120 121 // Stochastic Gradient Descent (SGD) optimization for the criteria of Perceptron (PER), Cross Entropy (CE), Leat Mean Square (LMS) 122 int Per::train_online(int criter, int max_loop, double loss_thrd, float learn_rate, float lambda, int avg) 123 { 124 int id = 0; 125 double loss = 0.0; 126 double loss_pre = 0.0; 127 vector< vector<float> > omega_sum(omega); 128 while (id <= max_loop*(int)samp_class_vec.size()) 129 { 130 // check loss value 131 if (id%samp_class_vec.size() == 0) { 132 int loop = id/(int)samp_class_vec.size(); 133 double loss = 0.0; 134 float acc = 0.0; 135 136 calc_loss_per(&loss, &acc); 137 138 cout.setf(ios::left); 139 cout << "Iter: " << setw(8) << loop << "Loss: " << setw(18) << loss << "Acc: " << setw(8) << acc << endl; 140 if ((loss_pre - loss) < loss_thrd && loss_pre 
>= loss && id != 0) 141 { 142 cout << "Reaching the minimal loss value decrease!" << endl; 143 break; 144 } 145 loss_pre = loss; 146 } 147 // update omega 148 int r = (int)(rand()%samp_class_vec.size()); 149 //int r = (int)i%samp_class_vec.size(); 150 sparse_feat samp_feat = samp_feat_vec[r]; 151 int samp_class = samp_class_vec[r]; 152 update_online_per(samp_class, samp_feat, learn_rate); 153 if (avg == 1 && id%samp_class_vec.size() == 0) 154 { 155 for (int i = 0; i < feat_set_size; i++) 156 { 157 for (int j = 0; j < class_set_size; j++) 158 { 159 omega_sum[i][j] += omega[i][j]; 160 } 161 } 162 } 163 id++; 164 } 165 if (avg == 1) { 166 for (int i = 0; i < feat_set_size; i++) 167 { 168 for (int j = 0; j < class_set_size; j++) 169 { 170 omega[i][j] = (float)omega_sum[i][j] / id; 171 } 172 } 173 } 174 return 1; 175 } 176 177 void Per::update_online_per(int samp_class, sparse_feat &samp_feat, float learn_rate) 178 { 179 vector<float> score = calc_score(samp_feat); 180 int pred_class = score_to_class(score); 181 if (samp_class != pred_class) 182 { 183 for (size_t k = 0; k < samp_feat.id_vec.size(); k++) 184 { 185 int feat_id = samp_feat.id_vec[k]; 186 float feat_value = samp_feat.value_vec[k]; 187 omega[feat_id][pred_class] -= learn_rate * feat_value; 188 omega[feat_id][samp_class] += learn_rate * feat_value; 189 } 190 } 191 } 192 193 194 195 void Per::calc_loss_per(double *loss, float *acc) 196 { 197 double loss_value = 0.0; 198 int err_num = 0; 199 for (size_t i = 0; i < samp_class_vec.size(); i++) 200 { 201 int samp_class = samp_class_vec[i]; 202 sparse_feat samp_feat = samp_feat_vec[i]; 203 vector<float> score = calc_score(samp_feat); 204 int pred_class = score_to_class(score); 205 if (pred_class != samp_class) 206 { 207 err_num++; 208 loss_value += (score[pred_class] - score[samp_class]); 209 } 210 } 211 *acc = 1 - (float)err_num / samp_class_vec.size(); 212 *loss = loss_value / samp_class_vec.size(); 213 } 214 215 216 217 vector<float> Per::calc_score(sparse_feat 
&samp_feat) 218 { 219 vector<float> score(class_set_size, 0); 220 for (int j = 0; j < class_set_size; j++) 221 { 222 for (size_t k = 0; k < samp_feat.id_vec.size(); k++) 223 { 224 int feat_id = samp_feat.id_vec[k]; 225 float feat_value = samp_feat.value_vec[k]; 226 score[j] += omega[feat_id][j] * feat_value; 227 } 228 } 229 return score; 230 } 231 232 vector<float> Per::calc_neg_score(sparse_feat &samp_feat) 233 { 234 vector<float> score(class_set_size, 0); 235 for (int j = 0; j < class_set_size; j++) 236 { 237 for (size_t k = 0; k < samp_feat.id_vec.size(); k++) 238 { 239 int feat_id = samp_feat.id_vec[k]; 240 float feat_value = 1-samp_feat.value_vec[k]; 241 score[j] += omega[feat_id][j] * feat_value; 242 } 243 } 244 return score; 245 246 } 247 vector<float> Per::score_to_prb(vector<float> &score) 248 { 249 vector<float> prb(class_set_size, 0); 250 for (int i = 0; i < class_set_size; i++) 251 { 252 float delta_prb_sum = 0.0; 253 for (int j = 0; j < class_set_size; j++) 254 { 255 delta_prb_sum += exp(score[j] - score[i]); 256 } 257 prb[i] = 1 / delta_prb_sum; 258 } 259 return prb; 260 } 261 262 int Per::score_to_class(vector<float> &score) 263 { 264 int pred_class = 0; 265 float max_score = score[0]; 266 for (int j = 1; j < class_set_size; j++) 267 { 268 if (score[j] > max_score) { 269 max_score = score[j]; 270 pred_class = j; 271 } 272 } 273 return pred_class; 274 } 275 276 float Per::classify_testing_file(string testing_file, string output_file, int output_format) 277 { 278 cout << "Pair Classifying testing file..." 
<< endl; 279 vector<sparse_feat> test_feat_vec; 280 vector<int> test_class_vec; 281 vector<int> pair_test_class_vec; 282 vector<int> pred_class_vec; 283 vector<int> pair_pred_class_vec; 284 read_samp_file(testing_file, test_feat_vec, test_class_vec); 285 ofstream fout(output_file.c_str()); 286 for (size_t i = 0; i < test_class_vec.size(); i++) 287 { 288 int samp_class = test_class_vec[i]; 289 pair_test_class_vec.push_back(1-test_class_vec[i]); 290 291 sparse_feat samp_feat = test_feat_vec[i]; 292 sparse_feat pair_samp_feat=product_pair_feature(test_feat_vec[i]); 293 294 vector<float> pred_score = calc_score(samp_feat); 295 int pred_class = score_to_class(pred_score); 296 297 vector<float> pair_pred_score=calc_score(pair_samp_feat); 298 int pair_pred_class=score_to_class(pair_pred_score); 299 300 pred_class_vec.push_back(pred_class); 301 302 303 if (pred_class==pair_pred_class) 304 { 305 vector<float> pred_prb = score_to_prb(pred_score); 306 vector<float> pair_pred_prb=score_to_prb(pair_pred_score); 307 float max_score=*(max_element(pred_prb.begin(),pred_prb.end())); 308 float pair_max_score=*(max_element(pair_pred_prb.begin(),pair_pred_prb.end())); 309 if (pair_max_score-max_score>0.2) 310 pair_pred_class=1-pair_pred_class; 311 } 312 313 pair_pred_class_vec.push_back(pair_pred_class); 314 315 fout << pred_class << "\t"; 316 if (output_format == 1) { 317 for (int j = 0; j < class_set_size; j++) { 318 fout << j << ":" << pred_score[j] << ' '; 319 } 320 } 321 else if (output_format == 2) 322 { 323 vector<float> pred_prb = score_to_prb(pred_score); 324 for (int j = 0; j < class_set_size; j++) 325 { 326 fout << j << ":" << pred_prb[j] << ' '; 327 } 328 } 329 fout << endl; 330 } 331 fout.close(); 332 float acc = calc_acc(test_class_vec, pred_class_vec); 333 float pair_acc=calc_acc(pair_test_class_vec,pair_pred_class_vec); 334 cout<<"Orgional acc is :"<<acc<<endl; 335 cout<<"Pair acc is :"<<pair_acc<<endl; 336 return acc; 337 } 338 339 float Per::calc_acc(vector<int> 
&test_class_vec, vector<int> &pred_class_vec) 340 { 341 size_t len = test_class_vec.size(); 342 if (len != pred_class_vec.size()) 343 { 344 cerr << "Error: two vectors should have the same lenght." << endl; 345 exit(0); 346 } 347 int err_num = 0; 348 for (size_t id = 0; id != len; id++) 349 { 350 if (test_class_vec[id] != pred_class_vec[id]) 351 { 352 err_num++; 353 } 354 } 355 return 1 - ((float)err_num) / len; 356 } 357 358 359 vector<string> Per::string_split(string terms_str, string spliting_tag) 360 { 361 vector<string> feat_vec; 362 size_t term_beg_pos = 0; 363 size_t term_end_pos = 0; 364 while ((term_end_pos = terms_str.find_first_of(spliting_tag, term_beg_pos)) != string::npos) 365 { 366 if (term_end_pos > term_beg_pos) 367 { 368 string term_str = terms_str.substr(term_beg_pos, term_end_pos - term_beg_pos); 369 feat_vec.push_back(term_str); 370 } 371 term_beg_pos = term_end_pos + 1; 372 } 373 if (term_beg_pos < terms_str.size()) 374 { 375 string end_str = terms_str.substr(term_beg_pos); 376 feat_vec.push_back(end_str); 377 } 378 return feat_vec; 379 } 380 381 sparse_feat Per::product_pair_feature(const sparse_feat &samp_feat) 382 { 383 sparse_feat pair_sparse_feat; 384 for (int i=0;i!=samp_feat.id_vec.size();i++) 385 { 386 int feat_id=samp_feat.id_vec[i]; 387 pair_sparse_feat.id_vec.push_back(feat_id); 388 float feat_value=samp_feat.value_vec[i]; 389 pair_sparse_feat.value_vec.push_back(2-feat_value); 390 391 } 392 393 return pair_sparse_feat; 394 395 }
3.代码说明
上述算法是后续代码的框架,比如load_model(), save_model(), update_online()等都是通用的,只是在不同的分类器中,迭代策略与损失函数不一样而已(算法关键)。在感知机分类器中,迭代策略使用update_online_per()函数,损失函数计算使用calc_loss_per()函数。因此,你可以根据自己的需求,在上述算法框架上增加不同的准则策略。
4.模型的训练与预测
感知机模型训练主函数Demo
View Code
View Code #include <cstdlib> #include <iostream> #include <cstring> #include "Per.h" using namespace std; void print_help() { cout << "\nPer training module, " << VERSION << ", " << VERSION_DATE << "\n\n" << "usage: per_train [options] training_file model_file [pre_model_file]\n\n" << "options: -h -> help\n" << " -n int -> maximal iteration loops (default 200)\n" << " -m double -> minimal loss value decrease (default 1e-03)\n" << " -r double -> regularization parameter lambda of gaussian prior (default 0)\n" << " -l float -> learning rate (default 1.0)\n" << " -a -> 0: final weight (default)\n" << " -> 1: average weights of all iteration loops\n" << " -u [0,1] -> 0: initial training model (default)\n" << " -> 1: updating model (pre_model_file is needed)\n" << endl; } void read_parameters(int argc, char *argv[], char *training_file, char *model_file, int *criter, int *max_loop, double *loss_thrd, float *learn_rate, float *lambda, int *avg, int *update, char *pre_model_file) { // set default options *criter = 0; *max_loop = 200; *loss_thrd = 1e-3; *learn_rate = 1.0; *lambda = 0.0; *avg = 0; *update = 0; int i; for (i = 1; (i<argc) && (argv[i])[0]=='-'; i++) { switch ((argv[i])[1]) { case 'h': print_help(); exit(0); case 'c': *criter = atoi(argv[++i]); break; case 'n': *max_loop = atoi(argv[++i]); break; case 'm': *loss_thrd = atof(argv[++i]); break; case 'l': *learn_rate = (float)atof(argv[++i]); break; case 'r': *lambda = (float)atof(argv[++i]); break; case 'a': *avg = atoi(argv[++i]); break; case 'u': *update = atoi(argv[++i]); break; default: cout << "Unrecognized option: " << argv[i] << "!" << endl; print_help(); exit(0); } } if ((i+1)>=argc) { cout << "Not enough parameters!" << endl; print_help(); exit(0); } strcpy (training_file, argv[i]); strcpy (model_file, argv[i+1]); if (*update) { if ((i+2)>=argc) { cout << "Previous model file is needed in update mode!" 
<< endl; print_help(); exit(0); } strcpy (pre_model_file, argv[i+2]); } } int per_train(int argc, char *argv[]) { char training_file[200]; char model_file[200]; int criter; int max_loop; double loss_thrd; float learn_rate; float lambda; int avg; int update; char pre_model_file[200]; read_parameters(argc, argv, training_file, model_file, &criter, &max_loop, &loss_thrd, &learn_rate, &lambda, &avg, &update, pre_model_file); Per per; per.load_training_file(training_file); if (update) { per.load_model(pre_model_file); } else { per.init_omega(); } per.train_online(criter, max_loop, loss_thrd, learn_rate, lambda, avg); per.save_model(model_file); return 0; } int main(int argc, char *argv[]) { return per_train(argc, argv); }
感知机模型预测主函数Demo
View Code
View Code View Code #include <cstdlib> #include <iostream> #include <cstring> #include "Per.h" using namespace std; void print_help() { cout << "\nOpenPR-LDF classification module, " << VERSION << ", " << VERSION_DATE << "\n\n" << "usage: ldf_classify [options] testing_file model_file output_file\n\n" << "options: -h -> help\n" << " -f [0..2] -> 0: only output class label (default)\n" << " -> 1: output class label with log-likelihood (weighted sum)\n" << " -> 2: output class label with soft probability\n" << endl; } void read_parameters(int argc, char *argv[], char *testing_file, char *model_file, char *output_file, int *output_format) { // set default options *output_format = 0; int i; for (i = 1; (i<argc) && (argv[i])[0]=='-'; i++) { switch ((argv[i])[1]) { case 'h': print_help(); exit(0); case 'f': *output_format = atoi(argv[++i]); break; default: cout << "Unrecognized option: " << argv[i] << "!" << endl; print_help(); exit(0); } } if ((i+2)>=argc) { cout << "Not enough parameters!" << endl; print_help(); exit(0); } strcpy(testing_file, argv[i]); strcpy(model_file, argv[i+1]); strcpy(output_file, argv[i+2]); } int per_classify(int argc, char *argv[]) { char testing_file[200]; char model_file[200]; char output_file[200]; int output_format; read_parameters(argc, argv, testing_file, model_file, output_file, &output_format); Per per; per.load_model(model_file); float acc = per.classify_testing_file(testing_file, output_file, output_format); cout << "Accuracy: " << acc << endl; return 0; } int main(int argc, char *argv[]) { return per_classify(argc, argv); }