代码改变世界

感知机算法C++实现

2013-05-06 13:14  夜与周公  阅读(1450)  评论(0编辑  收藏  举报

  本节介绍感知机分类器的C++实现,其分类器实现框架在后续的一些算法实现中都会使用到。

  1、类的设计

  感知机类设计如下:

View Code
#pragma once
#include <iostream>
#include <fstream>
#include <iomanip>
#include <string>
#include <vector>
#include <set>
#include <map>
#include <algorithm>
#include <cmath>
#include <ctime>
using namespace std;

// Sparse feature representation: id_vec holds feature indices and
// value_vec holds the corresponding values (parallel vectors, one
// entry per active feature).  Original used Python-style '#' comments,
// which do not compile in C++.
struct sparse_feat
{
    std::vector<int> id_vec;       // feature indices (index 0 is the bias)
    std::vector<float> value_vec;  // feature values, aligned with id_vec
};

// Multi-class perceptron classifier over sparse features.
// Original declaration used Python-style '#' comments, which do not
// compile in C++; they are replaced with '//' comments here.
class Per
{
private:
    vector<sparse_feat> samp_feat_vec; // training-set features
    vector<int> samp_class_vec;        // training-set class labels
    int feat_set_size;     // number of distinct features (incl. bias at id 0)
    int class_set_size;    // number of distinct classes
    vector< vector<float> > omega;  // model weights: feat_set_size x class_set_size

public:
    Per();
    ~Per();
    void save_model(string model_file);
    void load_model(string model_file);
    void load_training_file(string training_file);
    void init_omega();

    // Online (SGD) update; main training entry point.
    int train_online(int criter, int max_loop, double loss_thrd, float learn_rate, float lambda, int avg);
    vector<float> calc_score(sparse_feat &samp_feat);
    vector<float> calc_neg_score(sparse_feat &samp_feat);
    vector<float> score_to_prb(vector<float> &score);
    int score_to_class(vector<float> &score);

    float classify_testing_file(string testing_file, string output_file, int output_format);

private:
    void read_samp_file(string samp_file, vector<sparse_feat> &samp_feat_vec, vector<int> &samp_class_vec);
    void update_online_per(int samp_class, sparse_feat &samp_feat, float learn_rate);  // perceptron update rule
    void calc_loss_per(double *loss, float *acc);                                      // perceptron loss computation
    float calc_acc(vector<int> &test_class_vec, vector<int> &pred_class_vec);
    sparse_feat product_pair_feature(const sparse_feat &samp_feat);

    vector<string> string_split(string terms_str, string spliting_tag);

};

 
  本文采用稀疏表示的数据结构存储特征,用结构体sparse_feat表示特征,其中id_vec存储索引号的向量,value_vec存储特征值的向量,两者一一对应。目前主流的机器学习工具包均采用稀疏特征的存储方式。

  2.实现代码:

View Code
  1 /********************************************************************
  2 * Linear Discriminant Function Classifier V0.10
  3 *********************************************************************/
  4 
  5 #include "Per.h"
  6 
  7 Per::Per()
  8 {
  9 }
 10 
 11 Per::~Per()
 12 {
 13 }
 14 
 15 void Per::save_model(string model_file)
 16 { 
 17     cout << "Saving model..." << endl;
 18     ofstream fout(model_file.c_str());
 19     for (int k = 0; k < feat_set_size; k++) {
 20         for (int j = 0; j < class_set_size; j++) {
 21             fout << omega[k][j] << " ";
 22         }
 23         fout << endl;
 24     }
 25     fout.close();
 26 }
 27 
 28 
 29 void Per::load_model(string model_file)
 30 {
 31     cout << "Loading model..." << endl;
 32     omega.clear();
 33     ifstream fin(model_file.c_str());
 34     if(!fin) {
 35         cerr << "Error opening file: " << model_file << endl;
 36         exit(0);
 37     }
 38     string line_str;
 39     while (getline(fin, line_str)) 
 40     {
 41         vector<string> line_vec = string_split(line_str, " ");
 42         vector<float>  line_omega;
 43         for (vector<string>::iterator it = line_vec.begin(); it != line_vec.end(); it++) 
 44         {
 45             float weight = (float)atof(it->c_str());
 46             line_omega.push_back(weight);
 47         }
 48         omega.push_back(line_omega);
 49     }
 50     fin.close();
 51     feat_set_size = (int)omega.size();
 52     class_set_size = (int)omega[0].size();
 53 }
 54 
 55 
 56 void Per::read_samp_file(string samp_file, vector<sparse_feat> &samp_feat_vec, vector<int> &samp_class_vec) {
 57     ifstream fin(samp_file.c_str());
 58     if(!fin) 
 59     {
 60         cerr << "Error opening file: " << samp_file << endl;
 61         exit(0);
 62     }
 63     string line_str;
 64     while (getline(fin, line_str)) 
 65     {
 66         size_t class_pos = line_str.find_first_of("\t");
 67         int class_id = atoi(line_str.substr(0, class_pos).c_str());
 68         samp_class_vec.push_back(class_id);
 69         string terms_str = line_str.substr(class_pos+1);
 70         sparse_feat samp_feat;
 71         samp_feat.id_vec.push_back(0); // bias
 72         samp_feat.value_vec.push_back(1); // bias
 73         if (terms_str != "") {
 74             vector<string> fv_vec = string_split(terms_str, " ");
 75             for (vector<string>::iterator it = fv_vec.begin(); it != fv_vec.end(); it++)
 76             {
 77                 size_t feat_pos = it->find_first_of(":");
 78                 int feat_id = atoi(it->substr(0, feat_pos).c_str());
 79                 float feat_value = (float)atof(it->substr(feat_pos+1).c_str());
 80                 samp_feat.id_vec.push_back(feat_id);
 81                 samp_feat.value_vec.push_back(feat_value);
 82             }
 83         }
 84         samp_feat_vec.push_back(samp_feat);
 85     }
 86     fin.close();
 87 }
 88 
 89 
 90 void Per::load_training_file(string training_file)
 91 {
 92     cout << "Loading training data..." << endl;
 93     read_samp_file(training_file, samp_feat_vec, samp_class_vec);
 94     feat_set_size = 0;
 95     class_set_size = 0;
 96     for (size_t i = 0; i < samp_class_vec.size(); i++) 
 97     {
 98         if (samp_class_vec[i] > class_set_size) 
 99         {
100             class_set_size = samp_class_vec[i];
101         }
102         if (samp_feat_vec[i].id_vec.back() > feat_set_size) {
103             feat_set_size = samp_feat_vec[i].id_vec.back();
104         }    
105     }
106     class_set_size += 1;
107     feat_set_size += 1;
108 }
109 
110 void Per::init_omega()
111 {
112     //float int_value = 0.0;
113     float int_value = (float)1/class_set_size;
114     for (int i = 0; i < feat_set_size; i++)
115     {
116         vector<float> temp_vec(class_set_size, int_value);
117         omega.push_back(temp_vec);
118     }
119 }
120 
121 // Stochastic Gradient Descent (SGD) optimization for the criteria of Perceptron (PER), Cross Entropy (CE), Leat Mean Square (LMS)
122 int Per::train_online(int criter, int max_loop, double loss_thrd, float learn_rate, float lambda, int avg)
123 {
124     int id = 0;
125     double loss = 0.0;
126     double loss_pre = 0.0;
127     vector< vector<float> > omega_sum(omega);
128     while (id <= max_loop*(int)samp_class_vec.size()) 
129     {
130         // check loss value
131         if (id%samp_class_vec.size() == 0) {
132             int loop = id/(int)samp_class_vec.size();
133             double loss = 0.0;
134             float acc = 0.0;
135         
136             calc_loss_per(&loss, &acc);
137         
138             cout.setf(ios::left);
139             cout << "Iter: " << setw(8) << loop << "Loss: " << setw(18) << loss << "Acc: " << setw(8) << acc << endl;
140             if ((loss_pre - loss) < loss_thrd && loss_pre >= loss && id != 0)
141             {
142                 cout << "Reaching the minimal loss value decrease!" << endl;
143                 break;
144             }
145             loss_pre = loss;
146         }
147         // update omega
148         int r = (int)(rand()%samp_class_vec.size());
149         //int r = (int)i%samp_class_vec.size();
150         sparse_feat samp_feat = samp_feat_vec[r];
151         int samp_class = samp_class_vec[r];
152         update_online_per(samp_class, samp_feat, learn_rate);
153         if (avg == 1 && id%samp_class_vec.size() == 0) 
154         {
155             for (int i = 0; i < feat_set_size; i++)
156             {
157                 for (int j = 0; j < class_set_size; j++)
158                 {
159                     omega_sum[i][j] += omega[i][j];
160                 }
161             }            
162         }
163         id++;
164     }
165     if (avg == 1) {
166         for (int i = 0; i < feat_set_size; i++) 
167         {
168             for (int j = 0; j < class_set_size; j++)
169             {
170                 omega[i][j] = (float)omega_sum[i][j] / id;
171             }
172         }        
173     }
174     return 1;
175 }
176 
177 void Per::update_online_per(int samp_class, sparse_feat &samp_feat, float learn_rate)
178 {
179     vector<float> score = calc_score(samp_feat);
180     int pred_class = score_to_class(score);
181     if (samp_class != pred_class) 
182     {          
183         for (size_t k = 0; k < samp_feat.id_vec.size(); k++) 
184         {
185             int feat_id = samp_feat.id_vec[k];
186             float feat_value = samp_feat.value_vec[k];
187             omega[feat_id][pred_class] -= learn_rate * feat_value;
188             omega[feat_id][samp_class] += learn_rate * feat_value;
189         }
190     }
191 }
192 
193 
194 
195 void Per::calc_loss_per(double *loss, float *acc)
196 {
197     double loss_value = 0.0;
198     int err_num = 0;
199     for (size_t i = 0; i < samp_class_vec.size(); i++)
200     {
201         int samp_class = samp_class_vec[i];
202         sparse_feat samp_feat = samp_feat_vec[i];
203         vector<float> score = calc_score(samp_feat);
204         int pred_class = score_to_class(score);
205         if (pred_class != samp_class) 
206         {
207             err_num++;
208             loss_value += (score[pred_class] - score[samp_class]);
209         }
210     }
211     *acc = 1 - (float)err_num / samp_class_vec.size();
212     *loss = loss_value / samp_class_vec.size();
213 }
214 
215 
216 
217 vector<float> Per::calc_score(sparse_feat &samp_feat)
218 {
219     vector<float> score(class_set_size, 0);
220     for (int j = 0; j < class_set_size; j++)
221     {
222         for (size_t k = 0; k < samp_feat.id_vec.size(); k++)
223         {
224             int feat_id = samp_feat.id_vec[k];
225             float feat_value = samp_feat.value_vec[k];
226             score[j] += omega[feat_id][j] * feat_value;
227         }
228     }
229     return score;
230 }
231 
232 vector<float> Per::calc_neg_score(sparse_feat &samp_feat)
233 {
234     vector<float> score(class_set_size, 0);
235     for (int j = 0; j < class_set_size; j++) 
236     {
237         for (size_t k = 0; k < samp_feat.id_vec.size(); k++)
238         {
239             int feat_id = samp_feat.id_vec[k];
240             float feat_value = 1-samp_feat.value_vec[k];
241             score[j] += omega[feat_id][j] * feat_value;
242         }
243     }
244     return score;
245 
246 }
247 vector<float> Per::score_to_prb(vector<float> &score)
248 {   
249     vector<float> prb(class_set_size, 0);
250     for (int i = 0; i < class_set_size; i++)
251     {
252         float delta_prb_sum = 0.0;
253         for (int j = 0; j < class_set_size; j++)
254         {
255             delta_prb_sum += exp(score[j] - score[i]);
256         }
257         prb[i] = 1 / delta_prb_sum;
258     }
259     return prb;
260 }
261 
262 int Per::score_to_class(vector<float> &score)
263 {
264     int pred_class = 0;    
265     float max_score = score[0];
266     for (int j = 1; j < class_set_size; j++)
267     {
268         if (score[j] > max_score) {
269             max_score = score[j];
270             pred_class = j;
271         }
272     }
273     return pred_class;
274 }
275 
276 float Per::classify_testing_file(string testing_file, string output_file, int output_format)
277 {
278     cout << "Pair Classifying testing file..." << endl;
279     vector<sparse_feat> test_feat_vec;
280     vector<int> test_class_vec;
281     vector<int> pair_test_class_vec;
282     vector<int> pred_class_vec;
283     vector<int> pair_pred_class_vec;
284     read_samp_file(testing_file, test_feat_vec, test_class_vec);
285     ofstream fout(output_file.c_str());
286     for (size_t i = 0; i < test_class_vec.size(); i++) 
287     {
288         int samp_class = test_class_vec[i];
289         pair_test_class_vec.push_back(1-test_class_vec[i]);
290 
291         sparse_feat samp_feat = test_feat_vec[i];
292         sparse_feat pair_samp_feat=product_pair_feature(test_feat_vec[i]);
293 
294         vector<float> pred_score = calc_score(samp_feat);            
295         int pred_class = score_to_class(pred_score);
296 
297         vector<float> pair_pred_score=calc_score(pair_samp_feat);
298         int pair_pred_class=score_to_class(pair_pred_score);
299 
300         pred_class_vec.push_back(pred_class);
301         
302 
303         if (pred_class==pair_pred_class)
304         {
305             vector<float> pred_prb = score_to_prb(pred_score);
306             vector<float> pair_pred_prb=score_to_prb(pair_pred_score);
307             float max_score=*(max_element(pred_prb.begin(),pred_prb.end()));
308             float pair_max_score=*(max_element(pair_pred_prb.begin(),pair_pred_prb.end()));
309             if (pair_max_score-max_score>0.2)
310                 pair_pred_class=1-pair_pred_class;
311         }
312 
313         pair_pred_class_vec.push_back(pair_pred_class);
314 
315         fout << pred_class << "\t";
316         if (output_format == 1) {
317             for (int j = 0; j < class_set_size; j++) {
318                 fout << j << ":" << pred_score[j] << ' '; 
319             }        
320         }
321         else if (output_format == 2)
322         {
323             vector<float> pred_prb = score_to_prb(pred_score);
324             for (int j = 0; j < class_set_size; j++)
325             {
326                 fout << j << ":" << pred_prb[j] << ' '; 
327             }
328         }
329         fout << endl;        
330     }
331     fout.close();
332     float acc = calc_acc(test_class_vec, pred_class_vec);
333     float pair_acc=calc_acc(pair_test_class_vec,pair_pred_class_vec);
334     cout<<"Orgional acc is :"<<acc<<endl;
335     cout<<"Pair acc is :"<<pair_acc<<endl;
336     return acc;
337 }
338 
339 float Per::calc_acc(vector<int> &test_class_vec, vector<int> &pred_class_vec)
340 {
341     size_t len = test_class_vec.size();
342     if (len != pred_class_vec.size()) 
343     {
344         cerr << "Error: two vectors should have the same lenght." << endl;
345         exit(0);
346     }
347     int err_num = 0;
348     for (size_t id = 0; id != len; id++)
349     {
350         if (test_class_vec[id] != pred_class_vec[id])
351         {
352             err_num++;
353         }
354     }
355     return 1 - ((float)err_num) / len;
356 }
357 
358 
359 vector<string> Per::string_split(string terms_str, string spliting_tag)
360 {
361     vector<string> feat_vec;
362     size_t term_beg_pos = 0;
363     size_t term_end_pos = 0;
364     while ((term_end_pos = terms_str.find_first_of(spliting_tag, term_beg_pos)) != string::npos) 
365     {
366         if (term_end_pos > term_beg_pos) 
367         {
368             string term_str = terms_str.substr(term_beg_pos, term_end_pos - term_beg_pos);
369             feat_vec.push_back(term_str);
370         }
371         term_beg_pos = term_end_pos + 1;
372     }
373     if (term_beg_pos < terms_str.size()) 
374     {
375         string end_str = terms_str.substr(term_beg_pos);
376         feat_vec.push_back(end_str);
377     }
378     return feat_vec;
379 }
380 
381 sparse_feat Per::product_pair_feature(const sparse_feat &samp_feat)
382 {
383     sparse_feat pair_sparse_feat;
384     for (int i=0;i!=samp_feat.id_vec.size();i++)
385     {
386         int feat_id=samp_feat.id_vec[i];
387         pair_sparse_feat.id_vec.push_back(feat_id);
388         float feat_value=samp_feat.value_vec[i];
389         pair_sparse_feat.value_vec.push_back(2-feat_value);
390 
391     }
392 
393     return pair_sparse_feat;
394 
395 }

  3.代码说明
  上述算法是后续代码的框架,比如load_model(), save_model(), update_online()等都是通用的,只是在不同的分类器中,迭代策略与损失函数不一样而已(算法关键)。在感知机分类器中,迭代策略使用update_online_per()函数,损失函数计算使用calc_loss_per()函数。因此,你可以根据自己的需求,在上述算法框架上增加不同的准则策略。

  4.模型的训练与预测

  感知模型训练主函数Demo

  

View Code
View Code 

#include <cstdlib>
#include <iostream>
#include <cstring>
#include "Per.h"

using namespace std;


// Print command-line usage for the training tool and the option defaults.
// VERSION / VERSION_DATE are presumably macros from "Per.h" -- TODO confirm.
// NOTE(review): read_parameters also accepts "-c int" (training criterion),
// which is not listed in this help text.
void print_help() 
{
    cout << "\nPer training module, " << VERSION << ", " << VERSION_DATE << "\n\n"
        << "usage: per_train [options] training_file model_file [pre_model_file]\n\n"
        << "options: -h        -> help\n"
        << "         -n int    -> maximal iteration loops (default 200)\n"
        << "         -m double -> minimal loss value decrease (default 1e-03)\n"
        << "         -r double -> regularization parameter lambda of gaussian prior (default 0)\n"        
        << "         -l float  -> learning rate (default 1.0)\n"
        << "         -a        -> 0: final weight (default)\n"
        << "                   -> 1: average weights of all iteration loops\n"
        << "         -u [0,1]  -> 0: initial training model (default)\n"
        << "                   -> 1: updating model (pre_model_file is needed)\n" 
        << endl;
}

// Parse command-line options for the training tool into the output
// parameters, then copy the positional arguments: training_file,
// model_file, and (in update mode, -u 1) pre_model_file.
// Exits with help text on -h, on unknown or value-less options and on
// missing positional arguments.
void read_parameters(int argc, char *argv[], char *training_file, char *model_file, 
                        int *criter, int *max_loop, double *loss_thrd, float *learn_rate, float *lambda,
                        int *avg, int *update, char *pre_model_file) {
    // set default options
    *criter = 0;
    *max_loop = 200;
    *loss_thrd = 1e-3;
    *learn_rate = 1.0;
    *lambda = 0.0;
    *avg = 0;
    *update = 0;
    int i;
    for (i = 1; (i < argc) && (argv[i])[0] == '-'; i++) {
        char opt = (argv[i])[1];
        if (opt == 'h') {   // -h takes no value, handle before the value check
            print_help();
            exit(0);
        }
        // Every remaining option consumes the next argv entry; the
        // original dereferenced argv[++i] past the end of argv when the
        // value was missing (atoi/atof on a null pointer is UB).
        if (i + 1 >= argc) {
            cout << "Missing value for option: " << argv[i] << "!" << endl;
            print_help();
            exit(0);
        }
        switch (opt) {
            case 'c':
                *criter = atoi(argv[++i]);
                break;
            case 'n':
                *max_loop = atoi(argv[++i]);
                break;
            case 'm':
                *loss_thrd = atof(argv[++i]);
                break;
            case 'l':
                *learn_rate = (float)atof(argv[++i]);
                break;
            case 'r':
                *lambda = (float)atof(argv[++i]);
                break;
            case 'a':
                *avg = atoi(argv[++i]);
                break;
            case 'u':
                *update = atoi(argv[++i]);
                break;
            default:
                cout << "Unrecognized option: " << argv[i] << "!" << endl;
                print_help();
                exit(0);
        }
    }
    
    if ((i + 1) >= argc) {
        cout << "Not enough parameters!" << endl;
        print_help();
        exit(0);
    }
    strcpy(training_file, argv[i]);
    strcpy(model_file, argv[i + 1]);
    if (*update) {
        if ((i + 2) >= argc) {
            cout << "Previous model file is needed in update mode!" << endl;
            print_help();
            exit(0);
        }
        strcpy(pre_model_file, argv[i + 2]);
    }
}

// Training tool entry point: parse options, load the training data,
// train the perceptron online and save the resulting model.
// Returns 0 on success (error paths exit() inside the helpers).
int per_train(int argc, char *argv[])
{
    // Fixed 200-byte buffers filled via strcpy in read_parameters.
    // NOTE(review): arguments longer than 199 chars overflow -- TODO harden.
    char training_file[200];
    char model_file[200];
    int criter;
    int max_loop;
    double loss_thrd;
    float learn_rate;
    float lambda;
    int avg;
    int update;
    char pre_model_file[200];
    read_parameters(argc, argv, training_file, model_file, &criter, &max_loop, &loss_thrd, &learn_rate, &lambda, &avg, &update, pre_model_file);
    
    Per per;
    per.load_training_file(training_file);
    // In update mode (-u 1) continue from a previously saved model,
    // otherwise start from a fresh uniform initialization.
    if (update) {
        per.load_model(pre_model_file);
    }
    else {
        per.init_omega();    
    }
    per.train_online(criter, max_loop, loss_thrd, learn_rate, lambda, avg);
    per.save_model(model_file);
    return 0;
}

// Program entry: delegate to the training driver.
int main(int argc, char *argv[])
{
    return per_train(argc, argv);
}

  感知机模型预测主函数Demo

View Code
View Code 

View Code 

#include <cstdlib>
#include <iostream>
#include <cstring>
#include "Per.h"

using namespace std;


// Print command-line usage for the classification tool.
// VERSION / VERSION_DATE are presumably macros from "Per.h" -- TODO confirm.
// NOTE(review): this demo is a separate program from the training demo
// above; both define print_help/read_parameters/main.
void print_help()
 {
    cout << "\nOpenPR-LDF classification module, " << VERSION << ", " << VERSION_DATE << "\n\n"
        << "usage: ldf_classify [options] testing_file model_file output_file\n\n"
        << "options: -h        -> help\n"
        << "         -f [0..2] -> 0: only output class label (default)\n"
        << "                   -> 1: output class label with log-likelihood (weighted sum)\n"
        << "                   -> 2: output class label with soft probability\n"
        << endl;
}

// Parse command-line options for the classification tool, then copy the
// three positional arguments: testing_file, model_file, output_file.
// Exits with help text on -h, on unknown or value-less options and on
// missing positional arguments.
void read_parameters(int argc, char *argv[], char *testing_file, char *model_file, 
                        char *output_file, int *output_format) 
    {
    // set default options
    *output_format = 0;
    int i;
    for (i = 1; (i < argc) && (argv[i])[0] == '-'; i++) 
    {
        switch ((argv[i])[1]) 
        {
            case 'h':
                print_help();
                exit(0);
            case 'f':
                // -f consumes a value; the original read argv[++i] even
                // when it was past the end of argv (atoi on NULL is UB).
                if (i + 1 >= argc) {
                    cout << "Missing value for option: " << argv[i] << "!" << endl;
                    print_help();
                    exit(0);
                }
                *output_format = atoi(argv[++i]);
                break;
            default:
                cout << "Unrecognized option: " << argv[i] << "!" << endl;
                print_help();
                exit(0);
        }
    }
    
    if ((i + 2) >= argc)
     {
        cout << "Not enough parameters!" << endl;
        print_help();
        exit(0);
    }
    strcpy(testing_file, argv[i]);
    strcpy(model_file, argv[i + 1]);
    strcpy(output_file, argv[i + 2]);
}

// Classification tool entry point: parse options, load the model,
// classify the testing file and print the resulting accuracy.
// Returns 0 on success (error paths exit() inside the helpers).
int per_classify(int argc, char *argv[])
{
    // Fixed 200-byte buffers filled via strcpy in read_parameters.
    // NOTE(review): arguments longer than 199 chars overflow -- TODO harden.
    char testing_file[200];
    char model_file[200];
    char output_file[200];
    int output_format;
    read_parameters(argc, argv, testing_file, model_file, output_file, &output_format);
    Per per;
    per.load_model(model_file);
    float acc = per.classify_testing_file(testing_file, output_file, output_format);
    cout << "Accuracy: " << acc << endl;
    return 0;
}

// Program entry: delegate to the classification driver.
int main(int argc, char *argv[])
{
    return per_classify(argc, argv);
}