
A C++ Implementation of Logistic Regression

2013-06-11 20:14  夜与周公

  In the earlier post "Logistic Regression Model (logistic regression)", the theoretical foundations of the logistic regression model were laid out; this post implements logistic regression in C++. The algorithm was developed together with my advisor (the advisor contributed the algorithmic framework and theoretical support, while I wrote the code, which we plan to release as open source). The implementation and its usage are presented below in five parts: data structures, model implementation, model training, model prediction, and usage.

  1. Data Structures

/********************************************************************
* Logistic Regression Classifier V0.10
* Implemented by Rui Xia(rxia@nlpr.ia.ac.cn) , Wang Tao(wangbogong@gmail.com)
* Last updated on 2012-6-12. 
*********************************************************************/
#pragma once
#include <iostream>
#include <fstream>
#include <iomanip>
#include <string>
#include <vector>
#include <set>
#include <map>
#include <algorithm>
#include <climits>
#include <cstdlib>   // rand()
#include <cmath>
#include <ctime>

# define VERSION       "V0.10"
# define VERSION_DATE  "2012-6-12"

using namespace std;

const long double min_threshold = 1e-300;

struct sparse_feat                       // sparse representation of a feature vector
{
    vector<int> id_vec;
    vector<float> value_vec;
};

class LR                                 // logistic regression classifier
{
private:
    vector<sparse_feat> samp_feat_vec;
    vector<int> samp_class_vec;
    int feat_set_size;
    int class_set_size;
    vector< vector<float> > omega;       // parameter matrix omega: feat_set_size x class_set_size
     
public:
    LR();
    ~LR();
    void save_model(string model_file);
    void load_model(string model_file);
    void load_training_file(string training_file);
    void init_omega();
    
    int train_online(int max_loop, double loss_thrd, float learn_rate, float lambda, int avg);    // stochastic gradient descent training
    vector<float> calc_score(sparse_feat &samp_feat);
    vector<float> score_to_prb(vector<float> &score);
    int score_to_class(vector<float> &score);
    
    float classify_testing_file(string testing_file, string output_file, int output_format);      // classify a testing file and return accuracy

private:
    void read_samp_file(string samp_file, vector<sparse_feat> &samp_feat_vec, vector<int> &samp_class_vec);   // read samples in sparse format
    void update_online_ce(int samp_class, sparse_feat &samp_feat, float learn_rate, float lambda);
    void calc_loss_ce(double *loss, float *acc);                                                              // compute cross-entropy loss and accuracy
    float calc_acc(vector<int> &test_class_vec, vector<int> &pred_class_vec);    
    float sigmoid(float x);
    vector<string> string_split(string terms_str, string spliting_tag);

};
  The sparse_feat struct stores a sample in sparse form: id_vec holds the feature ids, and value_vec holds the corresponding feature values. The LR class implements the logistic regression classifier. Its two main methods are train_online() and classify_testing_file(): train_online() trains the model by stochastic gradient descent, and classify_testing_file() predicts the class labels of test samples.
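
  As a quick illustration (a hypothetical sample, not part of the library), the sketch below shows the sparse_feat that read_samp_file() would build for the input line "1\t3:0.5 7:1.2"; note that feature id 0 with value 1 is always prepended as the bias term:

// Hypothetical example: the sparse_feat built for the line "1\t3:0.5 7:1.2".
sparse_feat samp_feat;
samp_feat.id_vec.push_back(0);        // bias feature, always present
samp_feat.value_vec.push_back(1.0f);
samp_feat.id_vec.push_back(3);        // feature 3 with value 0.5
samp_feat.value_vec.push_back(0.5f);
samp_feat.id_vec.push_back(7);        // feature 7 with value 1.2
samp_feat.value_vec.push_back(1.2f);
// The class label (1) is stored separately in samp_class_vec.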

  2. Model Implementation

#include "LR.h"

LR::LR()
{

}

LR::~LR()
{
}

void LR::save_model(string model_file)
{ 
    cout << "Saving model..." << endl;
    ofstream fout(model_file.c_str());
    for (int k = 0; k < feat_set_size; k++) {
        for (int j = 0; j < class_set_size; j++) {
            fout << omega[k][j] << " ";
        }
        fout << endl;
    }
    fout.close();
}


void LR::load_model(string model_file)
{
    cout << "Loading model..." << endl;
    omega.clear();
    ifstream fin(model_file.c_str());
    if(!fin) {
        cerr << "Error opening file: " << model_file << endl;
        exit(1);
    }
    string line_str;
    while (getline(fin, line_str)) {
        vector<string> line_vec = string_split(line_str, " ");
        vector<float>  line_omega;
        for (vector<string>::iterator it = line_vec.begin(); it != line_vec.end(); it++) {
            float weight = (float)atof(it->c_str());
            line_omega.push_back(weight);
        }
        omega.push_back(line_omega);
    }
    fin.close();
    feat_set_size = (int)omega.size();
    class_set_size = (int)omega[0].size();
}


void LR::read_samp_file(string samp_file, vector<sparse_feat> &samp_feat_vec, vector<int> &samp_class_vec) {
    ifstream fin(samp_file.c_str());
    if(!fin) {
        cerr << "Error opening file: " << samp_file << endl;
        exit(1);
    }
    string line_str;
    while (getline(fin, line_str)) 
    {
        size_t class_pos = line_str.find_first_of("\t");
        int class_id = atoi(line_str.substr(0, class_pos).c_str());
        samp_class_vec.push_back(class_id);
        string terms_str = line_str.substr(class_pos+1);
        sparse_feat samp_feat;
        samp_feat.id_vec.push_back(0); // bias
        samp_feat.value_vec.push_back(1); // bias
        if (terms_str != "") 
        {
            vector<string> fv_vec = string_split(terms_str, " ");
            for (vector<string>::iterator it = fv_vec.begin(); it != fv_vec.end(); it++) 
            {
                size_t feat_pos = it->find_first_of(":");
                int feat_id = atoi(it->substr(0, feat_pos).c_str());
                float feat_value = (float)atof(it->substr(feat_pos+1).c_str());
                samp_feat.id_vec.push_back(feat_id);
                samp_feat.value_vec.push_back(feat_value);
            }
        }
        samp_feat_vec.push_back(samp_feat);
    }
    fin.close();
}


void LR::load_training_file(string training_file)
{
    cout << "Loading training data..." << endl;
    read_samp_file(training_file, samp_feat_vec, samp_class_vec);
    feat_set_size = 0;
    class_set_size = 0;
    for (size_t i = 0; i < samp_class_vec.size(); i++) {
        if (samp_class_vec[i] > class_set_size) {
            class_set_size = samp_class_vec[i];
        }
        // feature ids are assumed to be ascending within a sample, so the
        // last id of each sample is its largest feature id
        if (samp_feat_vec[i].id_vec.back() > feat_set_size) {
            feat_set_size = samp_feat_vec[i].id_vec.back();
        }
    }
    class_set_size += 1;
    feat_set_size += 1;
}

void LR::init_omega()
{
    float init_value = 0.0;
    //float init_value = (float)1/class_set_size;   // alternative: uniform initialization
    for (int i = 0; i < feat_set_size; i++) 
    {
        vector<float> temp_vec(class_set_size, init_value);
        omega.push_back(temp_vec);
    }
}

// Stochastic Gradient Descent (SGD) optimization for the Cross Entropy (CE) criterion
int LR::train_online(int max_loop, double loss_thrd, float learn_rate, float lambda, int avg)
{
    int id = 0;
    double loss = 0.0;
    double loss_pre = 0.0;
    vector< vector<float> > omega_pre=omega;
    float acc=0.0;

    vector< vector<float> > omega_sum(omega);
    
    srand((unsigned)time(NULL));            // seed the random sample selection
    while (id <= max_loop * (int)samp_class_vec.size())
    {
        if (id % samp_class_vec.size() == 0)    // an epoch boundary: check the loss
        {
            int loop = id / (int)samp_class_vec.size();   // current epoch
            loss = 0.0;
            acc = 0.0;

            calc_loss_ce(&loss, &acc);
            cout.setf(ios::left);
            cout << "Iter: " << setw(8) << loop << "Loss: " << setw(18) << loss << "Acc: " << setw(8) << acc << endl;
            if ((loss_pre - loss) < loss_thrd && loss_pre >= loss && id != 0)
            {
                cout << "Reaching the minimal loss value decrease!" << endl;
                break;
            }
            loss_pre = loss;
                                
            if (id)            // skip the regularization term before the first epoch
            {
                // once per epoch, apply the Gaussian-prior (L2) penalty,
                // shrinking the weights towards zero
                for (size_t i = 0; i != omega_pre.size(); i++)
                    for (size_t j = 0; j != omega_pre[i].size(); j++)
                        omega[i][j] -= omega_pre[i][j] * lambda;
            }

            omega_pre=omega;
        }

        // pick a random training sample and update omega with its gradient
        int r = (int)(rand() % samp_class_vec.size());
        sparse_feat samp_feat = samp_feat_vec[r];
        int samp_class = samp_class_vec[r];

        update_online_ce(samp_class, samp_feat, learn_rate, lambda);
        if (avg == 1 && id%samp_class_vec.size() == 0)   // snapshot the weights once per epoch
        {
            for (int i = 0; i < feat_set_size; i++) 
            {
                for (int j = 0; j < class_set_size; j++) 
                {
                    omega_sum[i][j] += omega[i][j];
                }
            }            
        }
        id++;
    }

    if (avg == 1) 
    {
        // omega_sum holds roughly one snapshot per iteration loop, so average
        // over the number of loops rather than the total sample count
        int num_snapshots = id / (int)samp_class_vec.size() + 1;
        for (int i = 0; i < feat_set_size; i++) 
        {
            for (int j = 0; j < class_set_size; j++)
            {
                omega[i][j] = omega_sum[i][j] / num_snapshots;
            }
        }        
    }
    return 0;
}

void LR::update_online_ce(int samp_class, sparse_feat &samp_feat, float learn_rate, float lambda)
{
    
    vector<float> score = calc_score(samp_feat);           // score = W' * x
    vector<float> softMaxVec(class_set_size);

    float maxScore = *(max_element(score.begin(), score.end()));
    float softMaxSum = 0;

    for (int j = 0; j < class_set_size; j++)
    {
        softMaxVec[j] = exp(score[j] - maxScore);          // subtract the max score for numerical stability
        softMaxSum += softMaxVec[j];
    }
    for (int k = 0; k < class_set_size; k++)
        softMaxVec[k] /= softMaxSum;

    for (int i = 0; i < class_set_size; i++)               // for each class
    {
        // gradient of the cross-entropy loss w.r.t. the score of class i
        float error_term = (int)(i == samp_class) - softMaxVec[i];
        for (size_t j = 0; j < samp_feat.id_vec.size(); j++)   // for each active feature of the sample
        {
            int feat_id = samp_feat.id_vec[j];
            float feat_value = samp_feat.value_vec[j];
            float delt = error_term * feat_value;
            omega[feat_id][i] += learn_rate * delt;
        }
    }
}

void LR::calc_loss_ce(double *loss, float *acc)
{
    double loss_value = 0.0;
    int err_num = 0;

    for (size_t i = 0; i < samp_class_vec.size(); i++) 
    {
        int samp_class = samp_class_vec[i];
        sparse_feat samp_feat = samp_feat_vec[i];

        vector<float> score = calc_score(samp_feat);
        vector<float> softMaxVec(class_set_size);
        float softMaxSum = 0;

        int pred_class = score_to_class(score);
        if (pred_class != samp_class)
        {
            err_num += 1;
        }

        float maxScore = *(max_element(score.begin(), score.end()));
        for (int k = 0; k < class_set_size; k++)
        {
            softMaxVec[k] = exp(score[k] - maxScore);      // subtract the max score for numerical stability
            softMaxSum += softMaxVec[k];
        }

        // cross-entropy loss: -log of the predicted probability of the true class
        double yi = softMaxVec[samp_class] / softMaxSum;
        long double temp = yi < min_threshold ? min_threshold : yi;
        loss_value -= log(temp);
    }

    *loss = loss_value;
    *acc = 1 - (float)err_num / samp_class_vec.size();
}

vector<float> LR::calc_score(sparse_feat &samp_feat)
{
    vector<float> score(class_set_size, 0);
    for (int j = 0; j < class_set_size; j++)
    {
        for (size_t k = 0; k < samp_feat.id_vec.size(); k++) 
        {
            int feat_id = samp_feat.id_vec[k];
            float feat_value = samp_feat.value_vec[k];
            score[j] += omega[feat_id][j] * feat_value;
        }
    }
    return score;
}

vector<float> LR::score_to_prb(vector<float> &score)
{   
    vector<float> softMaxVec(class_set_size);

    float maxScore = *(max_element(score.begin(), score.end()));
    float softMaxSum = 0;

    for (int j = 0; j < class_set_size; j++)
    {
        softMaxVec[j] = exp(score[j] - maxScore);          // subtract the max score for numerical stability
        softMaxSum += softMaxVec[j];
    }
    for (int k = 0; k < class_set_size; k++)
        softMaxVec[k] /= softMaxSum;

    return softMaxVec;
}


int LR::score_to_class(vector<float> &score)
{
    int pred_class = 0;    
    float max_score = score[0];
    for (int j = 1; j < class_set_size; j++) {
        if (score[j] > max_score) {
            max_score = score[j];
            pred_class = j;
        }
    }
    return pred_class;
}

float LR::classify_testing_file(string testing_file, string output_file, int output_format)
{
    cout << "Classifying testing file..." << endl;
    vector<sparse_feat> test_feat_vec;
    vector<int> test_class_vec;
    vector<int> pred_class_vec;
    read_samp_file(testing_file, test_feat_vec, test_class_vec);
    ofstream fout(output_file.c_str());
    for (size_t i = 0; i < test_class_vec.size(); i++) 
    {
        int samp_class = test_class_vec[i];
        sparse_feat samp_feat = test_feat_vec[i];
        vector<float> pred_score = calc_score(samp_feat);            
        int pred_class = score_to_class(pred_score);
        pred_class_vec.push_back(pred_class);
        fout << pred_class << "\t" << samp_class << "\t";
        if (output_format == 1) 
        {
            for (int j = 0; j < class_set_size; j++) 
            {
                fout << j << ":" << pred_score[j] << ' '; 
            }        
        }
        else if (output_format == 2) 
        {
            vector<float> pred_prb = score_to_prb(pred_score);
            for (int j = 0; j < class_set_size; j++)
            {
                fout << j << ":" << pred_prb[j] << ' '; 
            }
        }

        fout << endl;        
    }
    fout.close();
    float acc = calc_acc(test_class_vec, pred_class_vec);
    return acc;
}

float LR::calc_acc(vector<int> &test_class_vec, vector<int> &pred_class_vec)
{
    size_t len = test_class_vec.size();
    if (len != pred_class_vec.size()) {
        cerr << "Error: two vectors should have the same lenght." << endl;
        exit(0);
    }
    int err_num = 0;
    for (size_t id = 0; id != len; id++) {
        if (test_class_vec[id] != pred_class_vec[id]) {
            err_num++;
        }
    }
    return 1 - ((float)err_num) / len;
}

float LR::sigmoid(float x) 
{
    double sgm = 1 / (1+exp(-(double)x));
    return (float)sgm;
}

vector<string> LR::string_split(string terms_str, string spliting_tag)
{
    vector<string> feat_vec;
    size_t term_beg_pos = 0;
    size_t term_end_pos = 0;
    while ((term_end_pos = terms_str.find_first_of(spliting_tag, term_beg_pos)) != string::npos) 
    {
        if (term_end_pos > term_beg_pos)
        {
            string term_str = terms_str.substr(term_beg_pos, term_end_pos - term_beg_pos);
            feat_vec.push_back(term_str);
        }
        term_beg_pos = term_end_pos + 1;
    }
    if (term_beg_pos < terms_str.size())
    {
        string end_str = terms_str.substr(term_beg_pos);
        feat_vec.push_back(end_str);
    }
    return feat_vec;
}

  3. Model Training Code

#include <cstdlib>
#include <iostream>
#include <cstring>

#include "LR.h"

using namespace std;


void print_help() 
{
    cout << "\nOpenPR-LR training module, " << VERSION << ", " << VERSION_DATE << "\n\n"
        << "usage: LR_train [options] training_file model_file [pre_model_file]\n\n"
        << "options: -h        -> help\n"
        << "         -n int    -> maximal iteration loops (default 200)\n"
        << "         -m double -> minimal loss value decrease (default 1e-03)\n"
        << "         -r double -> regularization parameter lambda of gaussian prior (default 0)\n"        
        << "         -l float  -> learning rate (default 1.0)\n"
        << "         -a        -> 0: final weight (default)\n"
        << "                   -> 1: average weights of all iteration loops\n"
        << "         -u [0,1]  -> 0: initial training model (default)\n"
        << "                   -> 1: updating model (pre_model_file is needed)\n" 
        << endl;
}

void read_parameters(int argc, char *argv[], char *training_file, char *model_file, 
                     int *max_loop, double *loss_thrd, float *learn_rate, float *lambda,
                        int *avg, int *update, char *pre_model_file)
{
    // set default options
    *max_loop = 200;
    *loss_thrd = 1e-3;
    *learn_rate = 1.0;
    *lambda = 0.0;
    *avg = 0;
    *update = 0;
    int i;
    for (i = 1; (i<argc) && (argv[i])[0]=='-'; i++) 
    {
        switch ((argv[i])[1]) {
            case 'h':
                print_help();
                exit(0);
            case 'n':
                *max_loop = atoi(argv[++i]);
                break;
            case 'm':
                *loss_thrd = atof(argv[++i]);
                break;
            case 'l':
                *learn_rate = (float)atof(argv[++i]);
                break;
            case 'r':
                *lambda = (float)atof(argv[++i]);
                break;
            case 'a':
                *avg = atoi(argv[++i]);
                break;
            case 'u':
                *update = atoi(argv[++i]);
                break;
            default:
                cout << "Unrecognized option: " << argv[i] << "!" << endl;
                print_help();
                exit(1);
        }
    }
    
    if ((i+1)>=argc) 
    {
        cout << "Not enough parameters!" << endl;
        print_help();
        exit(1);
    }

    strcpy (training_file, argv[i]);
    strcpy (model_file, argv[i+1]);
    if (*update) 
    {
        if ((i+2)>=argc) 
        {
            cout << "Previous model file is needed in update mode!" << endl;
            print_help();
            exit(1);
        }
        strcpy (pre_model_file, argv[i+2]);
    }
}

int LR_train(int argc, char *argv[])
{
    char training_file[200];
    char model_file[200];
    int max_loop;
    double loss_thrd;
    float learn_rate;
    float lambda;
    int avg;
    int update;
    char pre_model_file[200];
    read_parameters(argc, argv, training_file, model_file, &max_loop, &loss_thrd, &learn_rate, &lambda, &avg, &update, pre_model_file);
    
    LR lr;
    lr.load_training_file(training_file);
    if (update) 
    {
        lr.load_model(pre_model_file);
    }
    else 
    {
        lr.init_omega();    
    }
    lr.train_online(max_loop, loss_thrd, learn_rate, lambda, avg);
    lr.save_model(model_file);
    return 0;
}

int main(int argc, char *argv[])
{
    return LR_train(argc, argv);
}

  4. Model Prediction

#include <cstdlib>
#include <iostream>
#include <cstring>
#include "LR.h"

using namespace std;


void print_help()
{
    cout << "\nOpenPR-LR classification module, " << VERSION << ", " << VERSION_DATE << "\n\n"
        << "usage: LR_classify [options] testing_file model_file output_file\n\n"
        << "options: -h        -> help\n"
        << "         -f [0..2] -> 0: only output class label (default)\n"
        << "                   -> 1: output class label with log-likelihood (weighted sum)\n"
        << "                   -> 2: output class label with soft probability\n"
        << endl;
}

void read_parameters(int argc, char *argv[], char *testing_file, char *model_file, 
                        char *output_file, int *output_format) 
{
    // set default options
    *output_format = 0;
    int i;
    for (i = 1; (i<argc) && (argv[i])[0]=='-'; i++)
    {
        switch ((argv[i])[1]) {
            case 'h':
                print_help();
                exit(0);
            case 'f':
                *output_format = atoi(argv[++i]);
                break;
            default:
                cout << "Unrecognized option: " << argv[i] << "!" << endl;
                print_help();
                exit(1);
        }
    }
    
    if ((i+2)>=argc)
    {
        cout << "Not enough parameters!" << endl;
        print_help();
        exit(1);
    }
    strcpy(testing_file, argv[i]);
    strcpy(model_file, argv[i+1]);
    strcpy(output_file, argv[i+2]);
}

int LR_classify(int argc, char *argv[])
{
    char testing_file[200];
    char model_file[200];
    char output_file[200];
    int output_format;
    read_parameters(argc, argv, testing_file, model_file, output_file, &output_format);
    LR lr;
    lr.load_model(model_file);
    float acc = lr.classify_testing_file(testing_file, output_file, output_format);
    cout << "Accuracy: " << acc << endl;
    return 0;
}

int main(int argc, char *argv[])
{
    return LR_classify(argc, argv);
}

  5. Usage

  The sample data format is the same as libsvm's: samples are stored in a sparse representation.
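
  For illustration, a training file might contain lines like the following (hypothetical labels and values), one sample per line in the form label<TAB>feat_id:value feat_id:value ...:

0	1:0.3 4:1.0 9:0.5
1	2:1.0 4:0.2
1	1:0.8 9:1.5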

  Model training: > lr_train -n 50 -m 1e-06 data/train.samp result/ldf.mod. The training samples are read from data/train.samp, the maximal number of iteration loops is 50, the minimal loss value decrease is 1e-06, and the trained model is written to result/ldf.mod.
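
  During training, train_online() prints one progress line per iteration loop in the format below (the numbers are illustrative):

Iter: 0       Loss: 138.629           Acc: 0.5
Iter: 1       Loss: 97.241            Acc: 0.73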

  Classification: > lr_classify data/test.samp result/ldf.mod result/ldf.out. The test samples are read from data/test.samp, and the prediction results are written to result/ldf.out.
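
  Each output line starts with the predicted and the true class label separated by tabs; with -f 1 the class scores and with -f 2 the class probabilities are appended. For a two-class problem, a line produced with -f 2 might look like this (illustrative values):

1	1	0:0.12 1:0.88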

 (P.S.: the logistic regression toolkit, including documentation, the source code, and an executable built with VS2010, is available here.)
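
  To build the two programs from source with g++ (a minimal sketch; the file names LR.cpp, LR_train.cpp and LR_classify.cpp are assumed from the listings above):

g++ -O2 LR.cpp LR_train.cpp -o lr_train
g++ -O2 LR.cpp LR_classify.cpp -o lr_classify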