作者:finallyliuyu 转载使用等请注明出处
首先介绍libsvm 中主要的文件svm.h,svm.c ,这个两个文件实现了svm的算法。 svm-train.c,svm-predict.c 分别完成训练和预测功能。
本来我参照svm-train,svm-predict中的 main函数,将train功能,和predict功能直接在程序中整合,结果,调了一天都有异常。。(我还是太菜了)最后在同学的建议下 工程中改用系统调用的方式。为了获得准确率(将分类准确率输出到文本文件),将svm-predict函数做了如下修改:
注意 accuracy_file部分对应的修改。
void predict(FILE *input, FILE *output, FILE *accuracy_file) { int correct = 0; int total = 0; double error = 0; double sump = 0, sumt = 0, sumpp = 0, sumtt = 0, sumpt = 0; int svm_type=svm_get_svm_type(model); int nr_class=svm_get_nr_class(model); double *prob_estimates=NULL; int j; if(predict_probability) { if (svm_type==NU_SVR || svm_type==EPSILON_SVR) printf("Prob. model for test data: target value = predicted value + z,\nz: Laplace distribution e^(-|z|/sigma)/(2sigma),sigma=%g\n",svm_get_svr_probability(model)); else { int *labels=(int *) malloc(nr_class*sizeof(int)); svm_get_labels(model,labels); prob_estimates = (double *) malloc(nr_class*sizeof(double)); fprintf(output,"labels"); for(j=0;j<nr_class;j++) fprintf(output," %d",labels[j]); fprintf(output,"\n"); free(labels); } } max_line_len = 1024; line = (char *)malloc(max_line_len*sizeof(char)); while(readline(input) != NULL) { int i = 0; double target_label, predict_label; char *idx, *val, *label, *endptr; int inst_max_index = -1; // strtol gives 0 if wrong format, and precomputed kernel has <index> start from 0 label = strtok(line," \t"); target_label = strtod(label,&endptr); if(endptr == label) exit_input_error(total+1); while(1) { if(i>=max_nr_attr-1) // need one more for index = -1 { max_nr_attr *= 2; x = (struct svm_node *) realloc(x,max_nr_attr*sizeof(struct svm_node)); } idx = strtok(NULL,":"); val = strtok(NULL," \t"); if(val == NULL) break; errno = 0; x[i].index = (int) strtol(idx,&endptr,10); if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index <= inst_max_index) exit_input_error(total+1); else inst_max_index = x[i].index; errno = 0; x[i].value = strtod(val,&endptr); if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr))) exit_input_error(total+1); ++i; } x[i].index = -1; if (predict_probability && (svm_type==C_SVC || svm_type==NU_SVC)) { predict_label = svm_predict_probability(model,x,prob_estimates); fprintf(output,"%g",predict_label); for(j=0;j<nr_class;j++) fprintf(output," %g",prob_estimates[j]); fprintf(output,"\n"); } else { predict_label = svm_predict(model,x); fprintf(output,"%g\n",predict_label); } if(predict_label == target_label) ++correct; error += (predict_label-target_label)*(predict_label-target_label); sump += predict_label; sumt += target_label; sumpp += predict_label*predict_label; sumtt += target_label*target_label; sumpt += predict_label*target_label; ++total; } if (svm_type==NU_SVR || svm_type==EPSILON_SVR) { printf("Mean squared error = %g (regression)\n",error/total); printf("Squared correlation coefficient = %g (regression)\n", ((total*sumpt-sump*sumt)*(total*sumpt-sump*sumt))/ ((total*sumpp-sump*sump)*(total*sumtt-sumt*sumt)) ); } else { float accuracy_rate = (float)correct/total*100; fprintf(accuracy_file, "%f", accuracy_rate); printf("Accuracy = %g%% (%d/%d) (classification)\n", (double)correct/total*100,correct,total); } if(predict_probability) free(prob_estimates); }
int main(int argc, char **argv) { FILE *input, *output, *accuracy_file; int i; ////@@debug //fprintf(stdout, "my svm_predict..."); // parse options for(i=1;i<argc;i++) { if(argv[i][0] != '-') break; ++i; switch(argv[i-1][1]) { case 'b': predict_probability = atoi(argv[i]); break; default: fprintf(stderr,"Unknown option: -%c\n", argv[i-1][1]); exit_with_help(); } } //if(i>=argc-2) if(i>=argc-3) exit_with_help(); input = fopen(argv[i],"r"); if(input == NULL) { fprintf(stderr,"can't open input file %s\n",argv[i]); exit(1); } if((model=svm_load_model(argv[i+1]))==0) { fprintf(stderr,"can't open model file %s\n",argv[i+1]); exit(1); } output = fopen(argv[i+2],"w"); if(output == NULL) { fprintf(stderr,"can't open output file %s\n",argv[i+2]); exit(1); } accuracy_file = fopen(argv[i+3], "w"); if(accuracy_file == NULL) { fprintf(stderr,"can't open output file %s\n",argv[i+3]); exit(1); } x = (struct svm_node *) malloc(max_nr_attr*sizeof(struct svm_node)); if(predict_probability) { if(svm_check_probability_model(model)==0) { fprintf(stderr,"Model does not support probabiliy estimates\n"); exit(1); } } else { if(svm_check_probability_model(model)!=0) printf("Model supports probability estimates, but disabled in prediction.\n"); } predict(input,output, accuracy_file); svm_destroy_model(model); free(x); free(line); fclose(input); fclose(output); return 0; }
调用Libsvm完成分类,准确率计算的主程序。(我的代码)
include "stdio.h" #include "stdlib.h" #include "memory.h" #include "string.h" #define MAX_COMMAND_LINE_LENGTH 2048 int svm_train(char *command_path, char *train_libsvm, char *model_libsvm) { // 生成命令行 char command_line[MAX_COMMAND_LINE_LENGTH] = {'\0'}; sprintf(command_line, "%s -t 0 %s %s", command_path, train_libsvm, model_libsvm); // 执行命令行 system(command_line); return 1; } int svm_predict(char *command_path, char *test_libsvm, char *model_libsvm, char *result_path, char *accuracy_path) { // 生成命令行 char command_line[MAX_COMMAND_LINE_LENGTH] = {'\0'}; sprintf(command_line, "%s %s %s %s %s", command_path, test_libsvm, model_libsvm, result_path, accuracy_path); // 执行命令行 system(command_line); return 1; } int main() { void AccuracyFormation(); int LibSvm(); int end; AccuracyFormation(); //LibSvm(); printf("finalfinish,congratulations!"); scanf("%d",&end); return 1; } //char command_line[MAX_COMMAND_LINE_LENGTH] = {'\0'}; //// train //sprintf(command_line, "..\\Release\\svm_train.exe -t 0 D:\\libsvmdata\\500\\0\\100\\train.libsvm D:\\libsvmdata\\500\\0\\100\\model.libsvm"); //system(command_line); //// predict ////command_line[0] = '\0'; //memset(command_line, 0, sizeof(command_line[0])*MAX_COMMAND_LINE_LENGTH); // //sprintf(command_line, "..\\Release\\svm_predict.exe D:\\libsvmdata\\500\\0\\100\\test.libsvm D:\\libsvmdata\\500\\0\\100\\model.libsvm D:\\libsvmdata\\500\\0\\100\\result.txt D:\\libsvmdata\\500\\0\\100\\accuracy.txt"); //system(command_line); int LibSvm() { int vfold=5; char *train_path_part="\\data\\train.libsvm"; char *test_path_part="\\data\\test.libsvm"; char *result_path_part="\\data\\result.txt"; char *model_path_part="\\data\\model.libsvm"; char *accuracy_path_part="\\data\\accuracy.txt"; char featureDimensions[15][20]={"10","20","30","40","50","60","70","80","90","100","110","120","130","140","150"};//特征维数 char done_research_times[5][10]={"0","1","2","3","4"};//已经进行了几次试验 char N_corpus[4][20]={"100","500","1000","1500"};//文档集规模 char command_path_train[] = "..\\Release\\svm_train.exe"; char command_path_predict[] = "..\\Release\\svm_predict.exe"; /*char train_libsvm[] = "D:\\1_100\\TextCategorization_1_100_100\\data\\train.libsvm"; char test_libsvm[] = "D:\\1_100\\TextCategorization_1_100_100\\data\\test.libsvm"; char model_libsvm[] = "D:\\1_100\\TextCategorization_1_100_100\\data\\model.libsvm"; char result_path_1[] = "D:\\1_100\\TextCategorization_1_100_100\\data\\result.txt"; char accuracy_path_1[] = "D:\\1_100\\TextCategorization_1_100_100\\data\\accuracy.txt";*/ char file_address[300][5000]; char *temp=(char*) malloc(10000); char *train_path=(char*) malloc(10000); char *test_path=(char*) malloc(10000); char *result_path=(char*) malloc(10000); char *model_path=(char *)malloc(10000); char *accuracy_path=(char *)malloc(10000); int i,j,k; memset(temp,0,10000); memset(train_path,0,10000); memset(result_path,0,10000); memset(test_path,0,10000); memset(model_path,0,10000); memset(accuracy_path,0,10000); /* // train svm_train(command_path_train, train_libsvm, model_libsvm); // predict svm_predict(command_path_predict, test_libsvm, model_libsvm, result_path_1, accuracy_path_1); // 停住 system("pause"); return 1;*/ /*************************************生成文件名****************************************************/ for(i=0;i<5;i++)//指征 done_research_times { for(j=0;j<4;j++)//指征文档集规模 { for( k=0;k<15;k++)//指征特征词维数 { strcat(temp,"D:\\"); strcat(temp,done_research_times[i]); strcat(temp,"_"); strcat(temp,N_corpus[j]); strcat(temp,"_rfinish"); strcat(temp,"\\TextCategorization_"); strcat(temp,done_research_times[i]); strcat(temp,"_"); strcat(temp,N_corpus[j]); strcat(temp,"_"); strcat(temp,featureDimensions[k]); strcpy(file_address[i*60+j*15+k],temp); //printf("%s\n",temp); memset(temp,0,10000); } } } free(temp); for(i=0;i<300;i++) { //printf("%s\n", file_address[i]); strcat(train_path,file_address[i]); strcat(train_path,train_path_part); strcat(test_path,file_address[i]); strcat(test_path,test_path_part); strcat(result_path,file_address[i]); strcat(result_path,result_path_part); strcat(model_path,file_address[i]); strcat(model_path,model_path_part); strcat(accuracy_path,file_address[i]); strcat(accuracy_path,accuracy_path_part); // train svm_train(command_path_train, train_path, model_path); // predict svm_predict(command_path_predict, test_path, model_path, result_path, accuracy_path); printf("\n%s路径下的LibSVM分类完成\n",file_address[i]); memset(train_path,0,10000); memset(result_path,0,10000); memset(test_path,0,10000); memset(model_path,0,10000); memset(accuracy_path,0,10000); } free(train_path); free(test_path); free(model_path); free(result_path); free(accuracy_path); printf("试验完成\n"); return 1; } void AccuracyFormation() { char *accuracy_path_part="\\data\\accuracy.txt"; //char featureDimensions[11][20]={"100","500","1000","1500","2000","2500","3000","3500","4000","4500","5000"};//特征维数 char featureDimensions[15][20]={"10","20","30","40","50","60","70","80","90","100","110","120","130","140","150"};//特征维数 char done_research_times[5][10]={"0","1","2","3","4"};//已经进行了几次试验 char N_corpus[4][20]={"100","500","1000","1500"};//文档集规模 char *accuracy_path=(char *)malloc(10000); char dest_accuracy[5][20]={"0.txt","1.txt","2.txt","3.txt","4.txt"}; int i,j,k; int reallen=0; FILE *fp=NULL; char *temp=(char *)malloc(1000); memset(accuracy_path,0,10000); memset(temp,0,100); for(i=0;i<5;i++)//指征 done_research_times { for(j=0;j<4;j++)//指征文档集规模 { for( k=0;k<15;k++)//指征特征词维数 { //构造路径 strcat(accuracy_path,"D:\\"); strcat(accuracy_path,done_research_times[i]); strcat(accuracy_path,"_"); strcat(accuracy_path,N_corpus[j]); strcat(accuracy_path,"_r1"); strcat(accuracy_path,"\\TextCategorization_"); strcat(accuracy_path,done_research_times[i]); strcat(accuracy_path,"_"); strcat(accuracy_path,N_corpus[j]); strcat(accuracy_path,"_"); strcat(accuracy_path,featureDimensions[k]); strcat(accuracy_path,accuracy_path_part); fp=fopen(accuracy_path,"r"); if(fp==NULL) { printf("FILENAEM ERROR"); exit(0); } fread(temp,1,100,fp); fclose(fp); fp=fopen(dest_accuracy[i],"a"); if(fp==NULL) { printf("FILENAEM ERROR"); exit(0); } if(k<14)//添加逗号 { strcat(temp,","); } fwrite(temp,1,strlen(temp),fp); fclose(fp); printf("%s处理完毕\n",accuracy_path); memset(accuracy_path,0,10000); memset(temp,0,1000); } fp=fopen(dest_accuracy[i],"a"); if(fp==NULL) { printf("FILENAEM ERROR"); exit(0); } strcat(temp,"\r\n"); fwrite(temp,1,strlen(temp),fp); fclose(fp); printf("一行处理完毕\n"); memset(temp,0,1000); } printf("%s填写完毕\n",dest_accuracy[i]); } free(temp); free(accuracy_path); }