归一化问题
原理:
进行缩放的原因和使用神经网络时的考虑是一样的,因为RBF网络中采用样本数据的欧氏距离来进行计算。缩放的主要优点就是避免数值范围较大的属性支配数值范围较小的属性。另一个优点就是避免计算时的数值困难(numerical difficulties),因为核值通常依赖特征向量的内积(inner product),而较大的属性值可能导致数值问题。因此推荐把每个属性缩放到[-1, 1]或者[0, 1]之间,而且前一个范围要比后一个好,即对列向量进行规范化,其详细解释和计算公式见 http://www.faqs.org/faqs/ai-faq/neural-nets/part2/ 中的“Should I standardize the input variables (column vectors)?”。libsvm中没有考虑属性的类型(效益、成本、固定、偏离、区间、偏离区间 6 种不同的属性类型的规范化计算公式是不一样的,详见:徐泽水,《不确定多属性决策方法及应用》,清华大学出版社,2004。)而采用了统一的线性缩放,作者以为此处可以改进一下。
需要注意的是,在进行测试之前,要对测试数据进行同样的缩放操作。其实在libsvm中有程序(svmscale.exe)来进行缩放操作。
上面这两种方法基本上可以完成所有的样本的预处理了。
解决方法:
源:A[]
结果:B[]
A的最大最小值 MaxVal,MinVal
B中希望的最大最小值 MaxOut,MinOut
循环
{
B[] = (A[] - MinVal) / (MaxVal - MinVal) * (MaxOut - MinOut) + MinOut;
}
思路:准备把数据从txt读入vector二维数组进行处理!
一点想法,想保存起来:
主要是得到了转置矩阵,不过后来想想貌似不起神马作用:
// Reads a whitespace-separated table of numbers from a text file (one row per
// line) and prints the transposed table to standard output.
#include <algorithm>
#include <cstddef>
#include <exception>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

/// Writes every element of `vec` to `os`, each followed by a single space.
template <class T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& vec)
{
    std::copy(vec.begin(), vec.end(), std::ostream_iterator<T>(os, " "));
    return os;
}

/// Writes a two-dimensional vector to `os`, one inner vector per line.
template <class T>
std::ostream& operator<<(std::ostream& os, const std::vector<std::vector<T> >& vec)
{
    // `typename` is required: the iterator type depends on T.
    typedef typename std::vector<std::vector<T> >::const_iterator RowIterator;
    for (RowIterator row = vec.begin(); row != vec.end(); ++row) {
        os << *row << '\n';
    }
    return os;
}

/// Min-max scales `*data` into [0, 1] using the bounds `*minval` and `*maxval`.
///
/// @param minval  pointer to the smallest value of the attribute
/// @param maxval  pointer to the largest value of the attribute
/// @param data    pointer to the value being scaled
/// @return (*data - *minval) / (*maxval - *minval), or T() when both bounds
///         are equal (the range is empty, so there is nothing to scale).
template <class T>
T normalization(T* minval, T* maxval, int* data)
{
    // The bounds must be dereferenced; subtracting the pointers themselves
    // would yield an address distance, not a value range.
    const T range = *maxval - *minval;
    if (range == T()) {
        return T();
    }
    return (*data - *minval) / range;
}

/// Returns the transpose of `ivecvec` (despite the name, this is not a matrix
/// inverse): element [i][j] of the result equals ivecvec[j][i].
///
/// @param ivecvec  rectangular table; every row must have the same length
/// @return the transposed table; an empty table for empty input
/// @throws std::invalid_argument if the rows do not all have the same length
template <class T>
std::vector<std::vector<T> > InverseMatrix(const std::vector<std::vector<T> >& ivecvec)
{
    if (ivecvec.empty()) {
        return std::vector<std::vector<T> >();
    }

    const std::size_t row_count = ivecvec.size();
    const std::size_t column_count = ivecvec[0].size();
    for (std::size_t r = 1; r < row_count; ++r) {
        if (ivecvec[r].size() != column_count) {
            throw std::invalid_argument("InverseMatrix: rows have different lengths");
        }
    }

    // Spell out the inner vector: (count, int) only compiled through the
    // C++03 integral-dispatch rule and is ill-formed from C++11 on.
    std::vector<std::vector<T> > transposed(column_count, std::vector<T>(row_count));
    for (std::size_t c = 0; c < column_count; ++c) {
        for (std::size_t r = 0; r < row_count; ++r) {
            transposed[c][r] = ivecvec[r][c];
        }
    }
    return transposed;
}

int main()
{
    const char* const input_path = "e:\\test_data.txt";
    std::ifstream infile(input_path);
    if (!infile) {
        std::cerr << "cannot open " << input_path << '\n';
        return 1;
    }

    // Parse one table row per text line.
    std::vector<std::vector<double> > table;
    std::string text;
    while (std::getline(infile, text)) {
        std::stringstream fields(text);
        std::vector<double> row;
        double value;
        while (fields >> value) {
            row.push_back(value);
        }
        // Blank lines carry no data and would make the table ragged.
        if (!row.empty()) {
            table.push_back(row);
        }
    }

    try {
        std::cout << InverseMatrix(table);
    } catch (const std::exception& error) {
        std::cerr << error.what() << '\n';
        return 1;
    }
    return 0;
}
修改思路,终于搞定,就是麻烦点,算法效率低点吧,继续改进!
// Reads a whitespace-separated table of numbers from a text file (one sample
// per line, one attribute per column), min-max scales every column into
// [0, 1], and prints the scaled table to standard output.
#include <algorithm>
#include <cstddef>
#include <exception>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

/// Writes every element of `vec` to `os`, each followed by a single space.
template <class T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& vec)
{
    std::copy(vec.begin(), vec.end(), std::ostream_iterator<T>(os, " "));
    return os;
}

/// Writes a two-dimensional vector to `os`, one inner vector per line.
template <class T>
std::ostream& operator<<(std::ostream& os, const std::vector<std::vector<T> >& vec)
{
    // `typename` is required: the iterator type depends on T.
    typedef typename std::vector<std::vector<T> >::const_iterator RowIterator;
    for (RowIterator row = vec.begin(); row != vec.end(); ++row) {
        os << *row << '\n';
    }
    return os;
}

/// Min-max scales `data` into [0, 1].
///
/// @param minval  smallest value of the attribute
/// @param maxval  largest value of the attribute
/// @param data    value being scaled
/// @return (data - minval) / (maxval - minval), or T() when both bounds are
///         equal (a constant attribute has no range to scale over).
template <class T>
T normalization(T minval, T maxval, T data)
{
    const T range = maxval - minval;
    if (range == T()) {
        return T();
    }
    return (data - minval) / range;
}

/// Scales every column of `ivecvec` independently into [0, 1].
///
/// @param ivecvec  rectangular table (rows = samples, columns = attributes)
/// @return the scaled table; empty input is returned unchanged
/// @throws std::invalid_argument if the rows do not all have the same length
template <class T>
std::vector<std::vector<T> > get_vec_normalization(std::vector<std::vector<T> > ivecvec)
{
    if (ivecvec.empty()) {
        return ivecvec;
    }

    const std::size_t row_count = ivecvec.size();
    const std::size_t column_count = ivecvec[0].size();
    for (std::size_t r = 1; r < row_count; ++r) {
        if (ivecvec[r].size() != column_count) {
            throw std::invalid_argument("get_vec_normalization: rows have different lengths");
        }
    }

    for (std::size_t c = 0; c < column_count; ++c) {
        // Seed the extremes from the first sample. Seeding with 0 would report
        // a minimum of 0 for all-positive columns (and a maximum of 0 for
        // all-negative ones), which skews the scaling.
        T min_val = ivecvec[0][c];
        T max_val = ivecvec[0][c];
        for (std::size_t r = 1; r < row_count; ++r) {
            const T current = ivecvec[r][c];
            if (current < min_val) {
                min_val = current;
            } else if (max_val < current) {
                max_val = current;
            }
        }

        for (std::size_t r = 0; r < row_count; ++r) {
            ivecvec[r][c] = normalization(min_val, max_val, ivecvec[r][c]);
        }
    }
    return ivecvec;
}

int main()
{
    const char* const input_path = "e:\\train.txt";
    std::ifstream infile(input_path);
    if (!infile) {
        std::cerr << "cannot open " << input_path << '\n';
        return 1;
    }

    // Parse one sample per text line.
    std::vector<std::vector<double> > samples;
    std::string text;
    while (std::getline(infile, text)) {
        std::stringstream fields(text);
        std::vector<double> row;
        double value;
        while (fields >> value) {
            row.push_back(value);
        }
        // Blank lines carry no data and would make the table ragged.
        if (!row.empty()) {
            samples.push_back(row);
        }
    }

    try {
        std::cout << get_vec_normalization(samples);
    } catch (const std::exception& error) {
        std::cerr << error.what() << '\n';
        return 1;
    }
    return 0;
}
还是把结果存储到文件比较好,方便以后进行处理,继续改:
// Reads a whitespace-separated table of numbers from a text file (one sample
// per line, one attribute per column), min-max scales every column into
// [0, 1], saves the scaled table to a text file and also prints it.
#include <algorithm>
#include <cstddef>
#include <exception>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

/// Writes every element of `vec` to `os`, each followed by a single space.
template <class T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& vec)
{
    std::copy(vec.begin(), vec.end(), std::ostream_iterator<T>(os, " "));
    return os;
}

/// Writes a two-dimensional vector to `os`, one inner vector per line.
template <class T>
std::ostream& operator<<(std::ostream& os, const std::vector<std::vector<T> >& vec)
{
    // `typename` is required: the iterator type depends on T.
    typedef typename std::vector<std::vector<T> >::const_iterator RowIterator;
    for (RowIterator row = vec.begin(); row != vec.end(); ++row) {
        os << *row << '\n';
    }
    return os;
}

/// Min-max scales `data` into [0, 1].
///
/// @param minval  smallest value of the attribute
/// @param maxval  largest value of the attribute
/// @param data    value being scaled
/// @return (data - minval) / (maxval - minval), or T() when both bounds are
///         equal (a constant attribute has no range to scale over).
template <class T>
T normalization(T minval, T maxval, T data)
{
    const T range = maxval - minval;
    if (range == T()) {
        return T();
    }
    return (data - minval) / range;
}

/// Scales every column of `ivecvec` independently into [0, 1] and writes the
/// result to "e:\outfile.txt" (one row per line, values separated by spaces).
///
/// @param ivecvec  rectangular table (rows = samples, columns = attributes)
/// @return the scaled table
/// @throws std::invalid_argument if the rows do not all have the same length
/// @throws std::runtime_error    if the output file cannot be opened
template <class T>
std::vector<std::vector<T> > get_vec_normalization(std::vector<std::vector<T> > ivecvec)
{
    const std::size_t row_count = ivecvec.size();
    const std::size_t column_count = ivecvec.empty() ? 0 : ivecvec[0].size();

    // Validate before opening the output so that bad input does not truncate
    // a result file left by an earlier run.
    for (std::size_t r = 1; r < row_count; ++r) {
        if (ivecvec[r].size() != column_count) {
            throw std::invalid_argument("get_vec_normalization: rows have different lengths");
        }
    }

    std::ofstream outfile("e:\\outfile.txt");
    if (!outfile) {
        throw std::runtime_error("openfile error");
    }

    for (std::size_t c = 0; c < column_count; ++c) {
        // Seed the extremes from the first sample. Seeding with 0 would report
        // a minimum of 0 for all-positive columns (and a maximum of 0 for
        // all-negative ones), which skews the scaling.
        T min_val = ivecvec[0][c];
        T max_val = ivecvec[0][c];
        for (std::size_t r = 1; r < row_count; ++r) {
            const T current = ivecvec[r][c];
            if (current < min_val) {
                min_val = current;
            } else if (max_val < current) {
                max_val = current;
            }
        }

        for (std::size_t r = 0; r < row_count; ++r) {
            ivecvec[r][c] = normalization(min_val, max_val, ivecvec[r][c]);
        }
    }

    // Persist the scaled table in the same layout as the input file.
    for (std::size_t r = 0; r < row_count; ++r) {
        for (std::size_t c = 0; c < column_count; ++c) {
            outfile << ivecvec[r][c] << " ";
        }
        outfile << '\n';
    }
    outfile.close();
    return ivecvec;
}

int main()
{
    const char* const input_path = "e:\\train.txt";
    std::ifstream infile(input_path);
    if (!infile) {
        std::cerr << "cannot open " << input_path << '\n';
        return 1;
    }

    // Parse one sample per text line.
    std::vector<std::vector<double> > samples;
    std::string text;
    while (std::getline(infile, text)) {
        std::stringstream fields(text);
        std::vector<double> row;
        double value;
        while (fields >> value) {
            row.push_back(value);
        }
        // Blank lines carry no data and would make the table ragged.
        if (!row.empty()) {
            samples.push_back(row);
        }
    }

    try {
        std::cout << get_vec_normalization(samples);
    } catch (const std::exception& error) {
        std::cerr << error.what() << '\n';
        return 1;
    }
    return 0;
}
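根据公式 $y' = y_{lower} + (y_{upper} - y_{lower}) \cdot \dfrac{y - y_{min}}{y_{max} - y_{min}}$(即把属性值从 $[y_{min}, y_{max}]$ 线性映射到 $[y_{lower}, y_{upper}]$),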
对原来的代码进行修改:
// Reads a whitespace-separated table of numbers from a text file (one sample
// per line, one attribute per column), linearly scales every column into
// [-1, 1], saves the scaled table to a text file and also prints it.
#include <algorithm>
#include <cstddef>
#include <exception>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

/// Writes every element of `vec` to `os`, each followed by a single space.
template <class T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& vec)
{
    std::copy(vec.begin(), vec.end(), std::ostream_iterator<T>(os, " "));
    return os;
}

/// Writes a two-dimensional vector to `os`, one inner vector per line.
template <class T>
std::ostream& operator<<(std::ostream& os, const std::vector<std::vector<T> >& vec)
{
    // `typename` is required: the iterator type depends on T.
    typedef typename std::vector<std::vector<T> >::const_iterator RowIterator;
    for (RowIterator row = vec.begin(); row != vec.end(); ++row) {
        os << *row << '\n';
    }
    return os;
}

/// Linearly maps `value` from [y_min, y_max] onto [y_lower, y_upper].
///
/// @param y_min    smallest value of the attribute
/// @param y_max    largest value of the attribute
/// @param value    value being scaled
/// @param y_upper  upper bound of the target range
/// @param y_lower  lower bound of the target range
/// @return y_lower + (y_upper - y_lower) * (value - y_min) / (y_max - y_min);
///         the end points map exactly onto y_lower / y_upper, and a constant
///         attribute (y_min == y_max) maps onto y_lower.
template <class T>
T normalization(T y_min, T y_max, T value, int y_upper, int y_lower)
{
    // Handle the end points explicitly so they land exactly on the target
    // bounds; this also covers y_min == y_max for in-range values.
    if (value == y_min) {
        return y_lower;
    }
    if (value == y_max) {
        return y_upper;
    }
    // An empty source range cannot be scaled; avoid dividing by zero.
    if (y_max == y_min) {
        return y_lower;
    }
    return y_lower + (y_upper - y_lower) * (value - y_min) / (y_max - y_min);
}

/// Scales every column of `ivecvec` independently into [-1, 1] and writes the
/// result to "e:\outfile.txt" (one row per line, values separated by spaces).
///
/// @param ivecvec  rectangular table (rows = samples, columns = attributes)
/// @return the scaled table
/// @throws std::invalid_argument if the rows do not all have the same length
/// @throws std::runtime_error    if the output file cannot be opened
template <class T>
std::vector<std::vector<T> > get_vec_normalization(std::vector<std::vector<T> > ivecvec)
{
    const std::size_t row_count = ivecvec.size();
    const std::size_t column_count = ivecvec.empty() ? 0 : ivecvec[0].size();

    // Validate before opening the output so that bad input does not truncate
    // a result file left by an earlier run.
    for (std::size_t r = 1; r < row_count; ++r) {
        if (ivecvec[r].size() != column_count) {
            throw std::invalid_argument("get_vec_normalization: rows have different lengths");
        }
    }

    std::ofstream outfile("e:\\outfile.txt");
    if (!outfile) {
        throw std::runtime_error("openfile error");
    }

    for (std::size_t c = 0; c < column_count; ++c) {
        // Seed the extremes from the first sample. Seeding with 0 would report
        // a minimum of 0 for all-positive columns (and a maximum of 0 for
        // all-negative ones), which skews the scaling.
        T min_val = ivecvec[0][c];
        T max_val = ivecvec[0][c];
        for (std::size_t r = 1; r < row_count; ++r) {
            const T current = ivecvec[r][c];
            if (current < min_val) {
                min_val = current;
            } else if (max_val < current) {
                max_val = current;
            }
        }

        for (std::size_t r = 0; r < row_count; ++r) {
            ivecvec[r][c] = normalization(min_val, max_val, ivecvec[r][c], 1, -1);
        }
    }

    // Persist the scaled table in the same layout as the input file.
    for (std::size_t r = 0; r < row_count; ++r) {
        for (std::size_t c = 0; c < column_count; ++c) {
            outfile << ivecvec[r][c] << " ";
        }
        outfile << '\n';
    }
    outfile.close();
    return ivecvec;
}

int main()
{
    const char* const input_path = "e:\\features.txt";
    std::ifstream infile(input_path);
    if (!infile) {
        std::cerr << "cannot open " << input_path << '\n';
        return 1;
    }

    // Parse one sample per text line.
    std::vector<std::vector<double> > samples;
    std::string text;
    while (std::getline(infile, text)) {
        std::stringstream fields(text);
        std::vector<double> row;
        double value;
        while (fields >> value) {
            row.push_back(value);
        }
        // Blank lines carry no data and would make the table ragged.
        if (!row.empty()) {
            samples.push_back(row);
        }
    }

    try {
        std::cout << get_vec_normalization(samples);
    } catch (const std::exception& error) {
        std::cerr << error.what() << '\n';
        return 1;
    }
    return 0;
}