归一化问题

原理

 对数据进行缩放的原因和使用神经网络时的考虑是一样的:RBF网络(RBF核)是根据样本数据之间的欧氏距离来计算的。缩放的主要优点是避免数值范围较大的属性支配数值范围较小的属性;另一个优点是避免计算时出现数值困难(numerical difficulties),因为核值通常依赖特征向量的内积(inner product),而较大的属性值可能导致数值问题。因此推荐把每个属性线性缩放到[-1, 1]或者[0, 1]之间,而且前一个范围要比后一个好,即对列向量进行规范化,其详细解释和计算公式见 http://www.faqs.org/faqs/ai-faq/neural-nets/part2/ 中的“Should I standardize the input variables (column vectors)?”。libsvm中没有考虑属性的类型(效益、成本、固定、偏离、区间、偏离区间 6 种不同属性类型的规范化计算公式是不一样的,详见:徐泽水,《不确定多属性决策方法及应用》,清华大学出版社,2004),而是采用了统一的线性缩放,作者以为此处可以改进一下。

    需要注意的是,在进行测试之前,要对测试数据进行同样的缩放操作,即使用由训练数据得到的同一组缩放参数(每个属性的最小值和最大值)。其实libsvm中已经自带了进行缩放操作的程序(svmscale.exe)。

    上面这两种方法基本上可以完成所有的样本的预处理了。

解决方法:

源:A[]
结果:B[]
A的最大最小值 MaxVal,MinVal
B中希望的最大最小值 MaxOut,MinOut

循环
{
  B[] = MinOut + (MaxOut - MinOut) * (A[] - MinVal) / (MaxVal - MinVal);

}

思路:准备把数据从txt读入vector二维数组进行处理!

一点想法,想保存起来:

主要是得到了转置矩阵,不过后来想想貌似不起什么作用:

#include <iostream>
#include <string>
#include <fstream>
#include <sstream>
#include <exception>
#include <stdexcept>
#include<vector>
using namespace std;
// Writes every element of `vec` to `os`, each one followed by a single space
// (so a non-empty vector produces a trailing space), and returns `os`.
//
// The vector is taken by const reference so printing does not copy it, and a
// plain iterator loop replaces copy/ostream_iterator so the function no longer
// depends on <algorithm> and <iterator>, which this listing never includes.
template <class T>
std::ostream& operator << (std::ostream& os, const std::vector<T>& vec)
{
	for (typename std::vector<T>::const_iterator it = vec.begin(); it != vec.end(); ++it)
	{
		os << *it << " ";
	}
	return os;
}
// Prints a vector of vectors as a matrix: one inner vector per output line,
// every element followed by a single space, each line ended with std::endl.
// Returns `os`.
//
// `typename` is required in front of the dependent iterator types; without it
// standard-conforming compilers reject the template. The matrix is taken by
// const reference instead of being deep-copied on every call.
template <class T>
std::ostream& operator << (std::ostream& os, const std::vector<std::vector<T> >& vec)
{
	typedef typename std::vector<std::vector<T> >::const_iterator RowIter;
	typedef typename std::vector<T>::const_iterator CellIter;
	for (RowIter row = vec.begin(); row != vec.end(); ++row)
	{
		for (CellIter cell = row->begin(); cell != row->end(); ++cell)
		{
			os << *cell << " ";
		}
		os << std::endl;
	}
	return os;
}
// Min-max scales the sample pointed to by `data` using the column extrema
// pointed to by `minval` and `maxval`:
//     (*data - *minval) / (*maxval - *minval)
// All three pointers must be non-null.
//
// The original divided by `maxval - minval`, i.e. by the distance between the
// two pointers rather than between the values they point to. When the column
// is constant (*maxval == *minval) the result is T() instead of a division by
// zero.
template <class T>
T normalization(T *minval,T *maxval,int *data)
{
	const T range = *maxval - *minval;
	if (range == T())
		return T();
	return (*data - *minval) / range;
}
// Returns the transpose of `ivecvec` (despite the name, no matrix inverse is
// computed): element [r][c] of the input becomes element [c][r] of the result.
//
// - An empty input yields an empty result.
// - Throws std::invalid_argument if the rows do not all have the same length
//   as the first row, instead of indexing out of bounds.
//
// The result is built with an explicit inner vector: the original
// `vector<vector<T> >(line, row)` relied on an int -> vector conversion that
// is no longer accepted since C++11.
template <class T>
std::vector<std::vector<T> > InverseMatrix(const std::vector<std::vector<T> >& ivecvec)
{
	typedef typename std::vector<T>::size_type Index;

	if (ivecvec.empty())
		return std::vector<std::vector<T> >();

	// Number of rows of the input
	const Index rows = ivecvec.size();
	// Number of columns of the input, taken from the first row
	const Index cols = ivecvec[0].size();

	std::vector<std::vector<T> > transposed(cols, std::vector<T>(rows));
	for (Index r = 0; r < rows; ++r)
	{
		if (ivecvec[r].size() != cols)
			throw std::invalid_argument("InverseMatrix: rows have different lengths");
		for (Index c = 0; c < cols; ++c)
		{
			transposed[c][r] = ivecvec[r][c];
		}
	}
	return transposed;
}
// Reads a whitespace-separated numeric table from e:\test_data.txt (one row
// per line) and prints its transpose to standard output.
//
// Returns 0 on success and 1 when the file cannot be opened, contains no
// numbers, or its rows do not all have the same number of columns.
int main()
{
	std::ifstream infile("e:\\test_data.txt");
	if (!infile)
	{
		std::cerr << "cannot open e:\\test_data.txt" << std::endl;
		return 1;
	}

	std::vector<std::vector<double> > ivecvec;
	std::string temp;
	while (std::getline(infile, temp))
	{
		std::stringstream line(temp);
		std::vector<double> ivec;
		double a;
		while (line >> a)
		{
			ivec.push_back(a);
		}
		// A blank line yields no numbers; storing it would make the table ragged.
		if (!ivec.empty())
			ivecvec.push_back(ivec);
	}

	if (ivecvec.empty())
	{
		std::cerr << "no data read from e:\\test_data.txt" << std::endl;
		return 1;
	}
	// The transpose indexes every row with the column count of the first row,
	// so all rows must have that same length.
	for (std::vector<std::vector<double> >::size_type r = 1; r < ivecvec.size(); ++r)
	{
		if (ivecvec[r].size() != ivecvec[0].size())
		{
			std::cerr << "rows have different lengths in e:\\test_data.txt" << std::endl;
			return 1;
		}
	}

	std::cout << InverseMatrix(ivecvec);
	return 0;
}

 

  修改思路,终于搞定,就是麻烦点,算法效率低点吧,继续改进!

#include <iostream>
#include <string>
#include <fstream>
#include <sstream>
#include <exception>
#include <stdexcept>
#include<vector>
using namespace std;
// Writes every element of `vec` to `os`, each one followed by a single space
// (so a non-empty vector produces a trailing space), and returns `os`.
//
// Taking the vector by const reference avoids copying it for each print, and
// the explicit loop removes the dependency on copy/ostream_iterator, whose
// headers (<algorithm>, <iterator>) this listing never includes.
template <class T>
std::ostream& operator << (std::ostream& os, const std::vector<T>& vec)
{
	typedef typename std::vector<T>::const_iterator Iter;
	for (Iter it = vec.begin(); it != vec.end(); ++it)
	{
		os << *it << " ";
	}
	return os;
}
// Prints a vector of vectors as a matrix: one inner vector per output line,
// every element followed by a single space, each line ended with std::endl.
// Returns `os`.
//
// `typename` is required in front of the dependent iterator types; without it
// standard-conforming compilers reject the template. The matrix is taken by
// const reference instead of being deep-copied on every call.
template <class T>
std::ostream& operator << (std::ostream& os, const std::vector<std::vector<T> >& vec)
{
	typedef typename std::vector<std::vector<T> >::const_iterator RowIter;
	typedef typename std::vector<T>::const_iterator CellIter;
	for (RowIter row = vec.begin(); row != vec.end(); ++row)
	{
		for (CellIter cell = row->begin(); cell != row->end(); ++cell)
		{
			os << *cell << " ";
		}
		os << std::endl;
	}
	return os;
}
// Min-max scales `data` into [0, 1] given the column extrema:
//     (data - minval) / (maxval - minval)
// so `minval` maps to 0 and `maxval` maps to 1.
//
// When the column is constant (maxval == minval) the result is T() rather
// than the outcome of a division by zero.
template <class T>
T normalization(T minval,T maxval,T data)
{
	const T range = maxval - minval;
	if (range == T())
		return T();
	return (data - minval) / range;
}
// Min-max normalizes every column of `ivecvec` into [0, 1] and returns the
// scaled table: each cell becomes (cell - column_min) / (column_max - column_min).
//
// - An empty table is returned unchanged.
// - A constant column (max == min) is mapped to T() instead of dividing by zero.
// - Throws std::invalid_argument if the rows do not all have the same length
//   as the first row.
//
// The column extrema are seeded from the first row. Seeding them with 0, as
// the original did, reports a minimum of 0 for an all-positive column and a
// maximum of 0 for an all-negative one. The scaling is done inline so that the
// range is computed once per column.
template <class T>
std::vector<std::vector<T> > get_vec_normalization(std::vector<std::vector<T> > ivecvec)
{
	typedef typename std::vector<T>::size_type Index;

	if (ivecvec.empty())
		return ivecvec;

	// Number of rows
	const Index rows = ivecvec.size();
	// Number of columns, taken from the first row
	const Index cols = ivecvec[0].size();
	for (Index r = 1; r < rows; ++r)
	{
		if (ivecvec[r].size() != cols)
			throw std::invalid_argument("get_vec_normalization: rows have different lengths");
	}

	for (Index c = 0; c < cols; ++c)
	{
		// Find the extrema of this column
		T min_val = ivecvec[0][c];
		T max_val = min_val;
		for (Index r = 1; r < rows; ++r)
		{
			const T& cell = ivecvec[r][c];
			if (cell < min_val)
				min_val = cell;
			if (max_val < cell)
				max_val = cell;
		}

		// Scale the column
		const T range = max_val - min_val;
		for (Index r = 0; r < rows; ++r)
		{
			if (range == T())
				ivecvec[r][c] = T();
			else
				ivecvec[r][c] = (ivecvec[r][c] - min_val) / range;
		}
	}
	return ivecvec;
}

// Reads a whitespace-separated numeric table from e:\train.txt (one row per
// line), normalizes it with get_vec_normalization and prints the result to
// standard output.
//
// Returns 0 on success and 1 when the file cannot be opened, contains no
// numbers, or its rows do not all have the same number of columns.
int main()
{
	std::ifstream infile("e:\\train.txt");
	if (!infile)
	{
		std::cerr << "cannot open e:\\train.txt" << std::endl;
		return 1;
	}

	std::vector<std::vector<double> > ivecvec;
	std::string temp;
	while (std::getline(infile, temp))
	{
		std::stringstream line(temp);
		std::vector<double> ivec;
		double a;
		while (line >> a)
		{
			ivec.push_back(a);
		}
		// A blank line yields no numbers; storing it would make the table ragged.
		if (!ivec.empty())
			ivecvec.push_back(ivec);
	}

	if (ivecvec.empty())
	{
		std::cerr << "no data read from e:\\train.txt" << std::endl;
		return 1;
	}
	// Column-wise processing needs every row to be as long as the first one.
	for (std::vector<std::vector<double> >::size_type r = 1; r < ivecvec.size(); ++r)
	{
		if (ivecvec[r].size() != ivecvec[0].size())
		{
			std::cerr << "rows have different lengths in e:\\train.txt" << std::endl;
			return 1;
		}
	}

	std::cout << get_vec_normalization(ivecvec);
	return 0;
}

  

还是把结果存储到文件里,便于以后进行处理,继续改:

 

#include <iostream>
#include <string>
#include <fstream>
#include <sstream>
#include <exception>
#include <stdexcept>
#include<vector>
using namespace std;
// Writes every element of `vec` to `os`, each one followed by a single space
// (so a non-empty vector produces a trailing space), and returns `os`.
//
// The vector is taken by const reference so printing does not copy it, and a
// plain iterator loop replaces copy/ostream_iterator so the function no longer
// depends on <algorithm> and <iterator>, which this listing never includes.
template <class T>
std::ostream& operator << (std::ostream& os, const std::vector<T>& vec)
{
	for (typename std::vector<T>::const_iterator it = vec.begin(); it != vec.end(); ++it)
	{
		os << *it << " ";
	}
	return os;
}
// Prints a vector of vectors as a matrix: one inner vector per output line,
// every element followed by a single space, each line ended with std::endl.
// Returns `os`.
//
// `typename` is required in front of the dependent iterator types; without it
// standard-conforming compilers reject the template. The matrix is taken by
// const reference instead of being deep-copied on every call.
template <class T>
std::ostream& operator << (std::ostream& os, const std::vector<std::vector<T> >& vec)
{
	typedef typename std::vector<std::vector<T> >::const_iterator RowIter;
	typedef typename std::vector<T>::const_iterator CellIter;
	for (RowIter row = vec.begin(); row != vec.end(); ++row)
	{
		for (CellIter cell = row->begin(); cell != row->end(); ++cell)
		{
			os << *cell << " ";
		}
		os << std::endl;
	}
	return os;
}
// Min-max scales `data` into [0, 1] given the column extrema:
//     (data - minval) / (maxval - minval)
// so `minval` maps to 0 and `maxval` maps to 1.
//
// A constant column (maxval == minval) would make the denominator zero; in
// that case T() is returned instead of performing the division.
template <class T>
T normalization(T minval,T maxval,T data)
{
	const T span = maxval - minval;
	return (span == T()) ? T() : (data - minval) / span;
}
// Min-max normalizes every column of `ivecvec` into [0, 1], writes the scaled
// table to `out_path` (one row per line, every value followed by a space) and
// returns it. Each cell becomes (cell - column_min) / (column_max - column_min).
//
// Parameters:
//   ivecvec  - table to normalize, one inner vector per row.
//   out_path - file to write; defaults to the path the original hard-coded.
//
// - An empty table produces an empty output file and is returned unchanged.
// - A constant column (max == min) is mapped to T() instead of dividing by zero.
// - Throws std::invalid_argument if the rows do not all have the same length
//   as the first row (checked before the output file is touched).
// - Throws std::runtime_error if the output file cannot be opened or written.
//
// The column extrema are seeded from the first row. Seeding them with 0, as
// the original did, reports a minimum of 0 for an all-positive column and a
// maximum of 0 for an all-negative one.
template <class T>
std::vector<std::vector<T> > get_vec_normalization(std::vector<std::vector<T> > ivecvec,
                                                   const char* out_path = "e:\\outfile.txt")
{
	typedef typename std::vector<T>::size_type Index;

	// Number of rows
	const Index rows = ivecvec.size();
	// Number of columns, taken from the first row (0 for an empty table)
	const Index cols = ivecvec.empty() ? 0 : ivecvec[0].size();
	for (Index r = 1; r < rows; ++r)
	{
		if (ivecvec[r].size() != cols)
			throw std::invalid_argument("get_vec_normalization: rows have different lengths");
	}

	std::ofstream outfile(out_path);
	if(!outfile)
		throw std::runtime_error("openfile error");

	for (Index c = 0; c < cols; ++c)
	{
		// Find the extrema of this column
		T min_val = ivecvec[0][c];
		T max_val = min_val;
		for (Index r = 1; r < rows; ++r)
		{
			const T& cell = ivecvec[r][c];
			if (cell < min_val)
				min_val = cell;
			if (max_val < cell)
				max_val = cell;
		}

		// Scale the column
		const T range = max_val - min_val;
		for (Index r = 0; r < rows; ++r)
		{
			if (range == T())
				ivecvec[r][c] = T();
			else
				ivecvec[r][c] = (ivecvec[r][c] - min_val) / range;
		}
	}

	// Write the normalized table row by row
	for (Index r = 0; r < rows; ++r)
	{
		for (Index c = 0; c < cols; ++c)
		{
			outfile << ivecvec[r][c] << " ";
		}
		outfile << '\n';
	}
	outfile.flush();
	if(!outfile)
		throw std::runtime_error("writefile error");
	return ivecvec;
}

// Reads a whitespace-separated numeric table from e:\train.txt (one row per
// line), normalizes it with get_vec_normalization (which also writes the
// result to a file) and prints the normalized table to standard output.
//
// Returns 0 on success and 1 when the input cannot be opened, contains no
// numbers, has rows of different lengths, or normalization throws.
int main()
{
	std::ifstream infile("e:\\train.txt");
	if (!infile)
	{
		std::cerr << "cannot open e:\\train.txt" << std::endl;
		return 1;
	}

	std::vector<std::vector<double> > ivecvec;
	std::string temp;
	while (std::getline(infile, temp))
	{
		std::stringstream line(temp);
		std::vector<double> ivec;
		double a;
		while (line >> a)
		{
			ivec.push_back(a);
		}
		// A blank line yields no numbers; storing it would make the table ragged.
		if (!ivec.empty())
			ivecvec.push_back(ivec);
	}

	if (ivecvec.empty())
	{
		std::cerr << "no data read from e:\\train.txt" << std::endl;
		return 1;
	}
	// Column-wise processing needs every row to be as long as the first one.
	for (std::vector<std::vector<double> >::size_type r = 1; r < ivecvec.size(); ++r)
	{
		if (ivecvec[r].size() != ivecvec[0].size())
		{
			std::cerr << "rows have different lengths in e:\\train.txt" << std::endl;
			return 1;
		}
	}

	try
	{
		std::cout << get_vec_normalization(ivecvec);
	}
	catch (const std::exception& e)
	{
		// get_vec_normalization throws when its output file cannot be opened.
		std::cerr << e.what() << std::endl;
		return 1;
	}
	return 0;
}

 

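  根据公式 $y' = \mathrm{lower} + (\mathrm{upper} - \mathrm{lower}) \cdot \dfrac{y - y_{\min}}{y_{\max} - y_{\min}}$(其中 lower = -1,upper = 1;当 $y = y_{\min}$ 时取 lower,当 $y = y_{\max}$ 时取 upper)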

对原来的代码进行修改:

 

#include <iostream>
#include <string>
#include <fstream>
#include <sstream>
#include <exception>
#include <stdexcept>
#include<vector>
using namespace std;
// Writes every element of `vec` to `os`, each one followed by a single space
// (so a non-empty vector produces a trailing space), and returns `os`.
//
// Taking the vector by const reference avoids copying it for each print, and
// the explicit loop removes the dependency on copy/ostream_iterator, whose
// headers (<algorithm>, <iterator>) this listing never includes.
template <class T>
std::ostream& operator << (std::ostream& os, const std::vector<T>& vec)
{
	typedef typename std::vector<T>::const_iterator Iter;
	for (Iter it = vec.begin(); it != vec.end(); ++it)
	{
		os << *it << " ";
	}
	return os;
}
// Prints a vector of vectors as a matrix: one inner vector per output line,
// every element followed by a single space, each line ended with std::endl.
// Returns `os`.
//
// `typename` is required in front of the dependent iterator types; without it
// standard-conforming compilers reject the template. The matrix is taken by
// const reference instead of being deep-copied on every call.
template <class T>
std::ostream& operator << (std::ostream& os, const std::vector<std::vector<T> >& vec)
{
	typedef typename std::vector<std::vector<T> >::const_iterator RowIter;
	typedef typename std::vector<T>::const_iterator CellIter;
	for (RowIter row = vec.begin(); row != vec.end(); ++row)
	{
		for (CellIter cell = row->begin(); cell != row->end(); ++cell)
		{
			os << *cell << " ";
		}
		os << std::endl;
	}
	return os;
}
// Linearly maps `value` from the source interval [y_min, y_max] onto the
// target interval [y_lower, y_upper]:
//     y_lower + (y_upper - y_lower) * (value - y_min) / (y_max - y_min)
// Note the argument order: the upper target bound comes before the lower one.
template <class T>
T normalization(T y_min,T y_max,T value,int y_upper,int y_lower)
{
	// The two end points are answered directly, so they land exactly on the
	// target bounds and the division is skipped for them.
	if (value == y_min)
		return y_lower;
	if (value == y_max)
		return y_upper;
	return y_lower + (y_upper-y_lower) * (value - y_min)/(y_max-y_min);
}
// Linearly rescales every column of `ivecvec` into [y_lower, y_upper], writes
// the scaled table to `out_path` (one row per line, every value followed by a
// space) and returns it. Within a column:
//     cell == column_min -> y_lower
//     cell == column_max -> y_upper
//     otherwise          -> y_lower + (y_upper - y_lower) * (cell - min) / (max - min)
//
// Parameters:
//   ivecvec  - table to normalize, one inner vector per row.
//   y_upper  - upper target bound (default 1, the value the original hard-coded).
//   y_lower  - lower target bound (default -1, the value the original hard-coded).
//   out_path - file to write; defaults to the path the original hard-coded.
//
// - An empty table produces an empty output file and is returned unchanged.
// - A constant column maps to y_lower: every cell equals the column minimum,
//   so the division is never reached.
// - Throws std::invalid_argument if the rows do not all have the same length
//   as the first row (checked before the output file is touched).
// - Throws std::runtime_error if the output file cannot be opened or written.
//
// The column extrema are seeded from the first row. Seeding them with 0, as
// the original did, reports a minimum of 0 for an all-positive column and a
// maximum of 0 for an all-negative one.
template <class T>
std::vector<std::vector<T> > get_vec_normalization(std::vector<std::vector<T> > ivecvec,
                                                   int y_upper = 1,
                                                   int y_lower = -1,
                                                   const char* out_path = "e:\\outfile.txt")
{
	typedef typename std::vector<T>::size_type Index;

	// Number of rows
	const Index rows = ivecvec.size();
	// Number of columns, taken from the first row (0 for an empty table)
	const Index cols = ivecvec.empty() ? 0 : ivecvec[0].size();
	for (Index r = 1; r < rows; ++r)
	{
		if (ivecvec[r].size() != cols)
			throw std::invalid_argument("get_vec_normalization: rows have different lengths");
	}

	std::ofstream outfile(out_path);
	if(!outfile)
		throw std::runtime_error("openfile error");

	for (Index c = 0; c < cols; ++c)
	{
		// Find the extrema of this column
		T min_val = ivecvec[0][c];
		T max_val = min_val;
		for (Index r = 1; r < rows; ++r)
		{
			const T& cell = ivecvec[r][c];
			if (cell < min_val)
				min_val = cell;
			if (max_val < cell)
				max_val = cell;
		}

		// Scale the column
		for (Index r = 0; r < rows; ++r)
		{
			T& cell = ivecvec[r][c];
			if (cell == min_val)
				cell = y_lower;
			else if (cell == max_val)
				cell = y_upper;
			else
				cell = y_lower + (y_upper - y_lower) * (cell - min_val) / (max_val - min_val);
		}
	}

	// Write the normalized table row by row
	for (Index r = 0; r < rows; ++r)
	{
		for (Index c = 0; c < cols; ++c)
		{
			outfile << ivecvec[r][c] << " ";
		}
		outfile << '\n';
	}
	outfile.flush();
	if(!outfile)
		throw std::runtime_error("writefile error");
	return ivecvec;
}

// Reads a whitespace-separated numeric table from e:\features.txt (one row
// per line), rescales it with get_vec_normalization (which also writes the
// result to a file) and prints the rescaled table to standard output.
//
// Returns 0 on success and 1 when the input cannot be opened, contains no
// numbers, has rows of different lengths, or normalization throws.
int main()
{
	std::ifstream infile("e:\\features.txt");
	if (!infile)
	{
		std::cerr << "cannot open e:\\features.txt" << std::endl;
		return 1;
	}

	std::vector<std::vector<double> > ivecvec;
	std::string temp;
	while (std::getline(infile, temp))
	{
		std::stringstream line(temp);
		std::vector<double> ivec;
		double a;
		while (line >> a)
		{
			ivec.push_back(a);
		}
		// A blank line yields no numbers; storing it would make the table ragged.
		if (!ivec.empty())
			ivecvec.push_back(ivec);
	}

	if (ivecvec.empty())
	{
		std::cerr << "no data read from e:\\features.txt" << std::endl;
		return 1;
	}
	// Column-wise processing needs every row to be as long as the first one.
	for (std::vector<std::vector<double> >::size_type r = 1; r < ivecvec.size(); ++r)
	{
		if (ivecvec[r].size() != ivecvec[0].size())
		{
			std::cerr << "rows have different lengths in e:\\features.txt" << std::endl;
			return 1;
		}
	}

	try
	{
		std::cout << get_vec_normalization(ivecvec);
	}
	catch (const std::exception& e)
	{
		// get_vec_normalization throws when its output file cannot be opened.
		std::cerr << e.what() << std::endl;
		return 1;
	}
	return 0;
}

 

  


posted @ 2011-09-03 15:16  hailong  阅读(2970)  评论(0编辑  收藏  举报