LaplacianScore计算特征得分,对特征进行选择
1、
clear; load iris.dat fea = iris(:,1:4);%最后一列是label,这里只需要特征,我们把特征拿出来单独成一个矩阵。每一行表示一个数据 fea = NormalizeFea(fea);% %Normalize each data vector to have L2-norm equal to 1 W = constructW(fea); Y = LaplacianScore(fea, W);%Y: Vector of (1-LaplacianScore) for each feature,The features with larger y are more important. [dump,idx] = sort(-Y);
2、
function fea = NormalizeFea(fea,row) % if row == 1, normalize each row of fea to have unit norm; % if row == 0, normalize each column of fea to have unit norm; % % version 3.0 --Jan/2012 % version 2.0 --Jan/2012 % version 1.0 --Oct/2003 % % Written by Deng Cai (dengcai AT gmail.com) % if ~exist('row','var') row = 1; end if row nSmp = size(fea,1); feaNorm = max(1e-14,full(sum(fea.^2,2))); fea = spdiags(feaNorm.^-.5,0,nSmp,nSmp)*fea; else nSmp = size(fea,2); feaNorm = max(1e-14,full(sum(fea.^2,1))'); fea = fea*spdiags(feaNorm.^-.5,0,nSmp,nSmp); end return; if row [nSmp, mFea] = size(fea); if issparse(fea) fea2 = fea'; feaNorm = mynorm(fea2,1); for i = 1:nSmp fea2(:,i) = fea2(:,i) ./ max(1e-10,feaNorm(i)); end fea = fea2'; else feaNorm = sum(fea.^2,2).^.5; fea = fea./feaNorm(:,ones(1,mFea)); end else [mFea, nSmp] = size(fea); if issparse(fea) feaNorm = mynorm(fea,1); for i = 1:nSmp fea(:,i) = fea(:,i) ./ max(1e-10,feaNorm(i)); end else feaNorm = sum(fea.^2,1).^.5; fea = fea./feaNorm(ones(1,mFea),:); end end
3、
function [Y] = LaplacianScore(X, W) % Usage: % [Y] = LaplacianScore(X, W) % % X: Rows of vectors of data points % W: The affinity matrix. % Y: Vector of (1-LaplacianScore) for each feature. % The features with larger y are more important. % % Examples: % % fea = rand(50,70); % options = []; % options.Metric = 'Cosine'; % options.NeighborMode = 'KNN'; % options.k = 5; % options.WeightMode = 'Cosine'; % W = constructW(fea,options); % % LaplacianScore = LaplacianScore(fea,W); % [junk, index] = sort(-LaplacianScore); % % newfea = fea(:,index); % %the features in newfea will be sorted based on their importance. % % Type "LaplacianScore" for a self-demo. % % See also constructW % %Reference: % % Xiaofei He, Deng Cai and Partha Niyogi, "Laplacian Score for Feature Selection". % Advances in Neural Information Processing Systems 18 (NIPS 2005), % Vancouver, Canada, 2005. % % Deng Cai, 2004/08 if nargin == 0, selfdemo; return; end [nSmp,nFea] = size(X); if size(W,1) ~= nSmp error('W is error'); end D = full(sum(W,2)); L = W; allone = ones(nSmp,1); tmp1 = D'*X; D = sparse(1:nSmp,1:nSmp,D,nSmp,nSmp); DPrime = sum((X'*D)'.*X)-tmp1.*tmp1/sum(diag(D)); LPrime = sum((X'*L)'.*X)-tmp1.*tmp1/sum(diag(D)); DPrime(find(DPrime < 1e-12)) = 10000; Y = LPrime./DPrime; Y = Y'; Y = full(Y);
4、
function D = EuDist2(fea_a,fea_b,bSqrt) %EUDIST2 Efficiently Compute the Euclidean Distance Matrix by Exploring the %Matlab matrix operations. % % D = EuDist(fea_a,fea_b) % fea_a: nSample_a * nFeature % fea_b: nSample_b * nFeature % D: nSample_a * nSample_a % or nSample_a * nSample_b % % Examples: % % a = rand(500,10); % b = rand(1000,10); % % A = EuDist2(a); % A: 500*500 % D = EuDist2(a,b); % D: 500*1000 % % version 2.1 --November/2011 % version 2.0 --May/2009 % version 1.0 --November/2005 % % Written by Deng Cai (dengcai AT gmail.com) if ~exist('bSqrt','var') bSqrt = 1; end if (~exist('fea_b','var')) || isempty(fea_b) aa = sum(fea_a.*fea_a,2); ab = fea_a*fea_a'; if issparse(aa) aa = full(aa); end D = bsxfun(@plus,aa,aa') - 2*ab; D(D<0) = 0; if bSqrt D = sqrt(D); end D = max(D,D'); else aa = sum(fea_a.*fea_a,2); bb = sum(fea_b.*fea_b,2); ab = fea_a*fea_b'; if issparse(aa) aa = full(aa); bb = full(bb); end D = bsxfun(@plus,aa,bb') - 2*ab; D(D<0) = 0; if bSqrt D = sqrt(D); end end
5、
function W = constructW(fea,options) % Usage: % W = constructW(fea,options) % % fea: Rows of vectors of data points. Each row is x_i % options: Struct value in Matlab. The fields in options that can be set: % % NeighborMode - Indicates how to construct the graph. Choices % are: [Default 'KNN'] % 'KNN' - k = 0 % Complete graph % k > 0 % Put an edge between two nodes if and % only if they are among the k nearst % neighbors of each other. You are % required to provide the parameter k in % the options. Default k=5. % 'Supervised' - k = 0 % Put an edge between two nodes if and % only if they belong to same class. % k > 0 % Put an edge between two nodes if % they belong to same class and they % are among the k nearst neighbors of % each other. % Default: k=0 % You are required to provide the label % information gnd in the options. % % WeightMode - Indicates how to assign weights for each edge % in the graph. Choices are: % 'Binary' - 0-1 weighting. Every edge receiveds weight % of 1. % 'HeatKernel' - If nodes i and j are connected, put weight % W_ij = exp(-norm(x_i - x_j)/2t^2). You are % required to provide the parameter t. [Default One] % 'Cosine' - If nodes i and j are connected, put weight % cosine(x_i,x_j). % % k - The parameter needed under 'KNN' NeighborMode. % Default will be 5. % gnd - The parameter needed under 'Supervised' % NeighborMode. Colunm vector of the label % information for each data point. % bLDA - 0 or 1. Only effective under 'Supervised' % NeighborMode. If 1, the graph will be constructed % to make LPP exactly same as LDA. Default will be % 0. % t - The parameter needed under 'HeatKernel' % WeightMode. Default will be 1 % bNormalized - 0 or 1. Only effective under 'Cosine' WeightMode. % Indicates whether the fea are already be % normalized to 1. Default will be 0 % bSelfConnected - 0 or 1. Indicates whether W(i,i) == 1. Default 0 % if 'Supervised' NeighborMode & bLDA == 1, % bSelfConnected will always be 1. Default 0. % bTrueKNN - 0 or 1. If 1, will construct a truly kNN graph % (Not symmetric!). Default will be 0. Only valid % for 'KNN' NeighborMode % % % Examples: % % fea = rand(50,15); % options = []; % options.NeighborMode = 'KNN'; % options.k = 5; % options.WeightMode = 'HeatKernel'; % options.t = 1; % W = constructW(fea,options); % % % fea = rand(50,15); % gnd = [ones(10,1);ones(15,1)*2;ones(10,1)*3;ones(15,1)*4]; % options = []; % options.NeighborMode = 'Supervised'; % options.gnd = gnd; % options.WeightMode = 'HeatKernel'; % options.t = 1; % W = constructW(fea,options); % % % fea = rand(50,15); % gnd = [ones(10,1);ones(15,1)*2;ones(10,1)*3;ones(15,1)*4]; % options = []; % options.NeighborMode = 'Supervised'; % options.gnd = gnd; % options.bLDA = 1; % W = constructW(fea,options); % % % For more details about the different ways to construct the W, please % refer: % Deng Cai, Xiaofei He and Jiawei Han, "Document Clustering Using % Locality Preserving Indexing" IEEE TKDE, Dec. 2005. % % % Written by Deng Cai (dengcai2 AT cs.uiuc.edu), April/2004, Feb/2006, % May/2007 % bSpeed = 1; if (~exist('options','var')) options = []; end if isfield(options,'Metric') warning('This function has been changed and the Metric is no longer be supported'); end if ~isfield(options,'bNormalized') options.bNormalized = 0; end %================================================= if ~isfield(options,'NeighborMode') options.NeighborMode = 'KNN'; end switch lower(options.NeighborMode) case {lower('KNN')} %For simplicity, we include the data point itself in the kNN if ~isfield(options,'k') options.k = 5; end case {lower('Supervised')} if ~isfield(options,'bLDA') options.bLDA = 0; end if options.bLDA options.bSelfConnected = 1; end if ~isfield(options,'k') options.k = 0; end if ~isfield(options,'gnd') error('Label(gnd) should be provided under ''Supervised'' NeighborMode!'); end if ~isempty(fea) && length(options.gnd) ~= size(fea,1) error('gnd doesn''t match with fea!'); end otherwise error('NeighborMode does not exist!'); end %================================================= if ~isfield(options,'WeightMode') options.WeightMode = 'HeatKernel'; end bBinary = 0; bCosine = 0; switch lower(options.WeightMode) case {lower('Binary')} bBinary = 1; case {lower('HeatKernel')} if ~isfield(options,'t') nSmp = size(fea,1); if nSmp > 3000 D = EuDist2(fea(randsample(nSmp,3000),:)); else D = EuDist2(fea); end options.t = mean(mean(D)); end case {lower('Cosine')} bCosine = 1; otherwise error('WeightMode does not exist!'); end %================================================= if ~isfield(options,'bSelfConnected') options.bSelfConnected = 0; end %================================================= if isfield(options,'gnd') nSmp = length(options.gnd); else nSmp = size(fea,1); end maxM = 62500000; %500M BlockSize = floor(maxM/(nSmp*3)); if strcmpi(options.NeighborMode,'Supervised') Label = unique(options.gnd); nLabel = length(Label); if options.bLDA G = zeros(nSmp,nSmp); for idx=1:nLabel classIdx = options.gnd==Label(idx); G(classIdx,classIdx) = 1/sum(classIdx); end W = sparse(G); return; end switch lower(options.WeightMode) case {lower('Binary')} if options.k > 0 G = zeros(nSmp*(options.k+1),3); idNow = 0; for i=1:nLabel classIdx = find(options.gnd==Label(i)); D = EuDist2(fea(classIdx,:),[],0); [dump idx] = sort(D,2); % sort each row clear D dump; idx = idx(:,1:options.k+1); nSmpClass = length(classIdx)*(options.k+1); G(idNow+1:nSmpClass+idNow,1) = repmat(classIdx,[options.k+1,1]); G(idNow+1:nSmpClass+idNow,2) = classIdx(idx(:)); G(idNow+1:nSmpClass+idNow,3) = 1; idNow = idNow+nSmpClass; clear idx end G = sparse(G(:,1),G(:,2),G(:,3),nSmp,nSmp); G = max(G,G'); else G = zeros(nSmp,nSmp); for i=1:nLabel classIdx = find(options.gnd==Label(i)); G(classIdx,classIdx) = 1; end end if ~options.bSelfConnected for i=1:size(G,1) G(i,i) = 0; end end W = sparse(G); case {lower('HeatKernel')} if options.k > 0 G = zeros(nSmp*(options.k+1),3); idNow = 0; for i=1:nLabel classIdx = find(options.gnd==Label(i)); D = EuDist2(fea(classIdx,:),[],0); [dump idx] = sort(D,2); % sort each row clear D; idx = idx(:,1:options.k+1); dump = dump(:,1:options.k+1); dump = exp(-dump/(2*options.t^2)); nSmpClass = length(classIdx)*(options.k+1); G(idNow+1:nSmpClass+idNow,1) = repmat(classIdx,[options.k+1,1]); G(idNow+1:nSmpClass+idNow,2) = classIdx(idx(:)); G(idNow+1:nSmpClass+idNow,3) = dump(:); idNow = idNow+nSmpClass; clear dump idx end G = sparse(G(:,1),G(:,2),G(:,3),nSmp,nSmp); else G = zeros(nSmp,nSmp); for i=1:nLabel classIdx = find(options.gnd==Label(i)); D = EuDist2(fea(classIdx,:),[],0); D = exp(-D/(2*options.t^2)); G(classIdx,classIdx) = D; end end if ~options.bSelfConnected for i=1:size(G,1) G(i,i) = 0; end end W = sparse(max(G,G')); case {lower('Cosine')} if ~options.bNormalized fea = NormalizeFea(fea); end if options.k > 0 G = zeros(nSmp*(options.k+1),3); idNow = 0; for i=1:nLabel classIdx = find(options.gnd==Label(i)); D = fea(classIdx,:)*fea(classIdx,:)'; [dump idx] = sort(-D,2); % sort each row clear D; idx = idx(:,1:options.k+1); dump = -dump(:,1:options.k+1); nSmpClass = length(classIdx)*(options.k+1); G(idNow+1:nSmpClass+idNow,1) = repmat(classIdx,[options.k+1,1]); G(idNow+1:nSmpClass+idNow,2) = classIdx(idx(:)); G(idNow+1:nSmpClass+idNow,3) = dump(:); idNow = idNow+nSmpClass; clear dump idx end G = sparse(G(:,1),G(:,2),G(:,3),nSmp,nSmp); else G = zeros(nSmp,nSmp); for i=1:nLabel classIdx = find(options.gnd==Label(i)); G(classIdx,classIdx) = fea(classIdx,:)*fea(classIdx,:)'; end end if ~options.bSelfConnected for i=1:size(G,1) G(i,i) = 0; end end W = sparse(max(G,G')); otherwise error('WeightMode does not exist!'); end return; end if bCosine && ~options.bNormalized Normfea = NormalizeFea(fea); end if strcmpi(options.NeighborMode,'KNN') && (options.k > 0) if ~(bCosine && options.bNormalized) G = zeros(nSmp*(options.k+1),3); for i = 1:ceil(nSmp/BlockSize) if i == ceil(nSmp/BlockSize) smpIdx = (i-1)*BlockSize+1:nSmp; dist = EuDist2(fea(smpIdx,:),fea,0); if bSpeed nSmpNow = length(smpIdx); dump = zeros(nSmpNow,options.k+1); idx = dump; for j = 1:options.k+1 [dump(:,j),idx(:,j)] = min(dist,[],2); temp = (idx(:,j)-1)*nSmpNow+[1:nSmpNow]'; dist(temp) = 1e100; end else [dump idx] = sort(dist,2); % sort each row idx = idx(:,1:options.k+1); dump = dump(:,1:options.k+1); end if ~bBinary if bCosine dist = Normfea(smpIdx,:)*Normfea'; dist = full(dist); linidx = [1:size(idx,1)]'; dump = dist(sub2ind(size(dist),linidx(:,ones(1,size(idx,2))),idx)); else dump = exp(-dump/(2*options.t^2)); end end G((i-1)*BlockSize*(options.k+1)+1:nSmp*(options.k+1),1) = repmat(smpIdx',[options.k+1,1]); G((i-1)*BlockSize*(options.k+1)+1:nSmp*(options.k+1),2) = idx(:); if ~bBinary G((i-1)*BlockSize*(options.k+1)+1:nSmp*(options.k+1),3) = dump(:); else G((i-1)*BlockSize*(options.k+1)+1:nSmp*(options.k+1),3) = 1; end else smpIdx = (i-1)*BlockSize+1:i*BlockSize; dist = EuDist2(fea(smpIdx,:),fea,0); if bSpeed nSmpNow = length(smpIdx); dump = zeros(nSmpNow,options.k+1); idx = dump; for j = 1:options.k+1 [dump(:,j),idx(:,j)] = min(dist,[],2); temp = (idx(:,j)-1)*nSmpNow+[1:nSmpNow]'; dist(temp) = 1e100; end else [dump idx] = sort(dist,2); % sort each row idx = idx(:,1:options.k+1); dump = dump(:,1:options.k+1); end if ~bBinary if bCosine dist = Normfea(smpIdx,:)*Normfea'; dist = full(dist); linidx = [1:size(idx,1)]'; dump = dist(sub2ind(size(dist),linidx(:,ones(1,size(idx,2))),idx)); else dump = exp(-dump/(2*options.t^2)); end end G((i-1)*BlockSize*(options.k+1)+1:i*BlockSize*(options.k+1),1) = repmat(smpIdx',[options.k+1,1]); G((i-1)*BlockSize*(options.k+1)+1:i*BlockSize*(options.k+1),2) = idx(:); if ~bBinary G((i-1)*BlockSize*(options.k+1)+1:i*BlockSize*(options.k+1),3) = dump(:); else G((i-1)*BlockSize*(options.k+1)+1:i*BlockSize*(options.k+1),3) = 1; end end end W = sparse(G(:,1),G(:,2),G(:,3),nSmp,nSmp); else G = zeros(nSmp*(options.k+1),3); for i = 1:ceil(nSmp/BlockSize) if i == ceil(nSmp/BlockSize) smpIdx = (i-1)*BlockSize+1:nSmp; dist = fea(smpIdx,:)*fea'; dist = full(dist); if bSpeed nSmpNow = length(smpIdx); dump = zeros(nSmpNow,options.k+1); idx = dump; for j = 1:options.k+1 [dump(:,j),idx(:,j)] = max(dist,[],2); temp = (idx(:,j)-1)*nSmpNow+[1:nSmpNow]'; dist(temp) = 0; end else [dump idx] = sort(-dist,2); % sort each row idx = idx(:,1:options.k+1); dump = -dump(:,1:options.k+1); end G((i-1)*BlockSize*(options.k+1)+1:nSmp*(options.k+1),1) = repmat(smpIdx',[options.k+1,1]); G((i-1)*BlockSize*(options.k+1)+1:nSmp*(options.k+1),2) = idx(:); G((i-1)*BlockSize*(options.k+1)+1:nSmp*(options.k+1),3) = dump(:); else smpIdx = (i-1)*BlockSize+1:i*BlockSize; dist = fea(smpIdx,:)*fea'; dist = full(dist); if bSpeed nSmpNow = length(smpIdx); dump = zeros(nSmpNow,options.k+1); idx = dump; for j = 1:options.k+1 [dump(:,j),idx(:,j)] = max(dist,[],2); temp = (idx(:,j)-1)*nSmpNow+[1:nSmpNow]'; dist(temp) = 0; end else [dump idx] = sort(-dist,2); % sort each row idx = idx(:,1:options.k+1); dump = -dump(:,1:options.k+1); end G((i-1)*BlockSize*(options.k+1)+1:i*BlockSize*(options.k+1),1) = repmat(smpIdx',[options.k+1,1]); G((i-1)*BlockSize*(options.k+1)+1:i*BlockSize*(options.k+1),2) = idx(:); G((i-1)*BlockSize*(options.k+1)+1:i*BlockSize*(options.k+1),3) = dump(:); end end W = sparse(G(:,1),G(:,2),G(:,3),nSmp,nSmp); end if bBinary W(logical(W)) = 1; end if isfield(options,'bSemiSupervised') && options.bSemiSupervised tmpgnd = options.gnd(options.semiSplit); Label = unique(tmpgnd); nLabel = length(Label); G = zeros(sum(options.semiSplit),sum(options.semiSplit)); for idx=1:nLabel classIdx = tmpgnd==Label(idx); G(classIdx,classIdx) = 1; end Wsup = sparse(G); if ~isfield(options,'SameCategoryWeight') options.SameCategoryWeight = 1; end W(options.semiSplit,options.semiSplit) = (Wsup>0)*options.SameCategoryWeight; end if ~options.bSelfConnected W = W - diag(diag(W)); end if isfield(options,'bTrueKNN') && options.bTrueKNN else W = max(W,W'); end return; end % strcmpi(options.NeighborMode,'KNN') & (options.k == 0) % Complete Graph switch lower(options.WeightMode) case {lower('Binary')} error('Binary weight can not be used for complete graph!'); case {lower('HeatKernel')} W = EuDist2(fea,[],0); W = exp(-W/(2*options.t^2)); case {lower('Cosine')} W = full(Normfea*Normfea'); otherwise error('WeightMode does not exist!'); end if ~options.bSelfConnected for i=1:size(W,1) W(i,i) = 0; end end W = max(W,W');