最初的仿真和精度测试都是基于 MATLAB 完成的。

Demo_MakeTable.m (生成 Hash 表)

%******* Parameters: **********************************************
%*******  l : number of hash tables                         *******
%*******  k : number of key coordinates sampled per table   *******
% Script: reads the binary feature strings from ./Data/data.txt, builds the
% LSH tables with lsh_1norm and saves them under ./Data.
clear all; close all; clc;
l = 3;
k = 15;
% textread with '%s' returns a cell array of strings, one per token in the file.
sData = textread('./Data/data.txt', '%s');
n = length(sData);
% One row per item; every string must hold exactly 128 characters,
% otherwise the row assignment below fails.
Data = false(n, 128);
for i = 1 : n
    % Subtracting '0' maps the characters '0'/'1' to the numbers 0/1.
    Data(i, :) = sData{i} - '0';
end
clear sData;

Tables = lsh_1norm(l, k, Data);
% fullfile keeps the output path portable; the previous '.\Data\' prefix only
% resolved to the Data folder on Windows, unlike the './Data/' used for loading.
save(fullfile('.', 'Data', [num2str(l) '_' num2str(k) '_Tables']), 'Tables');
% cd ./ErrorCompute
% Demo_ErrorStat
%% Write Hash keys and the Index
% Multi-threaded variant (disabled)
% matlabpool local 10
% for i = 1 : l
%     fprintf('Hash table %d.\n', i);
%     Write_Hash_Index(k, Tables(i), i);
% end
% matlabpool close

lsh_1norm.m (核心函数,我用其组织了整个索引结构生成过程)

function T2 = lsh_1norm(l, k, yy)
% LSH_1NORM  Build and populate l hash tables over the rows of yy.
%   T2 = lsh_1norm(l, k, yy)
%     l  : number of hash tables to create
%     k  : number of coordinates sampled per table (passed to createTable)
%     yy : data matrix with one item per row; insert_data hashes the sampled
%          columns of each row
%   Returns a struct array with one populated table per index 1..l.
%
% parameters setting && getting
% [n d] = size(x);
%************ Dataset preprocessing: convert to Hamming space (use when dimension < 100) ********
%   fprintf('Converting dataset to Hamming space.\n');
%   tic
%   x = x'; % n x d   (d < 100)
%   C = max(x(:));
%   dim = size(x);
%   yy = false(dim(1), dim(2) * 255);
%   for i = 1 : dim(1)
%       for j = 1 : dim(2)
%           oneO = false(1,C);
%           oneO(1:x(i,j)) = 1;
%           yy(i,(j-1)*C+1 : (j-1)*C+C) = oneO;
%       end
%   end
%   clear oneO x;
%   toc
%   fprintf('Conversion to Hamming space finished.\n');
fprintf('初始化 %d 个 Hash 表...\n', l);
% Could be parallelised; runs serially here.
for i = 1 : l
    % create and init Tables(i) = f(k, x)
    T1(i) = createTable(k, yy);
end
% NOTE: the former "matlabpool local 10" / "matlabpool close" pair was removed.
% It wrapped a plain for-loop (no parfor), so the workers were never used, and
% the call fails on MATLAB releases that no longer ship matlabpool.
% Store the row indices of the feature data in every table.
for i = 1 : l
    fprintf('数据插入第 %d 个hash表\n', i);
    T2(i) = insert_data(T1(i), yy);
end
% clc;



function T = createTable(k, x)
% CREATETABLE  Initialise one empty hash table.
%   k : number of coordinates to sample
%   x : data matrix; only its column count is used here
% The table stores the sampled column indices (T.I.d), k random multipliers
% (T.randDigits) and an empty second-level table with a fixed slot count.
% tableSize = size(x,1)+17; % alternative: derive the slot count from the data size
tableSize = 587474;
nDims = size(x, 2);

% Draw order matters for reproducibility: sampled columns first, multipliers second.
% (A per-coordinate threshold in [0 255] was used by an earlier variant:
%  I.threshold = unifrnd(0, 1, 1, k) * 255;)
T.I = struct('d', unidrnd(nDims, 1, k), 'k', k);
T.randDigits = unidrnd(tableSize, 1, k);
T.buckets = [];
T.index = {};
T.hashTable2 = cell(tableSize, 1);


function T = insert_data(T, x)
% INSERT_DATA  Insert every row of x into hash table T.
%   T : table struct as produced by createTable (uses T.I.d, T.randDigits,
%       T.hashTable2)
%   x : data matrix, one item per row
% On return:
%   T.buckets    - the distinct sampled-coordinate patterns, one per row
%   T.bucket_cnt - number of distinct patterns
%   T.index{b}   - row vector of the rows of x that fall into bucket b
%   T.hashTable2 - slot -> list of bucket numbers hashing to that slot

% Use the slot count the table was created with, so this stays in sync with
% getIndex (which also reads it from T.hashTable2). Fall back to the historic
% fixed size when the table carries no slots yet.
M = numel(T.hashTable2);
if M == 0
    M = 587474;
    T.hashTable2 = cell(M, 1);
end

% Earlier threshold-based variant:
%  buck01 = x(T.I.d, :)' < repmat(T.I.threshold, size(x,2), 1);
buck01 = x(:, T.I.d);
% id2(r) is the bucket number of row r of x.
[uBuck, ~, id2] = unique(buck01, 'rows');
nBuckets = size(uBuck, 1);
T.buckets = logical(uBuck);
T.bucket_cnt = nBuckets;
% +1 because MATLAB indices start at 1.
key = mod(sum(bsxfun(@times, uBuck, T.randDigits), 2), M) + 1;
T.index = cell(nBuckets, 1);
for bb = 1 : nBuckets
    sameBucket = find(id2 == bb);
    % Stored as a row vector; getIndex concatenates these horizontally.
    T.index{bb} = sameBucket(:)';
    T.hashTable2{key(bb)} = [T.hashTable2{key(bb)} bb];
end

Demo_computeError.m (测试精确度)

%  clear all; clc;
% Script: compares LSH_Search against Linear_Search for K = 1..10 and writes
% one report file K_<K>.txt per K.
% Expects in the workspace: Data (one item per row), Q (one query per row)
% and Tables.
p = 1;
% After the transpose every item is a column, which is the layout lp_norm
% and the Data(:, Id) indexing below rely on.
% NOTE(review): re-running the script without reloading Data transposes it again.
Data = Data';
% Queries are the rows of Q; length(Q) would return the larger dimension and
% over-run whenever Q has fewer rows than columns.
nQ = size(Q, 1);
avgErr = zeros(1, 50);
MissSum = zeros(1, 50);
picErr = zeros(1,50); picErr(1) = 0.982;
for K = 1 : 10
    MissCnt = 0;
    ratioCnt = 0;
    fid = fopen(['K_' num2str(K) '.txt'], 'w');
    if fid == -1
        error('Demo_computeError:fopen', 'Cannot open report file for K = %d.', K);
    end
    for i = 1 : nQ  % the ith input.
%          fprintf(fid, '%-5d', i);
%          fprintf([num2str(i) ' ']);
        q = Q(i,:)';
        [Id1, Mis] = LSH_Search(q', K, Tables, Data, p);
        if Mis
            MissCnt = MissCnt + 1;
            fprintf(fid, 'Miss\n');
%          num = num + 1;
%          imwrite(uint8(input),['.\Data\','query_subset2\',num2str(num),'.bmp']);
            % The averages below divide by the number of hits only, so a
            % missed query must not add to ratioCnt.
            continue;
        end
        Id2 = Linear_Search(q, K, Data, p);
        Dlsh = lp_norm(q, Data(:, Id1), p);
        Dcst = lp_norm(q, Data(:, Id2), p);
        id = Dlsh == Dcst;
        % The small offset avoids division by zero; exact matches are forced to 1.
        ratio = Dcst ./ (Dlsh + 0.00002);
        ratio(id) = 1;
%           for j = 1 : K
%               fprintf(fid, '%-8.2f', ratio(j));
%           end
        fprintf(fid, 'error: %8.3f\n', ratio(K));
        ratioCnt = ratioCnt + ratio(K);
    end
    hitCnt = nQ - MissCnt;
    fprintf(fid, 'Hit times: %-5d Avg. error: %.3f\n', hitCnt, ratioCnt/hitCnt);
    avgErr(K) = ratioCnt/hitCnt;
    fprintf(fid, 'Miss times: %-5d Miss ratio: %.3f', MissCnt, MissCnt/nQ);
    MissSum(K) = MissCnt;
    picErr(K) = ratioCnt / hitCnt;
    % Release the report file before moving on to the next K.
    fclose(fid);
end
%  figure,plot(1:50,avgErr,'-r.','MarkerFaceColor','g'),
%  xlabel('number of K'),ylabel('Error'),title('Error of LSH');
%  figure, plot(1:50,MissSum/146,'-r.','MarkerEdgeColor','k'),
%  xlabel('number of K'),ylabel('Miss ratio'),title('Query miss of LSH');
% NOTE(review): both plot calls above are disabled, so this only labels the
% current (possibly empty) axes - confirm which curve was meant to be drawn.
xlabel('number of K'),ylabel('Error'),title('Error of LSH');


function Index = Linear_Search(q, K, DataSet, p)
% LINEAR_SEARCH  Exhaustive K-nearest-neighbour search.
%   Computes lp_norm(q, DataSet, p) against every column of DataSet and
%   returns the positions of the K smallest distances, nearest first.

dists = lp_norm(q, DataSet, p);
[~, order] = sort(dists);
Index = order(1 : K);

 lp_norm.m  (此处取 p = 1,对二值向量即为 Hamming 距离;取 p = 2 时即为 l2 范数,也就是欧氏距离)

%************   get the distance **********************
function distance = lp_norm(x0, x, p)
% LP_NORM  l_p distance between one point and every column of a matrix.
%   x0 : column vector (d-by-1), the reference point
%   x  : d-by-n matrix, one point per column
%   p  : norm order (p = 1 on 0/1 data gives the Hamming distance)
%   Returns a 1-by-n row vector of distances.

% Integer classes saturate on subtraction (uint8(3) - uint8(5) is 0) and cannot
% be raised to a fractional power, so promote them to double first.
if isinteger(x0)
    x0 = double(x0);
end
if isinteger(x)
    x = double(x);
end
% bsxfun expands x0 across the columns of x without materialising a copy.
delta = abs(bsxfun(@minus, x0, x));
distance = sum(delta .^ p, 1) .^ (1/p);

  lookup.m (LSH 查找)

function Index = lookup(T, q)
% LOOKUP  Collect the candidate data indices for query q from all hash tables.
%   T : struct array of hash tables
%   q : query, passed unchanged to getIndex for every table
%   Returns the sorted, duplicate-free union of the indices reported by the
%   individual tables (empty when no table has a matching bucket).

% Gather per-table results first and concatenate once instead of growing an
% array inside the loop. The tables are independent, so this loop could be
% parallelised.
perTable = cell(1, numel(T));
for i = 1 : numel(T)
    perTable{i} = getIndex(T(i), q);
end
Index = unique([perTable{:}]);


function tableiIndex = getIndex(T, x0)
% GETINDEX  Look up one query in a single hash table.
%   T  : populated table (uses T.I.d, T.randDigits, T.buckets, T.index,
%        T.hashTable2)
%   x0 : query as a single row vector; its columns T.I.d form the key
%   Returns a row vector with the stored data indices of the bucket whose
%   pattern equals the query's key, or [] when there is none.

M = length(T.hashTable2);
tableiIndex = [];

seq01_x = x0(:, T.I.d);
% Same slot function as used on insertion; +1 for 1-based indexing.
index_x = mod(sum(bsxfun(@times, seq01_x, T.randDigits), 2), M) + 1;
if ~isempty(T.hashTable2{index_x})
    index_bucket = T.hashTable2{index_x};
    % Several buckets can share a slot; keep only those whose stored pattern
    % matches the query key exactly.
    isMatch = all(bsxfun(@eq, seq01_x, T.buckets(index_bucket, :)), 2);
    uni_index_bucket = index_bucket(isMatch);
    for i = 1 : length(uni_index_bucket)
        tableiIndex = [tableiIndex T.index{uni_index_bucket(i)}];
    end
end

 Linear_Search.m (线性查找)

function Index = Linear_Search(q, K, DataSet, p)
% LINEAR_SEARCH  Brute-force nearest-neighbour lookup.
%   Evaluates lp_norm(q, DataSet, p) for all columns of DataSet, sorts the
%   distances in ascending order and returns the first K positions.

allDist = lp_norm(q, DataSet, p);
[~, ranking] = sort(allDist);
Index = ranking(1 : K);


               (横轴为 K-NN 中 K 的值,纵轴为准确度)



Algorithm proposed from Papers :   

(Indyk 1999) Similarity search in high dimensions via hashing.

(Indyk 2005) Locality-sensitive hashing scheme based on p-stable distributions.


