ChiMerge 数据离散化算法

%%%
% Author: FesianXu @ UESTC
% Description: The data discrete method based on chi test
% Date: 2017/4/3
%%%
clc
clear      % plain 'clear' resets the workspace; 'clear all' would also flush compiled functions and breakpoints
close all
%% read IRIS dataset
% Result of this section: 'attrib', an N-by-5 matrix whose columns 1..4 are
% the numeric attributes and whose column 5 is the class id (1, 2 or 3).
% NOTE: the location below is machine specific; adjust it to where iris.data lives.
% The variable is deliberately not called 'path', which would shadow the
% built-in PATH function while it exists.
data_path = 'G:\数据分析集合\Iris dataset\iris.data' ;
if exist(data_path, 'file') ~= 2
    error('chimerge:fileNotFound', 'Iris data file not found: %s', data_path) ;
end
% NOTE(review): textread is flagged as "not recommended" by MathWorks; it is
% kept here so that parsing behaviour stays exactly as before.
[attrib1, attrib2, attrib3, attrib4, class_str] = textread(data_path, '%f%f%f%f%s', 'delimiter', ',') ;
% Map the class names to the integer ids 1..3 (position in class_names).
class_names = {'Iris-setosa', 'Iris-versicolor', 'Iris-virginica'} ;
[is_known, class_int] = ismember(class_str, class_names) ;
% Rows whose label is not one of the three names would get id 0, which later
% code would use as a column offset and corrupt the attribute value column.
% Drop such rows explicitly instead of letting them through.
if ~all(is_known)
    warning('chimerge:unknownClass', '%d row(s) with an unrecognised class label were dropped.', sum(~is_known)) ;
end
attrib = [attrib1, attrib2, attrib3, attrib4, class_int] ;
attrib = attrib(is_known, :) ;
clear attrib1 attrib2 attrib3 attrib4 class_int class_str class_names is_known data_path
%% build, for every attribute, the table of distinct values and per-class counts
% Input : 'attrib' (N-by-5; columns 1..4 attributes, column 5 class id 1..3).
% Output: 'att_map', a 4-by-1 cell array. att_map{i} is a K-by-4 matrix with
%         one row per distinct value of attribute i, sorted ascending:
%         [att_value, label1_fre, label2_fre, label3_fre].
n_attrib = 4 ;
n_class = 3 ;
att_map = cell(n_attrib,1) ;
labels = attrib(:, n_attrib+1) ;
% Only integer class ids in 1..n_class can be counted. Anything else would
% address a column outside the count columns, so such rows are ignored.
valid = labels >= 1 & labels <= n_class & labels == fix(labels) ;
if ~any(valid)
    error('chimerge:noSamples', 'No sample with a class id in 1..%d was found.', n_class) ;
end
for i = 1:n_attrib
    % unique() returns the sorted distinct values and, for every sample, the
    % row of the table it belongs to. Unlike a zero-initialised table that is
    % searched with find(), this also handles an attribute value of exactly 0.
    [att_values, ~, value_idx] = unique(attrib(valid, i)) ;
    % Count the samples per (distinct value, class id) pair.
    fre = accumarray([value_idx(:), labels(valid)], 1, [numel(att_values), n_class]) ;
    att_map{i} = [att_values(:), fre] ;
end
clear n_attrib n_class labels valid att_values value_idx fre i attrib
%% ChiMerge: merge adjacent intervals until at most max_interval remain
% Input : 'att_map', cell array; att_map{i} has one row per initial interval:
%         [lower_bound, label1_fre, label2_fre, label3_fre], sorted ascending.
% Output: prints the lower bound (split point) of every remaining interval.
max_interval = 6 ;
for i = 1:numel(att_map)
    data = att_map{i} ;
    fre_cols = 2:size(data, 2) ;          % columns holding the per-class counts
    n_interval = size(data, 1) ;
    while n_interval > max_interval
        % chi-square statistic of every pair of neighbouring intervals
        chi2_mat = zeros(n_interval-1, 1) ;
        for j = 1:n_interval-1
            chi2_mat(j) = chi2test(data(j, fre_cols), data(j+1, fre_cols)) ;
        end
        % The pair with the smallest statistic is merged (first one on ties).
        [~, index] = min(chi2_mat) ;
        % Merge: the lower interval keeps its bound and absorbs the counts
        % of its upper neighbour, which is then removed.
        data(index, fre_cols) = data(index, fre_cols) + data(index+1, fre_cols) ;
        data(index+1, :) = [] ;
        n_interval = n_interval-1 ;
    end
    fprintf('第%d属性的分割点如下:\r\n', i)
    % Print the intervals that actually exist. An attribute with fewer than
    % max_interval distinct values has fewer rows than max_interval, so a
    % fixed count would index past the end of 'data'.
    for loop = 1:n_interval
        fprintf('No %d: %f\r\n', loop, data(loop, 1))
    end
end



%%%
% Author: FesianXu @ UESTC
% Description: compute the chi2 value
% Date: 2017/4/3
%%%
function chi2 = chi2test(u, v)
% CHI2TEST  Pearson chi-square statistic of the 2-by-K table [u; v].
%   chi2 = chi2test(u, v)
%
%   u, v : vectors with the same number of elements (row or column). Element
%          k is the count of class k in the first / second interval.
%   chi2 : sum over all cells of (observed - expected)^2 / expected, where
%          expected = row_total * column_total / grand_total. Cells whose
%          expected count is 0 contribute 0. Returns 0 when both intervals
%          are empty (grand total 0) instead of NaN.
%
%   Raises chi2test:sizeMismatch if u and v differ in number of elements.
if numel(u) ~= numel(v)
    error('chi2test:sizeMismatch', 'u and v must have the same number of elements.') ;
end
% Force row orientation so that [u; v] is always a 2-by-K table.
u = u(:).' ;
v = v(:).' ;
len = numel(u) ;
N = sum(u)+sum(v) ;
chi2 = 0 ;
if N == 0
    % No observations at all: every expected count would be 0/0 = NaN.
    return ;
end
box = [u; v] ;
row_tot = sum(box, 2) ;
col_tot = sum(box, 1) ;
for i = 1:2
    for j = 1:len
        Eij = row_tot(i)*col_tot(j)/N ;
        % A zero expected count means the class is absent from both
        % intervals (or the interval is empty); skip it to avoid 0/0.
        if Eij ~= 0
            chi2 = chi2 + (box(i,j)-Eij)^2/Eij ;
        end
    end
end
end


posted @ 2017-04-16 18:12  FesianXu  阅读(68)  评论(0编辑  收藏  举报