%%%
% Author: FesianXu @ UESTC
% Description: Data discretization based on the chi-square test
% Date: 2017/4/3
%%%
clc
clear      % clears workspace variables; 'clear all' would also purge cached functions and breakpoints
close all
%% read the Iris data set
% Each record is parsed as four comma-separated numbers followed by a class
% name, e.g. "5.1,3.5,1.4,0.2,Iris-setosa".
% Result of this section:
%   attrib  - N-by-5 matrix [attribute1..attribute4, class code (1..3)]
%   len     - number of records N
%   att_map - 4-by-1 cell, filled by the next section
data_file = 'G:\数据分析集合\Iris dataset\iris.data' ; % not named 'path': that would shadow the built-in
if exist(data_file, 'file') ~= 2
    error('discretize:fileNotFound', 'Cannot find the Iris data file: %s', data_file) ;
end
[attrib1, attrib2, attrib3, attrib4, class_str] = textread(data_file, '%f%f%f%f%s', 'delimiter', ',') ;
% Encode the class names as integers 1..3; 0 marks a label that was not recognised.
class_names = {'Iris-setosa', 'Iris-versicolor', 'Iris-virginica'} ;
class_int = zeros(length(attrib1), 1) ;
for k = 1:numel(class_names)
    class_int(strcmp(class_str, class_names{k})) = k ;
end
attrib = [attrib1, attrib2, attrib3, attrib4, class_int] ;
% The class code is later used as a column index, so a code of 0 would
% silently corrupt the frequency tables. Drop such records explicitly.
unknown = (class_int == 0) ;
if any(unknown)
    warning('discretize:unknownClass', ...
        '%d record(s) with an unrecognised class label were dropped.', nnz(unknown)) ;
    attrib(unknown, :) = [] ;
end
clear attrib1 attrib2 attrib3 attrib4 class_int class_str data_file class_names k unknown
len = size(attrib, 1) ;
att_map = cell(4,1) ;
%% build the class-frequency table of every attribute
% att_map{i} gets one row per distinct value of attribute i, sorted ascending:
%   [att_value, label1_fre, label2_fre, label3_fre]
% unique/accumarray replace the former seed-row + find() bookkeeping, which
% merged two values' counts whenever the smallest attribute value was exactly 0
% (it collided with the all-zero seed row).
num_class = 3 ;
for i = 1:4
    [att_value, ~, row_idx] = unique(attrib(:,i)) ;   % row_idx maps each sample to its row in att_value
    % Count samples per (distinct value, class) pair. Class codes in column 5
    % must be integers in 1..num_class; anything else makes accumarray error
    % out instead of silently writing into the wrong column.
    label_fre = accumarray([row_idx(:), attrib(:,5)], 1, [numel(att_value), num_class]) ;
    att_map{i} = [att_value(:), label_fre] ;
end
clear len i att_value row_idx label_fre num_class attrib
%% merge adjacent intervals until at most max_interval remain
% For each attribute, repeatedly compute the chi-square value of every pair of
% adjacent rows of its frequency table and merge the pair with the smallest
% value. The first column of each remaining row (the smallest attribute value
% in that interval) is printed as a split point.
max_interval = 6 ;
for i = 1:4
    data = att_map{i} ;
    current_interval = size(data, 1) ;
    while current_interval > max_interval
        chi2_mat = zeros(current_interval-1, 1) ;
        for j = 1:current_interval-1
            chi2_mat(j) = chi2test(data(j,2:4), data(j+1,2:4)) ;
        end
        [~, index] = min(chi2_mat) ;   % on ties min returns the first (lowest) pair
        data(index, 2:4) = data(index, 2:4) + data(index+1, 2:4) ; % merge the counts into the lower row
        data(index+1, :) = [] ; % remove the row that was merged away
        current_interval = current_interval-1 ;
    end
    fprintf('第%d属性的分割点如下:\r\n', i)
    % Print every remaining interval. A fixed 1:6 range would index out of
    % bounds when fewer than six intervals are left.
    for loop = 1:size(data, 1)
        fprintf('No %d: %f\r\n', loop, data(loop, 1))
    end
end
%%%
% Author: FesianXu @ UESTC
% Description: compute the chi2 value
% Date: 2017/4/3
%%%
function chi2 = chi2test(u, v)
% CHI2TEST Pearson chi-square statistic of a 2-by-K contingency table.
%   chi2 = chi2test(u, v) treats u and v as the two rows of a table of
%   observed counts (one column per class) and returns
%       sum over all cells of (observed - expected)^2 / expected,
%   where expected = row_total * column_total / grand_total.
%
%   u, v  - vectors with the same number of elements (row or column).
%   chi2  - scalar. Cells whose expected count is 0 contribute 0, and a table
%           with a grand total of 0 yields 0 (instead of NaN from 0/0).
u = u(:)' ;   % accept row or column vectors
v = v(:)' ;
if numel(u) ~= numel(v)
    error('chi2test:sizeMismatch', 'u and v must have the same number of elements.') ;
end
observed = [u; v] ;
N = sum(u)+sum(v) ;
chi2 = 0 ;
if N == 0
    return   % no observations at all
end
% Row and column totals do not change inside the loops, so compute them once.
row_total = sum(observed, 2) ;
col_total = sum(observed, 1) ;
for i = 1:2
    for j = 1:numel(u)
        Eij = row_total(i)*col_total(j)/N ;
        if Eij ~= 0
            chi2 = chi2 + (observed(i,j)-Eij)^2/Eij ;
        end
    end
end
end