
C4.5

function [test_targets,tree] = C4_5(train_patterns, train_targets, test_patterns, inc_node)

% Classify using Quinlan's C4.5 algorithm
% Inputs:
% train_patterns - Train patterns
% train_targets - Train targets
% test_patterns - Test patterns
% inc_node - Percentage of incorrectly assigned samples at a node
% The inc_node parameter is the stopping condition for the recursion; in
% effect it bounds how many samples a leaf node may still contain.
% Too large a value of inc_node lowers classification accuracy; too small
% a value may cause overfitting.
% On my test data set (20 samples, 5 features) inc_node = 35 worked well;
% if the value is too small, the recursion exceeds MATLAB's limit.
% Outputs
% test_targets - Predicted targets
% tree - The resulting decision tree

%NOTE: In this implementation it is assumed that a pattern vector with fewer than 10 unique values (the parameter Nu)
%is discrete, and will be treated as such. Other vectors will be treated as continuous
% NOTE: in this data set each column is one sample (features run down the rows)
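% Example call (a minimal sketch with made-up data; patterns are stored
% column-wise, one sample per column):
%   train_patterns = rand(5, 20);               % 5 features, 20 samples
%   train_targets  = double(rand(1, 20) > 0.5); % binary labels
%   test_patterns  = rand(5, 10);
%   [test_targets, tree] = C4_5(train_patterns, train_targets, test_patterns, 35);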
[Ni, M] = size(train_patterns); % M is the number of samples, Ni the feature dimension
inc_node = inc_node*M/100;
Nu = 10;

%Find which of the input patterns are discrete, and discretisize the corresponding
%dimension on the test patterns
discrete_dim = zeros(1,Ni);
for i = 1:Ni,
Nb = length(unique(train_patterns(i,:))); % number of distinct values of feature i
if (Nb <= Nu), % a feature with at most Nu (= 10) distinct values is treated as discrete
%This is a discrete pattern
discrete_dim(i) = Nb;
[H, test_patterns(i,:)] = high_histogram(test_patterns(i,:), Nb);
end
end
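% For example (hypothetical values): a feature taking only the values
% {1,2,3} gives discrete_dim(i) = 3 and its test-pattern row is mapped
% onto 3 bins, while a real-valued feature keeps discrete_dim(i) = 0 and
% is later split by a threshold instead.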

%Build the tree recursively
disp('Building tree')
tree = make_tree(train_patterns, train_targets, inc_node, discrete_dim, max(discrete_dim), 0);

%Classify test samples
disp('Classify test samples using the tree')
test_targets = use_tree(test_patterns, 1:size(test_patterns,2), tree, discrete_dim, unique(train_targets));

%END

function targets = use_tree(patterns, indices, tree, discrete_dim, Uc)
%Classify recursively using a tree

targets = zeros(1, size(patterns,2));

if (tree.dim == 0)
%Reached the end of the tree
targets(indices) = tree.child;
return
end

%This is not the last level of the tree, so:
%First, find the dimension we are to work on
dim = tree.dim;
dims= 1:size(patterns,1);

%And classify according to it
if (discrete_dim(dim) == 0),
%Continuous pattern
in = indices(find(patterns(dim, indices) <= tree.split_loc));
targets = targets + use_tree(patterns(dims, :), in, tree.child(1), discrete_dim(dims), Uc);
in = indices(find(patterns(dim, indices) > tree.split_loc));
targets = targets + use_tree(patterns(dims, :), in, tree.child(2), discrete_dim(dims), Uc);
else
%Discrete pattern
Uf = unique(patterns(dim,:));
for i = 1:length(Uf),
in = indices(find(patterns(dim, indices) == Uf(i)));
targets = targets + use_tree(patterns(dims, :), in, tree.child(i), discrete_dim(dims), Uc);
end
end

%END use_tree

function tree = make_tree(patterns, targets, inc_node, discrete_dim, maxNbin, base)
%Build a tree recursively

[Ni, L] = size(patterns); % L is the number of samples, Ni the feature dimension
Uc = unique(targets);
tree.dim = 0;
%tree.child(1:maxNbin) = zeros(1,maxNbin);
tree.split_loc = inf;
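% For reference, each node returned by this function is a struct with:
%   dim       - index of the splitting feature (0 marks a leaf)
%   split_loc - threshold for a continuous split (inf otherwise)
%   child     - the class label at a leaf, or the array of child nodes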

if isempty(patterns),
return
end

%When to stop: if the node holds too few samples (inc_node > L), a single sample, or a single class
if ((inc_node > L) | (L == 1) | (length(Uc) == 1)),
H = hist(targets, length(Uc));
[m, largest] = max(H); % largest indexes the majority class (with count m)
tree.child = Uc(largest);
return
end

%Compute the node's I
for i = 1:length(Uc),
Pnode(i) = length(find(targets == Uc(i))) / L; % fraction of samples in each class
end
Inode = -sum(Pnode.*log(Pnode)/log(2)); % node entropy in bits
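% Sanity check: with two equally likely classes, Pnode = [0.5 0.5] and
% Inode = -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1 bit.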

%For each dimension, compute the gain ratio impurity
%This is done separately for discrete and continuous patterns
delta_Ib = zeros(1, Ni);
split_loc = ones(1, Ni)*inf;

for i = 1:Ni, % for each feature...
data = patterns(i,:);
Ud = unique(data);
Nbins = length(Ud); % number of distinct values of feature i
if (discrete_dim(i)), % discrete feature
%This is a discrete pattern
P = zeros(length(Uc), Nbins);
for j = 1:length(Uc),
for k = 1:Nbins,
indices = find((targets == Uc(j)) & (patterns(i,:) ==Ud(k)));
P(j,k) = length(indices); % count of class-j samples taking the value Ud(k)
end
end
Pk = sum(P); % number of samples taking each value of feature i
P = P./repmat(eps+Pk, length(Uc), 1); % conditional P(class j | value k)
Pk = Pk/L; % fraction of samples taking each value of feature i
info = sum(-P.*log(eps+P)/log(2)); % class entropy within each attribute value
% Gain ratio = Gain(A)/SplitInfo(A): Gain(A) = Inode - sum(Pk.*info) is the
% information gain of attribute A, and SplitInfo(A) =
% -sum(Pk.*log(eps+Pk)/log(2)) is the intrinsic (split) information of A
delta_Ib(i) = (Inode-sum(Pk.*info))/(-sum(Pk.*log(eps+Pk)/log(2)));
else
%This is a continuous pattern
P = zeros(length(Uc), 2);

%Sort the patterns
[sorted_data, indices] = sort(data);
sorted_targets = targets(indices);

%Calculate the information for each possible split
I = zeros(1, L-1);
for j = 1:L-1,
for k =1:length(Uc),
P(k,1) = length(find(sorted_targets(1:j) == Uc(k)));
P(k,2) = length(find(sorted_targets(j+1:end) == Uc(k)));
end
Ps = sum(P)/L; % fraction of samples on each side of the split
P = P./repmat(eps+sum(P), length(Uc), 1); % conditional P(class | side)
info = sum(-P.*log(eps+P)/log(2)); % class entropy on each side
I(j) = Inode - sum(info.*Ps); % information gain of split j
end
[delta_Ib(i), s] = max(I); % choose the split with the largest information gain
split_loc(i) = sorted_data(s); % the split point for feature i is the value maximizing the gain
end
end
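% Sanity check (hypothetical numbers, using the conditional-entropy
% normalization above): 20 samples in two classes of 10 give Inode = 1 bit;
% a binary attribute whose two values each isolate one class yields
% info = [0 0] and Pk = [0.5 0.5], so Gain = 1 bit, SplitInfo = 1 bit,
% and a gain ratio delta_Ib(i) of 1.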

%Find the dimension maximizing delta_Ib
[m, dim] = max(delta_Ib); % pick the feature with the largest gain ratio
dims = 1:Ni;
tree.dim = dim;

%Split along the 'dim' dimension
Nf = unique(patterns(dim,:));
Nbins = length(Nf); % number of distinct values of the chosen feature
if (discrete_dim(dim)),
%Discrete pattern
for i = 1:Nbins,
indices = find(patterns(dim, :) == Nf(i)); % samples whose value of feature dim is Nf(i)
tree.child(i) = make_tree(patterns(dims, indices), targets(indices), inc_node, discrete_dim(dims), maxNbin, base);
end
else
%Continuous pattern
tree.split_loc = split_loc(dim); % split threshold
indices1 = find(patterns(dim,:) <= split_loc(dim));
indices2 = find(patterns(dim,:) > split_loc(dim));
tree.child(1) = make_tree(patterns(dims, indices1), targets(indices1), inc_node, discrete_dim(dims), maxNbin, base);
tree.child(2) = make_tree(patterns(dims, indices2), targets(indices2), inc_node, discrete_dim(dims), maxNbin, base);
end

%END make_tree

function [H, Bins, region] = high_histogram(data, Nbins, region)

%Find the histogram for high dimensional data
%
% Input:
% Data - The data matrix.
% Should be a DxN matrix, where D is the dimension of the data, and N number of examples
% Nbins - Number of bins (Optional, can be a vector)
% region - A vector containing the min and max values in each direction
%
% Output:
% H - Histogram
% Bins - The bin locations for the data
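% Example call (a sketch with made-up data): bin one row of continuous
% values into 4 bins and get each sample's bin index back:
%   x = [0.1 0.4 0.35 0.8];
%   [H, bins] = high_histogram(x, 4); % H: count per bin, bins: bin index per sample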

%Find how many bins to use
switch nargin % nargin is the number of input arguments actually supplied to this call
case 1,
Nbins = max(3,floor(size(data,2).^(1/3))); % floor rounds toward -inf (cf. fix: toward zero, ceil: toward +inf)
region = zeros(1,2*size(data,1));
for i = 1:size(data,1),
region(i*2-1) = min(data(i,:));
region(i*2) = max(data(i,:));
end
case 2,
region = zeros(1,2*size(data,1));
for i = 1:size(data,1),
region(i*2-1) = min(data(i,:));
region(i*2) = max(data(i,:));
end
case 3,
if isempty(Nbins),
Nbins = max(3,floor(size(data,2).^(1/3)));
end
end

%Find the appropriate bin for each data point
Bins = zeros(size(data));
for i = 1:size(data,1),
one_data = data(i,:);
spacing = linspace(region(i*2-1)-2*eps,region(i*2)+2*eps,Nbins(i)+1);
minSpacing = spacing(1:end-1);
maxSpacing = spacing(2:end);

overMin = zeros(Nbins(i),length(one_data));
underMax = zeros(Nbins(i),length(one_data));

for j = 1:Nbins(i),
overMin(j,:) = one_data > minSpacing(j);
underMax(j,:) = one_data < maxSpacing(j);
end

[m, Bins(i,:)] = max((overMin.*underMax));
end

%Now that the data is in bins, find where they belong.
%I used three options for the dimensions, for better efficiency
switch size(data,1),
case 1,
%1-D data
H = zeros(Nbins,1);
for i = 1:Nbins,
H(i) = length(find(Bins == i));
end
case 2,
%2-D data
H = zeros(Nbins);
for i = 1:Nbins(1),
for j = 1:Nbins(2),
H(i,j) = length(find((Bins(1,:) == i) & (Bins(2,:) == j)));
end
end
otherwise
H = zeros(Nbins*ones(1,size(data,1)));
for i = 1:size(data,2),
line = num2str(Bins(:,i))';
line(size(line,1)+1,:) = ones(1,size(line,2))*',';
Fline = line(:)';
eval( [ ' H (' Fline(1:end-1) ') = H (' Fline(1:end-1) ') + 1; ' ] );
end
end
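% Note: the eval-based indexing above could also be written without eval
% (a sketch, not part of the original code), converting each column of
% Bins into a linear index:
%   c = num2cell(Bins(:,i));
%   idx = sub2ind(size(H), c{:});
%   H(idx) = H(idx) + 1;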
