Supplemental online material for “Clustering, Seriation, and Subset Extraction of Confusion Data”

advertisement
Supplemental online material for “Clustering, Seriation, and Subset Extraction of Confusion Data,”
by Michael J. Brusco and J. Douglas Steinley.
Program 1: PREPARE.M
function [s1, s2, s3, d1, d2, d3] = prepare(confuse)
%PREPARE reads in a raw confusion matrix 'confuse' and prepares new matrices.
%Six output matrices are produced. The diagonals of s1, s2, d1, d2 and d3
%are zeroed out; the diagonal of s3 is left as computed.
%Matrix s1 is a symmetric similarity matrix obtained by the arithmetic mean
%          of off diagonals, (confuse(i,j) + confuse(j,i)) / 2
%Matrix s2 is a symmetric similarity matrix obtained by the geometric mean
%          of off diagonals, (confuse(i,j) * confuse(j,i)) ^ .5
%Matrix s3 is a symmetric similarity matrix based on Luce's (1963) SCM
%Matrix d1 is a symmetric dissimilarity matrix based on s1 (max(s1) - s1)
%Matrix d2 is a symmetric dissimilarity matrix based on s2 (max(s2) - s2)
%Matrix d3 is Shepard's (1957) pseudo Euclidean distance, which is
%          computed as the negative natural log of the terms in s3.
%Note: s1 and s2 are computed from 'confuse' exactly as supplied, i.e.
%      before the zero-cell adjustment below; s3 and d3 use the adjusted
%      matrix.
epsilon = .000000000001;            % convergence tolerance for the fitting loop
n = size(confuse, 1);
diagidx = 1:n+1:n*n;                % linear indices of the main diagonal

s1 = (confuse' + confuse) ./ 2;
s2 = (confuse .* confuse') .^ .5;   % diagonal is zeroed further below

% Find the off-diagonal cases where a(i,j) = a(j,i) = 0 and replace both
% cells: Heiser's "1/2 rule" for raw confusion data, and Gilmore et al.'s
% rule (.001) for confusion percentages. The data are treated as raw
% counts when the trace exceeds 2*n.
bothzero = (confuse == 0) & (confuse' == 0);
bothzero(diagidx) = false;          % only off-diagonal cells are adjusted
if any(bothzero(:))
    if trace(confuse) > 2.*n
        confuse(bothzero) = confuse(bothzero) + .5;
    else
        confuse(bothzero) = confuse(bothzero) + .001;
    end
end

% This block fits the Shepard-Luce similarity choice matrix (s3) by using
% iterative proportional fitting as described by Heiser (1988). Each pass
% rescales lambda to match, in turn, the grand total, the row sums, the
% column sums, and the symmetric sums confuse(i,j) + confuse(j,i).
csum = sum(sum(confuse));
rowcsum = sum(confuse, 2);          % column vector of row totals
colcsum = sum(confuse, 1);          % row vector of column totals
symsum = confuse + confuse';
lambda = ones(n, n);
sse = 99999;
while sse > epsilon
    lambprev = lambda;
    lambda = (csum ./ sum(sum(lambda))) .* lambda;
    lambda = bsxfun(@times, rowcsum ./ sum(lambda, 2), lambda);
    lambda = bsxfun(@times, colcsum ./ sum(lambda, 1), lambda);
    lambda = symsum .* lambda ./ (lambda + lambda');
    sse = sum(sum((lambda - lambprev).^2));   % squared change over the pass
end

% eta(i,j) = sqrt(lambda(i,j)*lambda(j,i) / (lambda(i,i)*lambda(j,j)))
dg = diag(lambda);
s3 = ((lambda .* lambda') ./ (dg * dg')) .^ .5;
d3 = -log(s3);                      % Matrix d3 is Shepard's distance based on s3

%The block below will obtain dissimilarity matrices d1 and d2 based on
%matrices s1 and s2, respectively.
s1(diagidx) = 0; s2(diagidx) = 0;
maxs1 = max(max(s1)); maxs2 = max(max(s2));
d1 = maxs1 - s1; d2 = maxs2 - s2;
d1(diagidx) = 0; d2(diagidx) = 0; d3(diagidx) = 0;
Program 2: BBDIAM.M
function [partition, diameter] = bbdiam(a, num_clusters)
%BBDIAM find a minimum-diameter partition of 'e' objects corresponding to
%       the dissimilarity matrix 'a'. The number of clusters in the
%       partition is c = 'num_clusters'. The minimum diameter is
%       'diameter' and the 'partition' contains the optimal partition
%       (partition(i) is the cluster label of object i).
%       This program uses 10 replications of an exchange algorithm to get
%       a good upper bound on partition diameter.
%       It also reorders using the rule from Brusco and Cradit (2004), but
%       this is transparent to the user because the solution is remapped
%       back to the original ordering prior to returning 'partition'.
%       Elapsed time is displayed via tic/toc.
tic;
e = size(a, 2); epsilon = .0000001; c = num_clusters;
bestz = 99999999999;

% Run exchange heuristic (10 random starts) to get a good initial bound
for ijk = 1:10
    % Random start: shuffle e copies of the labels 1..c and keep the first e
    pp = repmat(1:c, 1, e); qq = randperm(e.*c); pp = pp(qq); pp = pp(1:e);
    numk = zeros(1, c);
    for k = 1:c
        numk(k) = sum(pp == k);
    end
    trig = 0;
    while trig == 0                 % repeat until a full pass makes no move
        trig = 1;
        for i = 1:e
            k1 = pp(i);
            if numk(k1) == 1        % never empty a singleton cluster
                continue
            end
            % largest dissimilarity from i to its current cluster
            k1max = max([0, a(i, pp == k1)]);
            for k = 1:c
                if k1 == k
                    continue
                end
                % largest dissimilarity from i to cluster k
                kmax = max([0, a(i, pp == k)]);
                if kmax < k1max     % moving i to k lowers its worst distance
                    numk(k) = numk(k) + 1; numk(k1) = numk(k1) - 1; k1max = kmax;
                    pp(i) = k; k1 = k; trig = 0;
                end
            end
        end
    end
    % Diameter of this replication's partition
    z = 0;
    for i = 1:e-1
        mates = i + find(pp(i+1:e) == pp(i));
        z = max([z, a(i, mates)]);
    end
    if z < bestz
        bestz = z;
    end
end

% Reorder by looking at the number of elements in each row of the
% dissimilarity matrix that are greater than or equal to the heuristic
% bound; rows with the most such elements are placed first.
ss = zeros(1, e);
for i = 1:e
    ss(i) = sum(a(i,:) >= bestz);
end
[ssorted, index] = sort(ss);        %#ok<ASGLU> only the ordering is needed
aa = a;                             % keep the caller's ordering for the remap
index2 = index(e:-1:1);
a = a(index2, index2);

% Initialize the branch-and-bound search.
%   p = number of objects currently assigned (pointer)
%   q = number of clusters that are still empty
%   x = current (partial) assignment, xb = best complete assignment
%   n(k) = number of objects currently in cluster k
%   z = incumbent diameter; started slightly above the heuristic bound
p = 0; q = c; z = bestz + .01;
x = zeros(1, e); xb = zeros(1, e);
n = zeros(1, c);
optfound = 0; trig1 = 0; trig2 = 0;
while optfound == 0
    if trig1 == 0                                   % Branch Forward
        p = p + 1; m = 1; n(m) = n(m) + 1; x(p) = m;
        if n(m) == 1
            q = q - 1;
        end
        trig1 = 1;
    end
    if trig2 == 0
        if e - p >= q                               % Check Feasibility
            % Every pair of assigned objects sharing a cluster must be
            % strictly below the incumbent diameter. All pairs are
            % rechecked (not just those involving object p) because z may
            % have decreased since the earlier objects were placed.
            trig7 = 0;
            for i = 1:p-1
                k1 = x(i);
                for j = i+1:p
                    k2 = x(j);
                    if k1 == k2 && a(i,j) > z - epsilon
                        trig7 = 1;
                        break
                    end
                end
                if trig7 == 1
                    break                           % one violation is enough
                end
            end
            % Once no cluster is empty, each unassigned object must fit in
            % at least one cluster without reaching the incumbent diameter.
            if trig7 == 0 && q == 0
                for j = p+1:e
                    ksum = zeros(1, c);
                    for i = 1:p
                        ki = x(i);
                        if a(i,j) > ksum(ki)
                            ksum(ki) = a(i,j);
                        end
                    end
                    kmin = min(ksum);
                    if kmin > z - epsilon
                        trig7 = 1;
                        break
                    end
                end
            end
            if trig7 == 0
                if p ~= e
                    trig1 = 0;
                    continue
                else
                    xb = x; z = 0;                  % Install new best solution
                    for i = 1:p-1
                        ki = x(i);
                        for j = i+1:p
                            kj = x(j);
                            if ki == kj && a(i,j) > z
                                z = a(i,j);
                            end
                        end
                    end
                end
            end
        end
    end
    if m == c || n(m) == 1
        % Dispensation (if m = c or n(m) = 1) then Retract
        x(p) = 0;
        n(m) = n(m) - 1; p = p - 1;
        if n(m) == 0
            q = q + 1;
        end
        if p == 0
            optfound = 1;
            continue
        else
            m = x(p); trig1 = 1; trig2 = 1;
        end
    else
        % Otherwise, Branch Right
        n(m) = n(m) - 1; m = m + 1; n(m) = n(m) + 1; x(p) = m;
        if n(m) == 1
            q = q - 1;
        end
        trig1 = 1; trig2 = 0;
        continue
    end
end
diameter = z; a = aa;               %#ok<NASGU> restore the original ordering
% Remap everything back.
partition = zeros(1, e);
for i = 1:e
    partition(index2(i)) = xb(i);
end
toc
Program 3: RANDOPT.M
function [ari,newa,newb] = randopt(mata,matb,parta,partb,diama,diamb)
% RANDOPT.M reads two dissimilarity matrices (mata and matb), two
% partitions of those matrices (parta, partb), and the diameters of the
% partitions (diama and diamb). An exchange algorithm is used to refine
% the partitions to maximize the adjusted Rand index, while maintaining
% minimum-diameter partitions. The best-found ARI and the new partitions
% (newa and newb) are returned as output.
%
% In each cycle every single-object relocation in parta and in partb is
% evaluated. A relocation is admissible only if the moved object has no
% dissimilarity greater than the corresponding diameter bound to any member
% of its new cluster. The single admissible relocation with the largest ARI
% gain is applied, and cycles repeat until no relocation improves the ARI.
xx = parta; yy = partb;
m = length(xx);
kx = max(xx); ky = max(yy);
rx = randopt_ari(xx, yy);           % compute initial ARI
trig = 0;
% Begin exchange algorithm here. If at least one improvement is found
% during a cycle, then trig = 0 and another cycle is run.
while trig == 0
    trig = 1; rb = rx;
    % Candidate relocations in the first partition
    for ii = 1:m
        for k = 1:kx
            if k == xx(ii)
                continue
            end
            x = xx; x(ii) = k;
            if any(mata(ii, x == k) > diama)    % would exceed diameter bound
                continue
            end
            rx = randopt_ari(x, yy);
            if rx > rb
                rb = rx; isel = ii; ksel = k; icol = 1; trig = 0;
            end
        end
    end
    % Candidate relocations in the second partition
    for ii = 1:m
        for k = 1:ky
            if k == yy(ii)
                continue
            end
            y = yy; y(ii) = k;
            if any(matb(ii, y == k) > diamb)    % would exceed diameter bound
                continue
            end
            rx = randopt_ari(xx, y);
            if rx > rb
                rb = rx; isel = ii; ksel = k; icol = 2; trig = 0;
            end
        end
    end
    % Apply the best relocation found in this cycle (if any)
    if trig == 0
        if icol == 1
            xx(isel) = ksel;
        else
            yy(isel) = ksel;
        end
        rx = rb;
    end
end
newa = xx; newb = yy; ari = rb;

function r = randopt_ari(x, y)
% Adjusted Rand index of two label vectors, computed from pair counts:
%   a = pairs together in both, d = pairs apart in both,
%   b = together in x only,     c = together in y only.
m = length(x);
n = m.*(m-1)./2;                    % total number of object pairs
samex = bsxfun(@eq, x(:), x(:)');
samey = bsxfun(@eq, y(:), y(:)');
upper = triu(ones(m), 1) > 0;       % count each unordered pair once
a = nnz(samex & samey & upper);
b = nnz(samex & ~samey & upper);
c = nnz(~samex & samey & upper);
d = nnz(~samex & ~samey & upper);
expected = (a+b).*(a+c) + (c+d).*(b+d);
r = (n.*(a+d) - expected) ./ (n.*n - expected);
Program 4: BBDOM.M
function [domindex,permopt, lindx, con, incon] = bbdom(confusion_matrix)
% BBDOM.M uses a branch-and-bound algorithm to find a reordering of the
% rows and columns of an asymmetric matrix so as to maximize the sum of the
% elements above the main diagonal of the reordered matrix. The CPU time
% for this algorithm begins to accelerate for matrices larger than 20x20.
%
% Outputs:
%   domindex - best sum of elements above the main diagonal
%   permopt  - the permutation achieving it
%   lindx    - domindex divided by the sum of all off-diagonal elements
%   con      - number of pairs with above-diagonal > below-diagonal entry
%              in the reordered matrix
%   incon    - number of pairs with above-diagonal < below-diagonal entry
% The main diagonal of the input is ignored (it is zeroed on entry).
% Elapsed time is displayed via tic/toc.
tic; a = confusion_matrix;
n = size(a, 1);
for i = 1:n
    a(i,i) = 0;
end
b = max(a, a');                     % b(i,j) = max(a(i,j), a(j,i)), used in bound

% Heuristic lower bound: pairwise-interchange local search from random
% permutations.
nreps = 100;                        % Total number of replications
domindex = 0;                       % Best z-value across all replications
for iii = 1:nreps
    s = randperm(n);                % Create a random permutation
    z = sum(sum(triu(a(s,s))));     % Sum above the main diagonal
    z = z - sum(diag(a));
    dtarg = 1;
    % Perform pairwise interchange to guarantee a
    % local optimum with respect to that operation
    while dtarg > eps
        dtarg = 0;
        for i = 1:n-1
            for j = i+1:n
                % change in the objective if positions i and j are swapped
                delta = a(s(j),s(i)) - a(s(i),s(j));
                for k = i+1:j-1
                    delta = delta - a(s(i),s(k)) + a(s(j),s(k)) + a(s(k),s(i)) - a(s(k),s(j));
                end
                if delta > dtarg
                    dtarg = delta;
                    z = z + dtarg;
                    jdum = s(j);
                    s(j) = s(i);
                    s(i) = jdum;
                end
            end
        end
    end
    if z > domindex
        domindex = z;
    end
end

% Branch-and-bound. The incumbent starts just below the heuristic value so
% that a permutation matching it is still accepted by the strict tests.
%   q(m) = object placed in position m, s(i) = 1 if object i is placed,
%   z1   = sum of a(placed object, object not placed at that time).
z = domindex - 1;
q = zeros(1,n+1); s = zeros(1,n+1);
m = 1; q(m) = 1; s(1) = 1; trig1 = 0; trigend = 0;
z1 = 0;
for j = 1:n
    if j ~= q(m)
        z1 = z1 + a(q(m),j);
    end
end
while trigend == 0
    if trig1 == 0                   % advance pointer
        m = m + 1; trig1 = 1;
    end
    q(m) = q(m) + 1;
    if s(q(m)) == 1                 % redundancy
        continue
    end
    if m == 1 && q(m) > n           % terminate
        trigend = 1;
        continue
    end
    if m > 1 && q(m) > n            % retract
        s(q(m)) = 0; q(m) = 0; m = m - 1;
        % remove the contribution of the object being taken out
        for i = 1:n
            if s(i)==1 && i ~= q(m)
                continue
            end
            z1 = z1 - a(q(m),i);
        end
        s(q(m)) = 0;
        continue
    end
    if m == n
        % Complete permutation: evaluate it directly
        zbd = 0;
        for i = 1:n-1
            for j = i+1:n
                zbd = zbd + a(q(i),q(j));
            end
        end
        if zbd > z
            z = zbd; x = q;
        end
    elseif m == 1
        z1 = 0;
        for j = 1:n
            if j ~= q(m)
                z1 = z1 + a(q(m),j);
            end
        end
        trig1 = 0; s(q(m)) = 1;
    else
        % Adjacent interchange test: swapping positions m-1 and m must not
        % improve the objective (ties are broken by object index)
        if a(q(m),q(m-1)) > a(q(m-1),q(m))
            continue
        end
        if a(q(m),q(m-1)) == a(q(m-1),q(m)) && q(m) < q(m-1)
            continue
        end
        % Insertion test: moving q(m) ahead of positions mm..m-1 must not
        % improve the objective
        rtrig = 0;
        for mm = m-2:-1:1
            rdx = 0;
            for i = mm:m-1
                rdx = rdx + a(q(m),q(i)) - a(q(i),q(m));
            end
            if rdx > 0
                rtrig = 1;
            end
        end
        if rtrig == 1
            continue
        end
        % Bound test: z1 (placed objects) + z2 (q(m) vs. unplaced objects)
        % + z3 (best case for every pair of remaining unplaced objects)
        z2 = 0;
        for i = 1:n
            if s(i) == 0 && i ~= q(m)
                z2 = z2 + a(q(m),i);
            end
        end
        z3 = 0;
        for i = 1:n-1
            if s(i) == 1 || i == q(m)
                continue
            end
            for j = i+1:n
                if s(j) == 1 || j == q(m)
                    continue
                end
                z3 = z3 + b(i,j);
            end
        end
        if z1 + z2 + z3 > z
            trig1 = 0; s(q(m)) = 1; z1 = z1 + z2;
        end
    end
end
x(n+1) = [];                        % drop the sentinel slot of q
reordered_matrix = a(x,x);
summat = sum(sum(a)); lindx = z./summat;
upper = triu(ones(n), 1) > 0;       % count each pair (i < j) once
con = nnz((reordered_matrix > reordered_matrix') & upper);
incon = nnz((reordered_matrix < reordered_matrix') & upper);
domindex = z; permopt = x;
toc;
Program 5: BBSUBSET.M
function [subset, index] = bbsubset(a, num_subsets, sub_size)
%BBSUBSET.M: This program reads an n-by-n matrix 'a' and will extract
%'num_subsets' each of size 'sub_size' based on the objective of maximizing
%the within subset sums of matrix elements minus the between subset sums of
%matrix elements. The program is fast when sub_size = 2, but can be much
%slower for sub_size > 2.
%
%Outputs:
%  subset - 1-by-(n+1) vector; its first num_subsets*sub_size entries are
%           the selected objects listed subset by subset (sub_size
%           consecutive entries per subset) and the remaining entries are 0
%  index  - objective value (within sums minus between sums) of 'subset'
%Only elements a(i,j) with i < j enter the bound; the matrix is presumably
%symmetric - confirm against the caller.
%NOTE(review): the bound adds nothing for between-subset terms, which
%appears to assume nonnegative matrix elements - confirm.
%Elapsed time is displayed via tic/toc.
tic;
n = size(a, 1); c = num_subsets; gs = sub_size;
targ = c.*gs;                       % total number of objects to be selected
if targ > n
    error('bbsubset:tooFewObjects', ...
        'num_subsets*sub_size (%d) exceeds the number of objects (%d).', targ, n);
end
% tau(pos) = subset to which position pos of the solution vector belongs
tau = reshape(repmat(1:c, gs, 1), 1, targ);

% List every pair i < j with its matrix element
npairs = n.*(n-1)./2;
r = zeros(1, npairs); pair1 = zeros(1, npairs); pair2 = zeros(1, npairs);
ict = 0;
for i = 1:n-1
    for j = i+1:n
        ict = ict + 1; r(ict) = a(i,j); pair1(ict) = i; pair2(ict) = j;
    end
end
% This block is designed to establish bounds for the within subset sums.
% Pairs are sorted by decreasing element value; delta(pos) is the number
% of within-subset pairs formed among the first pos positions.
r = -r; [r,idx] = sort(r); r = -r; pair1 = pair1(idx); pair2 = pair2(idx);
delta = zeros(1, targ);
ws = 0; ic = 0;
for k = 1:c
    for h = 1:gs
        ic = ic + 1;
        ws = ws + (h - 1);
        delta(ic) = ws;
    end
end
within_terms = c.*gs.*(gs-1)./2;    % within-subset pairs in a full solution

%This block is a greedy heuristic for obtaining an initial lower bound for
%the within subset sums minus the between subset sums.
sel = zeros(1,n);
prt = zeros(c,gs);                  % prt(k,:) = objects of greedy subset k
for k = 1:c
    % seed subset k with the best still-unselected pair
    amax = -99999999;
    for i = 1:n-1
        if sel(i) == 1
            continue
        end
        for j = i+1:n
            if sel(j) == 1
                continue
            end
            asum = 0;
            for h = 1:k-1
                for l = 1:gs
                    asum = asum - a(i,prt(h,l)) - a(j,prt(h,l));
                end
            end
            if a(i,j) + asum > amax
                amax = a(i,j) + asum; isel = i; jsel = j;
            end
        end
    end
    prt(k,1) = isel; prt(k,2) = jsel; sel(isel) = 1; sel(jsel) = 1;
    % grow subset k one object at a time
    for i = 3:gs
        amax = -99999999;
        for j = 1:n
            if sel(j) == 1
                continue
            end
            asum = 0;
            for h = 1:i-1
                asum = asum + a(j,prt(k,h));
            end
            for h = 1:k-1
                for l = 1:gs
                    asum = asum - a(j,prt(h,l));
                end
            end
            if asum > amax
                jsel = j; amax = asum;
            end
        end
        prt(k,i) = jsel; sel(jsel) = 1;
    end
end
% Objective value of the greedy solution
z = 0;
for k = 1:c
    for i = 1:gs-1
        for j = i+1:gs
            z = z + a(prt(k,i),prt(k,j));
        end
    end
end
for k = 1:c-1
    for l = k+1:c
        for i = 1:gs
            for j = 1:gs
                z = z - a(prt(k,i),prt(l,j));
            end
        end
    end
end
z = z-1;    % start just below the greedy value so an equal solution is accepted

%The branch-and-bound algorithm begins here
%  q(pos) = object in position pos, s(i) = subset of object i (0 = unused),
%  tsum(pos) = objective contribution of position pos, test_sum = their sum
q = zeros(1,n+1); s = zeros(1,n+1); test_sum = 0; tsum = zeros(1,n);
m = 1; q(m) = 1; s(1) = 1; trig1 = 0; trigend = 0;
while trigend == 0
    if trig1 == 0                               % advance pointer
        m = m + 1; trig1 = 1;
    end
    q(m) = q(m) + 1;
    if s(q(m)) ~= 0                             % redundancy
        continue
    end
    if m > 2
        if tau(m-1)==tau(m) && q(m) < q(m-1)    % redundancy
            continue
        end
        if tau(m-1)~=tau(m) && q(m) < q(m-gs)   % redundancy
            continue
        end
    end
    if m == 1 && q(m) > n                       % terminate
        trigend = 1;
        continue
    end
    if m > 1 && q(m) > n                        % retract
        s(q(m)) = 0; q(m) = 0; m = m - 1;
        test_sum = test_sum - tsum(m); tsum(m) = 0;
        s(q(m)) = 0;
        continue
    end
    % Contribution of the object in position m against earlier positions
    for i = 1:m-1
        v = q(i);
        if tau(i) == tau(m)
            tsum(m) = tsum(m) + a(v,q(m));
        else
            tsum(m) = tsum(m) - a(v,q(m));
        end
    end
    test_sum = test_sum + tsum(m);
    % Bound: add the w largest pair values that could still become
    % within-subset pairs (pairs with both objects placed, or with an
    % object placed in a subset other than the current one, are skipped)
    w = within_terms - delta(m);
    rbb = 0; isum = 0; isel = 0;
    while isel < w
        isum = isum + 1; pr1 = pair1(isum); pr2 = pair2(isum);
        if s(pr1) ~= 0 && s(pr2) ~= 0
            continue
        end
        if s(pr1) > 0 && s(pr1) ~= tau(m)
            continue
        end
        if s(pr2) > 0 && s(pr2) ~= tau(m)
            continue
        end
        rbb = rbb + r(isum); isel = isel + 1;
    end
    if test_sum + rbb > z                       % Partial solution passes so
        if m == targ                            % update incumbent if an optimum
            z = test_sum;
            partition = q;
            test_sum = test_sum - tsum(m); tsum(m) = 0;
            continue
        end
        trig1 = 0; s(q(m)) = tau(m);            % Otherwise, branch forward
    else
        test_sum = test_sum - tsum(m); tsum(m) = 0;
    end
end
subset = partition; index = z;
toc
Download