% Supplemental online material for “Clustering, Seriation, and Subset
% Extraction of Confusion Data,” by Michael J. Brusco and J. Douglas Steinley.
%
% The listing below contains five programs. Each "Program N: NAME.M" header
% marks the start of the code intended for the file NAME.M.

% Program 1: PREPARE.M
function [s1, s2, s3, d1, d2, d3] = prepare(confuse)
%PREPARE reads in a raw confusion matrix 'confuse' and prepares new matrices.
% Six output matrices are produced (the diagonals of s1, s2, d1, d2, and d3
% are zeroed out; the diagonal of s3 is left as computed).
%   s1 - symmetric similarity matrix obtained by the arithmetic mean of
%        the off-diagonal pairs, (confuse(i,j) + confuse(j,i))/2
%   s2 - symmetric similarity matrix obtained by the geometric mean of
%        the off-diagonal pairs, sqrt(confuse(i,j)*confuse(j,i))
%   s3 - symmetric similarity matrix based on Luce's (1963) SCM
%   d1 - symmetric dissimilarity matrix based on s1 (max(s1) - s1)
%   d2 - symmetric dissimilarity matrix based on s2 (max(s2) - s2)
%   d3 - Shepard's (1957) pseudo Euclidean distance, computed as the
%        negative natural log of the terms in s3
% Note: s1 and s2 are computed from the matrix as supplied; the zero-cell
% adjustment below only affects s3 and d3.

[n, n1] = size(confuse);
if n ~= n1
    error('prepare:notSquare', 'The confusion matrix must be square.');
end

epsilon = .000000000001;        % convergence tolerance for the fitting loop
sse = 99999;
lambda = ones(n,n);

s1 = (confuse' + confuse)./2;
s2 = (confuse.*confuse').^.5;   % diagonal is zeroed further below

% Wherever a(i,j) = a(j,i) = 0 for i ~= j, add a small constant to both
% cells: Heiser's "1/2 rule" for raw confusion data and Gilmore et al.'s
% rule (.001) for confusion percentages. The two cases are told apart by
% the size of the trace.
if trace(confuse) > 2.*n
    bump = .5;
else
    bump = .001;
end
for i = 1:n-1
    for j = i+1:n
        if confuse(i,j) == 0 && confuse(j,i) == 0
            confuse(i,j) = bump;
            confuse(j,i) = bump;
        end
    end
end

% This block fits the Shepard-Luce similarity choice matrix (s3) by using
% iterative proportional fitting as described by Heiser (1988).
% The margins of 'confuse' do not change inside the loop, so they are
% computed once.
csum = sum(sum(confuse));
rowcsum = sum(confuse');
colcsum = sum(confuse);
while sse > epsilon
    lambprev = lambda;
    % match the grand total
    lambda = (csum./sum(sum(lambda))).*lambda;
    % match the row sums
    lambda = bsxfun(@times, (rowcsum./sum(lambda'))', lambda);
    % match the column sums
    lambda = bsxfun(@times, colcsum./sum(lambda), lambda);
    % match the symmetric sums confuse(i,j) + confuse(j,i)
    lambda = (confuse + confuse').*lambda./(lambda + lambda');
    sse = sum(sum((lambda - lambprev).^2));
end
dl = diag(lambda);
s3 = ((lambda.*lambda')./(dl*dl')).^.5;
d3 = -log(s3);                  % Matrix d3 is Shepard's distance based on s3

% The block below obtains dissimilarity matrices d1 and d2 based on
% matrices s1 and s2, respectively.
dg = 1:n+1:n.*n;                % linear indices of the main diagonal
s1(dg) = 0;
s2(dg) = 0;
d1 = max(max(s1)) - s1;
d2 = max(max(s2)) - s2;
d1(dg) = 0;
d2(dg) = 0;
d3(dg) = 0;

% Program 2: BBDIAM.M
function [partition, diameter] = bbdiam(a, num_clusters)
%BBDIAM finds a minimum-diameter partition of 'e' objects corresponding to
% the dissimilarity matrix 'a'. The number of clusters in the
% partition is c = 'num_clusters'. The minimum diameter is
% 'diameter' and 'partition' contains the optimal partition
% (a 1-by-e vector of cluster labels).
% This program uses 10 replications of an exchange algorithm to get
% a good upper bound on partition diameter.
% It also reorders using the rule from Brusco and Cradit (2004), but
% this is transparent to the user because the solution is remapped
% back to the original ordering prior to returning 'partition'.
% Elapsed time is displayed through tic/toc.

tic;
[n1, e] = size(a);
if n1 ~= e
    error('bbdiam:notSquare', 'The dissimilarity matrix must be square.');
end
c = num_clusters;
if c < 1 || c > e
    error('bbdiam:badClusterCount', ...
          'num_clusters must be between 1 and the number of objects.');
end
epsilon = .0000001;
bestz = 99999999999;

for rep = 1:10
    % Random start: labels drawn from a shuffled pool holding e copies of
    % each label (a cluster may start empty).
    pp = repmat(1:c, 1, e);
    pp = pp(randperm(e.*c));
    pp = pp(1:e);
    numk = zeros(1, c);
    for k = 1:c
        numk(k) = sum(pp == k);
    end

    % Run exchange heuristic to get a good initial bound
    improved = true;
    while improved
        improved = false;
        for i = 1:e
            k1 = pp(i);
            if numk(k1) == 1
                continue
            end
            % largest dissimilarity between i and its current cluster
            k1max = max([0, a(i, pp == k1)]);
            for k = 1:c
                if k1 == k
                    continue
                end
                % largest dissimilarity between i and cluster k
                kmax = max([0, a(i, pp == k)]);
                if kmax < k1max
                    numk(k) = numk(k) + 1;
                    numk(k1) = numk(k1) - 1;
                    k1max = kmax;
                    pp(i) = k;
                    k1 = k;
                    improved = true;
                end
            end
        end
    end

    % diameter of the heuristic partition
    z = 0;
    for i = 1:e-1
        for j = i+1:e
            if pp(i) == pp(j) && a(i,j) > z
                z = a(i,j);
            end
        end
    end
    if z < bestz
        bestz = z;
    end
end

% Reorder by looking at the number of elements in each row of the
% dissimilarity matrix that are greater than or equal to the heuristic
% bound (rows with the most such elements come first).
ss = sum(a >= bestz, 2).';
[st, index] = sort(ss);
index2 = index(e:-1:1);
aa = a;
a = a(index2, index2);

% Initialize
p = 0;                  % pointer to the last assigned object
q = c;                  % number of empty clusters
z = bestz + .01;        % incumbent bound, set slightly above the heuristic
x = zeros(1, e);        % current (partial) assignment
xb = zeros(1, e);       % best complete assignment found
nk = zeros(1, c);       % number of objects in each cluster
optfound = 0;
trig1 = 0;
trig2 = 0;
while optfound == 0
    if trig1 == 0                       % Branch Forward
        p = p + 1;
        m = 1;
        nk(m) = nk(m) + 1;
        x(p) = m;
        if nk(m) == 1
            q = q - 1;
        end
        trig1 = 1;
    end
    if trig2 == 0
        if e - p >= q                   % Check Feasibility
            trig7 = 0;
            for i = 1:p-1
                k1 = x(i);
                for j = i+1:p
                    k2 = x(j);
                    if k1 == k2 && a(i,j) > z - epsilon
                        trig7 = 1;
                        break
                    end
                end
            end
            if trig7 == 0 && q == 0
                % Every cluster is in use: each unassigned object must fit
                % into at least one cluster without reaching the bound.
                for j = p+1:e
                    ksum = zeros(1, c);
                    for i = 1:p
                        ki = x(i);
                        if a(i,j) > ksum(ki)
                            ksum(ki) = a(i,j);
                        end
                    end
                    kmin = min(ksum);
                    if kmin > z - epsilon
                        trig7 = 1;
                        break
                    end
                end
            end
            if trig7 == 0
                if p ~= e
                    trig1 = 0;
                    continue
                else
                    xb = x;             % Install new best solution
                    z = 0;
                    for i = 1:p-1
                        ki = x(i);
                        for j = i+1:p
                            kj = x(j);
                            if ki == kj && a(i,j) > z
                                z = a(i,j);
                            end
                        end
                    end
                end
            end
        end
    end
    if m == c || nk(m) == 1             % Dispensation (if m = c or n(m) = 1)
        x(p) = 0;                       % then Retract
        nk(m) = nk(m) - 1;
        p = p - 1;
        if nk(m) == 0
            q = q + 1;
        end
        if p == 0
            optfound = 1;
            continue
        else
            m = x(p);
            trig1 = 1;
            trig2 = 1;
        end
    else                                % Otherwise, Branch Right
        nk(m) = nk(m) - 1;
        m = m + 1;
        nk(m) = nk(m) + 1;
        x(p) = m;
        if nk(m) == 1
            q = q - 1;
        end
        trig1 = 1;
        trig2 = 0;
        continue
    end
end
diameter = z;
a = aa;
% Remap everything back to the original object order.
partition = zeros(1, e);
partition(index2) = xb;
toc

% Program 3: RANDOPT.M
function [ari,newa,newb] = randopt(mata,matb,parta,partb,diama,diamb)
% RANDOPT.M reads two dissimilarity matrices (mata and matb), two
% partitions of those matrices (parta, partb), and the diameters of the
% partitions (diama and diamb). An exchange algorithm is used to refine
% the partitions to maximize the adjusted Rand index, while never placing
% an object in a cluster where one of its dissimilarities exceeds the
% corresponding diameter. The best-found ARI and the new partitions
% (newa and newb) are returned as output.

xx = parta;
yy = partb;
m = length(xx);
kx = max(xx);
ky = max(yy);

rx = randopt_ari(xx, yy);       % initial ARI

% Begin exchange algorithm here. Each cycle evaluates every single-object
% relocation in either partition and applies only the best one.
trig = 0;
while trig == 0
    trig = 1;                   % If at least one improvement is found
    rb = rx;                    % during a cycle, then trig = 0

    % candidate relocations in the first partition
    for ii = 1:m
        for k = 1:kx
            if k == xx(ii)
                continue
            end
            x = xx;
            x(ii) = k;
            % skip moves that would break the diameter bound
            if any((x(:).' == k) & (mata(ii,:) > diama))
                continue
            end
            rx = randopt_ari(x, yy);
            if rx > rb
                rb = rx;
                isel = ii;
                ksel = k;
                icol = 1;
                trig = 0;
            end
        end
    end

    % candidate relocations in the second partition
    for ii = 1:m
        for k = 1:ky
            if k == yy(ii)
                continue
            end
            y = yy;
            y(ii) = k;
            if any((y(:).' == k) & (matb(ii,:) > diamb))
                continue
            end
            rx = randopt_ari(xx, y);
            if rx > rb
                rb = rx;
                isel = ii;
                ksel = k;
                icol = 2;
                trig = 0;
            end
        end
    end

    if trig == 0
        if icol == 1
            xx(isel) = ksel;
        else
            yy(isel) = ksel;
        end
        rx = rb;
    end
end
newa = xx;
newb = yy;
ari = rb;

function r = randopt_ari(x, y)
% RANDOPT_ARI computes the adjusted Rand index between label vectors x and y
% from the four pair counts (same/same, same/different, different/same,
% different/different).
m = length(x);
n = m.*(m-1)./2;
a = 0;
b = 0;
c = 0;
d = 0;
for i = 1:m-1
    for j = i+1:m
        if x(i) == x(j) && y(i) == y(j)
            a = a + 1;
        elseif x(i) ~= x(j) && y(i) ~= y(j)
            d = d + 1;
        elseif x(i) == x(j) && y(i) ~= y(j)
            b = b + 1;
        else
            c = c + 1;
        end
    end
end
r = (n.*(a+d)-((a+b).*(a+c)+(c+d).*(b+d)))./(n.*n-((a+b).*(a+c)+(c+d).*(b+d)));

% Program 4: BBDOM.M
function [domindex,permopt, lindx, con, incon] = bbdom(confusion_matrix)
% BBDOM.M uses a branch-and-bound algorithm to find a reordering of the
% rows and columns of an asymmetric matrix so as to maximize the sum of the
% elements above the main diagonal of the reordered matrix. The CPU time
% for this algorithm begins to accelerate for matrices larger than 20x20.
% Outputs:
%   domindex - the maximized sum above the main diagonal
%   permopt  - the permutation that achieves it
%   lindx    - domindex divided by the sum of all off-diagonal elements
%   con      - number of pairs with reordered(i,j) > reordered(j,i), i < j
%   incon    - number of pairs with reordered(i,j) < reordered(j,i), i < j
% Elapsed time is displayed through tic/toc.

tic;
a = confusion_matrix;
[n, n1] = size(a);
if n ~= n1
    error('bbdom:notSquare', 'The input matrix must be square.');
end
a(1:n+1:n.*n) = 0;              % zero the main diagonal

% b(i,j) = max(a(i,j),a(j,i)) is the most any ordering of i and j can add
b = zeros(n, n);
for i = 1:n-1
    for j = i+1:n
        b(i,j) = max(a(i,j), a(j,i));
        b(j,i) = b(i,j);
    end
end

nreps = 100;                    % Total number of replications
domindex = 0;                   % Best z-value across all replications
for rep = 1:nreps
    s = randperm(n);                    % Create a random permutation
    z = sum(sum(triu(a(s,s))));         % Sum above the main diagonal
    dtarg = 1;
    % Perform pairwise interchange to guarantee a
    % local optimum with respect to that operation
    while dtarg > eps
        dtarg = 0;
        for i = 1:n-1
            for j = i+1:n
                delta = a(s(j),s(i)) - a(s(i),s(j));
                for k = i+1:j-1
                    delta = delta-a(s(i),s(k))+a(s(j),s(k))+a(s(k),s(i))-a(s(k),s(j));
                end
                if delta > dtarg
                    dtarg = delta;
                    z = z + dtarg;
                    s([i j]) = s([j i]);
                end
            end
        end
    end
    if z > domindex
        domindex = z;
    end
end

% Start the exact search just below the heuristic value so that a
% permutation at least as good as the heuristic one is always installed.
z = domindex - 1;
q = zeros(1, n+1);              % q(m) = object placed in position m
s = zeros(1, n+1);              % s(i) = 1 if object i is currently placed
m = 1;
q(m) = 1;
s(1) = 1;
trig1 = 0;
trigend = 0;
z1 = 0;                         % contribution of the placed objects
for j = 1:n
    if j ~= q(m)
        z1 = z1 + a(q(m),j);
    end
end
while trigend == 0
    if trig1 == 0                       % advance pointer
        m = m + 1;
        trig1 = 1;
    end
    q(m) = q(m) + 1;
    if s(q(m)) == 1                     % redundancy
        continue
    end
    if m == 1 && q(m) > n               % terminate
        trigend = 1;
        continue
    end
    if m > 1 && q(m) > n                % retract
        s(q(m)) = 0;
        q(m) = 0;
        m = m - 1;
        % remove the contribution that was added when q(m) was placed
        for i = 1:n
            if s(i) == 1 && i ~= q(m)
                continue
            end
            z1 = z1 - a(q(m),i);
        end
        s(q(m)) = 0;
        continue
    end
    if m == n
        % complete permutation: evaluate it directly
        zbd = 0;
        for i = 1:n-1
            for j = i+1:n
                zbd = zbd + a(q(i),q(j));
            end
        end
        if zbd > z
            z = zbd;
            x = q;
        end
    elseif m == 1
        z1 = 0;
        for j = 1:n
            if j ~= q(m)
                z1 = z1 + a(q(m),j);
            end
        end
        trig1 = 0;
        s(q(m)) = 1;
    else
        % adjacency test: swapping the last two placed objects must not
        % be better (ties are broken by object index)
        if a(q(m),q(m-1)) > a(q(m-1),q(m))
            continue
        end
        if a(q(m),q(m-1)) == a(q(m-1),q(m)) && q(m) < q(m-1)
            continue
        end
        % insertion test: moving q(m) to an earlier position must not
        % be better
        rtrig = 0;
        for mm = m-2:-1:1
            rdx = 0;
            for i = mm:m-1
                rdx = rdx + a(q(m),q(i)) - a(q(i),q(m));
            end
            if rdx > 0
                rtrig = 1;
            end
        end
        if rtrig == 1
            continue
        end
        % z2: contribution of q(m) relative to the unplaced objects
        z2 = 0;
        for i = 1:n
            if s(i) == 0 && i ~= q(m)
                z2 = z2 + a(q(m),i);
            end
        end
        % z3: upper bound on the contribution among unplaced objects
        z3 = 0;
        for i = 1:n-1
            if s(i) == 1 || i == q(m)
                continue
            end
            for j = i+1:n
                if s(j) == 1 || j == q(m)
                    continue
                end
                z3 = z3 + b(i,j);
            end
        end
        if z1 + z2 + z3 > z
            trig1 = 0;
            s(q(m)) = 1;
            z1 = z1 + z2;
        end
    end
end
x(n+1) = [];
reordered_matrix = a(x,x);
summat = sum(sum(a));
lindx = z./summat;
con = 0;
incon = 0;
for i = 1:n-1
    for j = i+1:n
        if reordered_matrix(i,j) > reordered_matrix(j,i)
            con = con + 1;
        end
        if reordered_matrix(i,j) < reordered_matrix(j,i)
            incon = incon + 1;
        end
    end
end
domindex = z;
permopt = x;
toc;

% Program 5: BBSUBSET.M
function [subset, index] = bbsubset(a, num_subsets, sub_size)
%BBSUBSET.M: This program reads an n-by-n matrix 'a' and will extract
%'num_subsets' each of size 'sub_size' based on the objective of maximizing
%the within subset sums of matrix elements minus the between subset sums of
%matrix elements. The program is fast when sub_size = 2, but can be much
%slower for sub_size > 2.
%Outputs:
%  subset - 1-by-(n+1) vector; the first num_subsets*sub_size entries list
%           the selected objects subset by subset, the rest are zeros
%  index  - objective value of the returned solution
%Elapsed time is displayed through tic/toc.

tic;
[n, n1] = size(a);
if n ~= n1
    error('bbsubset:notSquare', 'The input matrix must be square.');
end
c = num_subsets;
gs = sub_size;
targ = c.*gs;                   % total number of objects to be selected
if targ > n
    error('bbsubset:tooManyObjects', ...
          'num_subsets*sub_size cannot exceed the number of objects.');
end

% tau(p) is the subset to which position p of the solution belongs
tau = reshape(repmat(1:c, gs, 1), 1, []);

% list every pair (i,j), i < j, together with its matrix element
npairs = n.*(n-1)./2;
r = zeros(1, npairs);
pair1 = zeros(1, npairs);
pair2 = zeros(1, npairs);
ict = 0;
for i = 1:n-1
    for j = i+1:n
        ict = ict + 1;
        r(ict) = a(i,j);
        pair1(ict) = i;
        pair2(ict) = j;
    end
end

% This block is designed to establish bounds for the within subset sums.
% Pairs are sorted by decreasing element; delta(p) is the number of
% within-subset pairs formed by the first p positions.
r = -r;
[r, idx] = sort(r);
r = -r;
pair1 = pair1(idx);
pair2 = pair2(idx);
delta = zeros(1, targ);
ws = 0;
ic = 0;
for k = 1:c
    for h = 1:gs
        ic = ic + 1;
        ws = ws + (h - 1);
        delta(ic) = ws;
    end
end
within_terms = c.*gs.*(gs-1)./2;

%This block is a greedy heuristic for obtaining an initial lower bound for
%the within subset sums minus the between subset sums.
sel = zeros(1, n);
prt = zeros(c, gs);             % prt(k,:) holds the members of subset k
for k = 1:c
    % seed subset k with the best available pair
    amax = -99999999;
    for i = 1:n-1
        if sel(i) == 1
            continue
        end
        for j = i+1:n
            if sel(j) == 1
                continue
            end
            asum = 0;
            for h = 1:k-1
                for l = 1:gs
                    asum = asum - a(i,prt(h,l)) - a(j,prt(h,l));
                end
            end
            if a(i,j) + asum > amax
                amax = a(i,j) + asum;
                isel = i;
                jsel = j;
            end
        end
    end
    prt(k,1) = isel;
    prt(k,2) = jsel;
    sel(isel) = 1;
    sel(jsel) = 1;
    % fill the rest of subset k one object at a time
    for i = 3:gs
        amax = -99999999;
        for j = 1:n
            if sel(j) == 1
                continue
            end
            asum = 0;
            for h = 1:i-1
                asum = asum + a(j,prt(k,h));
            end
            for h = 1:k-1
                for l = 1:gs
                    asum = asum - a(j,prt(h,l));
                end
            end
            if asum > amax
                jsel = j;
                amax = asum;
            end
        end
        prt(k,i) = jsel;
        sel(jsel) = 1;
    end
end
% objective value of the greedy solution
z = 0;
for k = 1:c
    for i = 1:gs-1
        for j = i+1:gs
            z = z + a(prt(k,i),prt(k,j));
        end
    end
end
for k = 1:c-1
    for l = k+1:c
        for i = 1:gs
            for j = 1:gs
                z = z - a(prt(k,i),prt(l,j));
            end
        end
    end
end
z = z - 1;                      % start just below the greedy value

%The branch-and-bound algorithm begins here
q = zeros(1, n+1);              % q(p) = object placed in position p
s = zeros(1, n+1);              % s(i) = subset of object i (0 if unplaced)
test_sum = 0;                   % objective value of the partial solution
tsum = zeros(1, n);             % tsum(p) = contribution of position p
m = 1;
q(m) = 1;
s(1) = 1;
trig1 = 0;
trigend = 0;
while trigend == 0
    if trig1 == 0                       % advance pointer
        m = m + 1;
        trig1 = 1;
    end
    q(m) = q(m) + 1;
    if s(q(m)) ~= 0                     % redundancy
        continue
    end
    if m > 2
        if tau(m-1) == tau(m) && q(m) < q(m-1)      % redundancy
            continue
        end
        if tau(m-1) ~= tau(m) && q(m) < q(m-gs)     % redundancy
            continue
        end
    end
    if m == 1 && q(m) > n               % terminate
        trigend = 1;
        continue
    end
    if m > 1 && q(m) > n                % retract
        s(q(m)) = 0;
        q(m) = 0;
        m = m - 1;
        test_sum = test_sum - tsum(m);
        tsum(m) = 0;
        s(q(m)) = 0;
        continue
    end
    % contribution of q(m): plus within-subset, minus between-subset terms
    for i = 1:m-1
        v = q(i);
        if tau(i) == tau(m)
            tsum(m) = tsum(m) + a(v,q(m));
        else
            tsum(m) = tsum(m) - a(v,q(m));
        end
    end
    test_sum = test_sum + tsum(m);
    % Bound: add the w largest elements among pairs that could still end
    % up within a subset. Between-subset terms are not included in the
    % bound.
    w = within_terms - delta(m);
    rbb = 0;
    isum = 0;
    isel = 0;
    while isel < w
        isum = isum + 1;
        pr1 = pair1(isum);
        pr2 = pair2(isum);
        if s(pr1) ~= 0 && s(pr2) ~= 0
            continue
        end
        if s(pr1) > 0 && s(pr1) ~= tau(m)
            continue
        end
        if s(pr2) > 0 && s(pr2) ~= tau(m)
            continue
        end
        rbb = rbb + r(isum);
        isel = isel + 1;
    end
    if test_sum + rbb > z               % Partial solution passes so
        if m == targ                    % update incumbent if complete
            z = test_sum;
            partition = q;
            test_sum = test_sum - tsum(m);
            tsum(m) = 0;
            continue
        end
        trig1 = 0;                      % Otherwise, branch forward
        s(q(m)) = tau(m);
    else
        test_sum = test_sum - tsum(m);
        tsum(m) = 0;
    end
end
subset = partition;
index = z;
toc