-V ,."". -,,. , ,i :::,_- : Il .,-I,,,,, ,,-,` I - -,-.." - I-11I-I,II : - . ,--,F-.I -,.--, "-,,, , _a11 ,-, 1,-, -,- : I.;- --:,;-I, I-..,-., , --. : --I:.- --,l--,p, :-"-.; ., ,.-,-.,,- 7-,-,q. ----::, I IC ,11"; ,-, I:I- -,. -I II -.. ,,,-..1I--,., ,71-,,- r . ---1 .:-.,,.I---II,-, . ,: --. ,1, -,: ",-,,-,-, 11,-I.I,".:-,-,,i, ,-I ,',..I 1, --,,--,"-, :III,,,-,, -,,il ',.,--,,-,,,-.:,:,- -, -' , -, ,I-,: ' , ,"--,,- --. -, ,--,",-- -:,I l -,--,,;-,. - ,l-., -, ,,--, 7-:';:-,-', ,,:'.,----, : -,-,,-,1,, ,,-,-, " --. ., ,,.,,.,,,, --. -,,:". '--'--,`---; , Z":f, ",, ,-, ;:, -,,,--_,, - ,K'-, ,-rII ; -,,-,,,, ,,r'-- , --2, 2,;, F,,,,,, ,.F,,-,,I ,,.--,.,',-. ,,--I" . --",-' --,--..',,,.-, I w1-1:.,, ,, -.-A -, -, -,,fJ,.I----,-,-, - ,-"-, ,II," , `,, -,," -,, -,--,,- .......L-,,, .- ::-I--,,,:---wII--- -,, .- --', ,--, I,---::-,. ,7 -I'I -,,, , !,' ,--,,-, .-:, ,-,,,-," , , -, -,-_, ;,. ,--r"-,,-,,,--,, :--- ,,,---,-,,,,, -.i ,-- ,,,, -.,4--,.,r-rr-,.:C----:-. -'I---,- --: ,." .7,", ,,I,,.,-----7. 1,,,,,-I,, -,! ,,--,,,-, ,, ,, '.-,-,,-,---,I:.-4. -I,,:--, -. ",---:,,.,, ,-I ,".,,-, -,,,--' 4-, 4,, -, -,---,-I"'-,-w,,-, ----, ;'--, "--': ,-` ",--,,, -,------ ,--,----I-': -, ,,,,, -,,-: -,,i,Z,--".-, I .., ,-:,-.,-,"---' it ,I-,, , -,,, ,-C,,,,,I -,,"' :,--,, ,, , --.-,I---,,,,-I-, ,--L-,II ,---"." "I""; , ,: -'----.,--I.--- ' ,-,,.,-,-,-, , ,,,`,'i-, ,, ---_,,, -',--', " , ----,, ,i,,-,,,...- -.,-, '. ,"". -,,:------,,,I , 3' -. ,o , , , -II,-, -.-,; -, ',,-_, a--i ; -. "., -,-, _-,,--4, ,,,, ,,--,-4,-,,7`A , , ,, ,; ., ,,-.I-,-"' -,,i... --,.,,.1i "I,,- .-----11" -, -",, ,,,,-I-, " ,; ,,,, ,, ",, ,,, ,,., : ". ; ,-Z--::-,',-',, , .-- ----.--' .,-,",,, .i-; . ,-` -:"--.-,I--,:C,I ,',,, ':-, -: ,.,-, ,,,' -'," , -,---ml--, - -, , ,,' -r ,", ,--. ,,,,,, ,.., -, ,-,,., ,, -,-,:,,-'.-!Z,,I-1,111` ,---,- ,-," ,----- ,,,-, ,,-", ,-.-, ,:":,--, , ,---':-:.---.,,-,-',--F7 ",, -I-, I,-,,--,----,', -:,-',',, -,,''-,-P".,`,'-",-_ ,"'-"" ,,-I. ,Zi --,,-,'-",,',-,;. ,,," ,-,. 
. 6,-,-.., ,-' ,-: -., -, ,'--,--" -4., 7z ,,.,,-,,-----", ,, ,-,,', , -1, ,' --, ,I- I,-i- ,,", --`J, lr --,,-": - ,-,,,---"----;, ',, ,,: 1, -̀-,.',", !I,, ,-':-,-,-,I:-"',-,-"'- -, -- ,!-----,--, ,,-.,-,.-, -- zt,-,'.-,,,',, ,` W, , --,,.,-, ,", . ---,,,---,,,,, -, ,----,,,,-: -,-I --F ,,,, -,"", , , --"-,':-',, -, --,,, ,",--,,,-----'. ,.,-,,,, , -1----.' `1,,-,,,.-F--: --.,,, .., ",,-, 7, " `-1 ,,-,, ,.,-" -,,e.,-f---,---. : ,.-"---,-,,7. ,---. ,-, ,, ,--,, , --. '.,,-,-, -, ,-.',-----.,,.-,,,,-",-,,, -,-I, -" "- .I,-,,,,--, .',--,-,, ,. "-", -1 '-, ,--" II--`", ,;' ,,.,, -,--I',-,,C--',', , - , ", ""' ,-:-----e-,,,-,.'",, ,_ -I-, 7,-,A' -,-,,,,I, `,V ---- .1:, ,-, ---,-', ----,7 ';z,-, , -,., ,--,,-,,',,,-, '. ,,",, , St', -;',,,;, -,-, ,-,-,., '.-I -V ,,l ,', ,A .-,-,,-. ll' --; ','.,,;, ', ;,", , ---" ------:.,-,,-,7,---z ---."-,, , -', ,:; -,,,-_: : ---,,-----l---;-----F, ,, ,, ,-; "-,,,-;, ,,,,,.-11 , , -,i ,-.. .,-,---, : " ""',-,---,",,,, ,, . ", ,I---,',-,-, .-- ,, ----,-,,-", .z,--.,,- ,,i-:!-,. - I,-,,, ,' ,,--,--,, ,,.-, -, ,,-, ,,F: ,,,,,,, ,,"'.,-, ,,:,,.-"-,-,-,,,-----,-,--,,.,-, -, ,,, ,-, --",-,z -, ., -,-,.- ... ".,,,,,-,I -4 ,,, ,,",--, 1 ,i-,. ,:", -:. '-' , F, ,-%'-,' , '., ,, --' ----,"'--,--,,::; --,,,-,, "-, ,,, ,--,-,-, !, -__1 ,-. -.-. 0-, 1,I-,"", -,,,,-? """ I-1"',, "' -'l-,"-- ,.,. ,-" I:-, I --.,-"-, '--,: ,-,,-,-,-.,,--. " -,-,,-6,-., .", - -,,". ,N' -,-, ,_ -1 , --,F!'-I.---"-4--,,,----,, - , --II, -:,.,, -,N.T -,,,:-,"-----. ,,, - .-, --,,,--; ,I.,_"-"--.",...i,,,.,,, ,,-, ,,,-'.--,,,II.-,"A -e-,-, , ,", -II"--.,-,,.I,, _0 ,.. J,---,,.,--,,-,C,,--',i -I I,,', , ,,--,--,,,----,.,i.,,,.',",,,,'-, --,- , , ,,_",-".. ,-'---,. -,,--, .,,,--,-.,,. -,- , "! -- , - -. ,,,,,,, ---. , .:, .-,-,,--,,-,,,-,--,-, " -:,-., ,; ., ---- -,,, , ---,, -,---"-,,-,--,, -..,I-,, ,- ", --- -"., ,,'l---'llll;,. -,-, -,-,-,. -, ,-,, -,i-1- ," 6-,,,",,", ;_. " ----,".o : "-,-, " `-W-1"-,-,'--,,`-, -I---`,-, ,-------,,---, , ,.-,." ,-,-,,---,:", .---. 
,-,-,--- ---:...:', ,- ,,, -.,-,_,--,,-,,,-,:-Z-----,T-.,.---" -., ,-'. ,. --I.,,, ---. ,-, -,,-,-, . I,, -,,,-,-,.-11-.-,, ..-,,-. .,, -- , -,J,,",-,,,,: '. ,--"--t,----,,,. :.- ,-, ,,. ,j,,,,,,-----, ,-I -,;'.-'--, , `,,, -, .,11,.1--I ,", :-, ,- , -,,,, ,"-,.,, :.. - --,, ", ,'i,,,- -, q-,r.,----,,." -,-,, -,, ,....! -.", '-i---,-.", -,-,,1,--:-"-,; ,,,, ,-,j,-,,. M, ,.-,,-, 1, ,,,,,,, ---," -,--, -1 ,.,-,--: -1 ",." 1, . -. -,,--e -, -',, -_,F,:,"', -,.", ,,,.,, -,,-'i',,, --, ,,,-,-"", ,"', "-, -vI ,-,I--,-,-,,-,,--.,, -: ,'.-, ,i, .:-,,,,,--,, , ,,.,., ,Z f '. -,-,, ,, ---"-,-, .,-,,,-_i5 -, , -. : '-,. "---L-,,----, -, :,- ," ",,,,---,, ,I -; i. -II -, , -, "d ",,-,--------l -: , -", ,--.. t', , ,, I, , ,-FI-, ,;.,; ,,, ,-, , .I ".1 . ,, ',----,I.I.,-,, " -,;:z , ,; .- --,, ---,, -... ` ,-7," ,-; ,:, -, ,,--",,-,-.,... .,,,,-, ---,.-F-., - "--,- ,,, -", ..',--,' ,,-I --',,'.--,I,-,.-III--, %,-----,Z --, :,, .. "-, ,:F,,,-,Z,- !-, ,-.,,,-"- ,, -," ",,, ,,.,--, ,, '-' ' .,:r ' ,' I,'.,-. 7:-,: `,. 2Y'I' -, ,-, r.,-',, ,:i -, " -;,1,4,, ,--, , .-i:,,,,,.,I-I".. , . I. 1.-Al ; "-. -',,,.'- ,,,,-, o Iri -:--,, :--,,,,I-,, ---"--'-.'.,, ,,, " .-",,, ,,--.,-, ,:i -,,-.;I-,--- --.-,,--'] ,, -, --,--,--"-, -,"", ,. "',_ ,-- -- - i ", ,,-- ,:,I -`,-r ,,-T1, I '-',,','_,,"i--j,;.-,, =,----, -'. 4 ,!-,; -,,-.-,.I.",,4, 1 ,--.,.1,,.,. --_,--,----I ,---T,,-,:--,,,-,----I- . ...,I- I-.-,-,,II-',-- , -,-,' '. .,,-,,, ,,-T,.-, ,,,,") ,,, ,- --, ,t I-,-, -. - " ,-, "-- , ----. -,---,,,,,,-:T ,,,,;, -,, -,-,--, --,, 3.;;,411 ,":, -. ,,-,,; -X: ,,I',- I'; -,.-.-,-, - --"-, --,,--, ---t, ,-, _-::-,,-:,T -1.--'-,, -,"-, --,,;--,5 ,- ,-,:"-w--7,,-",-,, ..-, ': ,,,,,,-.,I-, -,,-"--,--.T, , ",,.,. ,., .'--,--, -',, ,,j, , -. ' -- ,;I -.:, ,----. 1, 4,, , , ,-.,-F7,--,, -.-,!,--1, -. ;,--,. ,-,,,,. ,,, -,',-,, ,-,,Z,1, F-,,---Z :, ", ,-..? , ,,.-u,,, -:,li 1--, ,.., ,,, , I ,--, -, . Z,, ,1,, ,, .11 -,. .,--i,',, -.: , t,I,,., ! 
;;,, ,--, , b' ,,I ., -k', -j-,-!-- ,7 -, -;, i,, --,--,-.--,,,,-,-,-,,," `-` :.,,.I,,,,t,,,",,I--,,. -,,,-,--,--I,-1. , ,--,,-,, ,.- 'r-,---- ----,A,-".,,lj.-, , I:, .-- :.: ,-,,- I,.q,I ---, -4.,t , I-,-,"! ".,,", -,,-- -'.`z...,.,, --,-,i-.-,--, , -,-,-, ",.",-: ;,,':, ", -',--,,,,-,-,,, ,,, "-, , -4 -,--,-, .---, , ".,----, ,' "-,,--, -'F ,,--"--,-," .',:, -'-,"".,'7,,. : _- , , ,-,..J,--- -- I---,,,W,,26 ,'I-", -, -,7" ..--'-/-.-, ,-,, , .;Z;, .,Vl ,..,,.,-, ,; T---:.-,--, -,-,--, -. ",----, ,Z:` -, -.- -, ,,--;-.' --, -i,' -,- 1,; , ,'. :, "--"-, .,, .,,, --, .J, -,--' ,,", ,- ",", -i, -,,,. ee, _,,-,-" ,..,,-,,-,,,,,,-. ,.-C-,:; .Ir, I-,. -,-,--, , ,-, : -,,",,-,, -, -,,,,,, -,,,,11-- ,-, ,,, ------,,,, , " .--, ,-"I-I ,-,: -. J, ,-, ,. 1-,,", ,'F,-i' I i- IK-,,-, --, "', ;",.,-, `,',"-., ",t. , ', -, ,: , I,, , , , ,, " -,, ,',,, -", --,i. ,",-: ",,, -,--z,-,,,--,-.,,11 - ,.,:-:, -,:-," ,, -- -, -,, .-. L.,.- -,.; ,-.-,.- ,,-, ., ,';', -,,-:,. i.,.,,, -I',-7-" .II,--, --'-,._ _ ,_-- ",,,--, -,-,-, - -`-, - --, ,,, .I-, ,I j,,-,' '',,--,,:-,., - -,Z' , , ,---_,,; -I--,,.,:." ,.. i.., ,.'-- : - ,,., -- ---, F""'-Z,-I----,--,--, ,j,-,o, V, ,.-,.,,',----, i-10"- -',,F,,-, , . ,,,ie" ", -,.--1,,. _,, , :" ,.,, -,...,--. , , -, ,-,--,,,-,-, -. "--, ,,, -,, ,,-,. ,-,. ,-,,..., -,a-V-,"_--, . . -,,: ,,,,,,-,-_--, .II-,-. --,: .:,,-P.,-.:. . -, ,,:Z, , " ,",- -,,--,2',-, -,-- --,. ",-,,--, --. - ', ';-, - :, -, -,,- -,;', -,I. ..--.-,--, -, - ,;__ "7 `"-,, -': ";-, "",, 7,t,'F--,, j: -I-I -, 1, , ,,'' ,-- - ;--!--,I., F--. ,.,: -,',-, ,-, ":, ,,,--Ank',,, ",----,, ;,!,:-,- - , ", ,,',, --,,, -I.I:,., " --.,.- , ,-" :",,.,;., , ; -- --,_ _., ., -.,I ,-_'' -,,--,,-,-, -. -,-, ,-,,,,, -., --, ---. -S.,-,.. --,-,p ,:- -,-- Z --'. -,,, Z, -,.-, -;-,, ,_,,;,-,,.,,, -_,-,--, ,,I` -, . -,- Z,, I-. .-, ,- ,Z, e ',--,' - ,-. -,.II-.--;,I-, .2 -4, , , ,, ;-Z"",---., , :----,F'-.,,--. -,., I-.i ,-,' 1,t--"", "..,-'; ., .I ... I ," ,-, -:v".1; , , 1. 
An Optimal Parallel Implementation of a Quadratic Transportation Algorithm*

Mike McKenna
Thinking Machines Corporation
245 First Street
Cambridge, MA 02142.

Stavros A. Zenios
Decision Sciences Department
The Wharton School
University of Pennsylvania, Philadelphia, PA 19104.

March 7, 1990

*Research partially supported by NSF grants ECS-8718971 and CCR-8811135, AFOSR grant 89-0145 and Thinking Machines Corporation. We would like to acknowledge several useful discussions with Jill Mesirov.

Abstract

We discuss the implementation of a quadratic transportation algorithm on massively parallel computer architectures with hypercube communication networks. The implementation is optimal in the sense that it requires effectively $O(m_O m_D / P)$ operations to perform one iteration of an $m_O \times m_D$ problem using $P$ processors. Peak computing rates of 3 GFLOPS are achieved by the algorithm on a 64K Connection Machine CM-2.

1 Introduction

We consider in this paper quadratic optimization problems over transportation constraints. Such problems find applications in logistics, traffic management, and matrix balancing models in economics and regional planning. The dual, row-action algorithm of Zenios and Censor [1989] is well suited for implementation on massively parallel computer architectures.
In this report we describe an optimal implementation of this algorithm on a massively parallel architecture with a hypercube communication network. The implementation is optimal in the sense that it requires effectively $O(m_O m_D / P)$ operations to perform one iteration of an $m_O \times m_D$ problem using $P$ processors. Computational experiments on a Connection Machine CM-2 indicate that the implementation achieves a computing rate of 3 GFLOPS when solving dense $1024 \times 1024$ problems on a 64K system.

2 The Quadratic Transportation Algorithm

Let $\langle m \rangle$ denote the set $\{1, 2, 3, \ldots, m\}$. Denote by $\mathbb{R}^m$ the $m$-dimensional Euclidean space. A transportation graph is defined as the triplet $G = (V_O, V_D, \mathcal{E})$, where $V_O = \langle m_O \rangle$, $V_D = \langle m_D \rangle$ and $\mathcal{E} \subseteq \{(i,j) \mid i \in V_O,\; j \in V_D\}$. $V_O$ and $V_D$ are the sets of origin and destination nodes, of cardinality $m_O$ and $m_D$ respectively. $\mathcal{E}$ is the set of $n$ directed arcs $(i,j)$, with origin node $i$ and destination node $j$, which belong to the graph; $n \le m_O m_D$. Let also $x = (x_{ij}) \in \mathbb{R}^n$ be the vector of flows; $u = (u_{ij}) \in \mathbb{R}^n$ be the vector of upper bounds on the flows; $s = (s_i) \in \mathbb{R}^{m_O}$, for $i \in V_O$, be the vector of supplies; $d = (d_j) \in \mathbb{R}^{m_D}$, for $j \in V_D$, be the vector of demands; $\pi^O = (\pi_i^O) \in \mathbb{R}^{m_O}$, for $i \in V_O$, be the vector of dual prices for origin nodes; $\pi^D = (\pi_j^D) \in \mathbb{R}^{m_D}$, for $j \in V_D$, be the vector of dual prices for destination nodes; $r = (r_{ij}) \in \mathbb{R}^n$ be the vector of dual prices for the bound constraints; $\delta_i^+ = \{j \in V_D \mid (i,j) \in \mathcal{E}\}$ be the set of destination nodes which have arcs with origin node $i$, and $\delta_j^- = \{i \in V_O \mid (i,j) \in \mathcal{E}\}$ the set of origin nodes which have arcs with destination node $j$. With this notation we define the quadratic transportation problem as follows:

$$\text{Minimize } F(x) = \sum_{(i,j) \in \mathcal{E}} \left( \frac{w_{ij}}{2} x_{ij}^2 + c_{ij} x_{ij} \right) \qquad (1)$$

Subject to:

$$\sum_{j \in \delta_i^+} x_{ij} = s_i, \quad \forall i \in V_O, \qquad (2)$$

$$\sum_{i \in \delta_j^-} x_{ij} = d_j, \quad \forall j \in V_D, \qquad (3)$$

$$0 \le x_{ij} \le u_{ij}, \quad \forall (i,j) \in \mathcal{E}. \qquad (4)$$

where $\{w_{ij}\}$ and $\{c_{ij}\}$ are given positive real numbers.
The following dual, row-action, algorithm for this problem was developed in Zenios and Censor [1989], where its asymptotic convergence was established.

Step 0: (Initialization) Set $k = 0$. Get $x^0$, $(\pi^O)^0$, $(\pi^D)^0$, $r^0$ such that:

$$x_{ij}^0 = -\frac{1}{w_{ij}} \left[ c_{ij} + (\pi_i^O)^0 + (\pi_j^D)^0 + r_{ij}^0 \right]. \qquad (5)$$

Step 1: (Iterative step over constraint set (2)).

$$\rho_i^k = \frac{s_i - \sum_{j \in \delta_i^+} x_{ij}^k}{\sum_{j \in \delta_i^+} 1/w_{ij}}, \qquad (6)$$

$$x_{ij}^{k+1} = x_{ij}^k + \frac{\rho_i^k}{w_{ij}}, \qquad (7)$$

$$(\pi_i^O)^{k+1} = (\pi_i^O)^k - \rho_i^k. \qquad (8)$$

Step 2: (Iterative step over constraint set (3)).

$$q_j^k = \frac{d_j - \sum_{i \in \delta_j^-} x_{ij}^k}{\sum_{i \in \delta_j^-} 1/w_{ij}}, \qquad (9)$$

$$x_{ij}^{k+1} = x_{ij}^k + \frac{q_j^k}{w_{ij}}, \qquad (10)$$

$$(\pi_j^D)^{k+1} = (\pi_j^D)^k - q_j^k. \qquad (11)$$

Step 3: (Iterative step over constraint set (4)).

$$\sigma_{ij}^k = \operatorname{mid}\left\{ r_{ij}^k,\; w_{ij}(u_{ij} - x_{ij}^k),\; -w_{ij} x_{ij}^k \right\}, \qquad (12)$$

$$x_{ij}^{k+1} = x_{ij}^k + \frac{\sigma_{ij}^k}{w_{ij}}, \qquad (13)$$

$$r_{ij}^{k+1} = r_{ij}^k - \sigma_{ij}^k. \qquad (14)$$

Step 4: Replace $k \leftarrow k + 1$ and return to Step 1.

3 An Optimal Implementation on the Connection Machine

The key to the implementation of the algorithm on the Connection Machine CM-2 hypercube is the configuration of the processing elements into a two-dimensional grid. If the grid size is larger than the number of processing elements then we configure the machine into an $m_O \times m_D$ grid of virtual processors (VP), and each physical processing element performs the work of several virtual processors. One dimension of the grid is set equal to the number of origin nodes $m_O$ rounded up to the nearest integer that is a power of two, and the other dimension is set equal to the number of destination nodes $m_D$ rounded up in the same fashion. Virtual processors outside the $m_O \times m_D$ grid are disabled and do not participate in the computations. The memory of the virtual processor with grid coordinates $(i,j)$ is partitioned into the following data fields: (1) supply and demand, $s$ and $d$; (2) dual prices, $\pi^O$, $\pi^D$ and $r$; (3) upper bound $u$ (the lower bound $l$ is assumed to be equal to zero and hence it is treated as a constant); (4) current iterate $x$; (5) a scaling factor or intermediate results; and (6) two fields IW and OW used to store the terms $1/w_{ij}$ and $1/\sum_j (1/w_{ij})$, respectively.
The algorithm is executed as follows:

Step 1: A spread-sum operation along the rows of the grid computes the partial sums $\sum_{j \in \delta_i^+} x_{ij}^k$ for each origin node (i.e., each row of the grid). This result is spread to the scale memory fields of all VPs in the same row. A sub-mult operation (i.e., an optimized implementation of $a(z - b)$) is used to compute the scaling factor $\rho$ (equation (6)). The scaling factor updates the local field $x$ by addition and the local field $\pi^O$ by subtraction (equations (7) and (8) respectively).

Step 2: This step is similar to Step 1, with a spread-sum along the columns of the grid.

Step 3: A combination of min and max operations computes the mid(·) of equation (12), which is then used to update the local field $x$ by a mult-add operation (equation (13)) and the local field $r$ (equation (14)).

3.1 The Hypercube Implementation

In this section we describe how Steps 1–3 of the algorithm are implemented to run on $P$ physical processors of the CM-2 hypercube, so that the execution time is in effect $O(m_O m_D / P)$. To simplify our discussion, assume that $m_O$, $m_D$ and $\sqrt{P}$ are powers of two. Let the vertices of the hypercube be labeled with the integers $0$ through $P - 1$, where vertices $i$ and $j$ of the cube are connected with a wire when the binary representations of $i$ and $j$ differ in exactly one bit.

We initially configure the $m_O \times m_D$ grid of virtual processors so that each physical processor is assigned to do the work for an $\frac{m_O}{\sqrt{P}} \times \frac{m_D}{\sqrt{P}}$ subgrid of virtual processors. In figure 1 we have $m_O = 32$, $m_D = 32$, $P = 16$ and each physical processor does the work for an $8 \times 8$ subgrid of virtual processors. In our configuration, we lay out the physical processors in a $\sqrt{P} \times \sqrt{P}$ two-dimensional grid. Let $P_{ij}$ be the physical processor that is in row $i$ and column $j$ of the grid. We place physical processor $P_{ij}$ at vertex $i\sqrt{P} + j$ of the cube. In the example of figure 1, the 16 physical processors are configured in a 4 by 4 grid, and each physical processor is placed at vertex $4i + j$ of the hypercube.
The arcs in figure 2 illustrate how the CM-2 hypercube connects the $\sqrt{P} \times \sqrt{P}$ grid of physical processors. Most of the operations in Steps 1–3 require no communication between virtual processors. For these operations, each virtual processor spends $O(1)$ time to work on its local memory, and each physical processor performs the work of $\frac{m_O m_D}{P}$ virtual processors. Therefore the local operations require $O(m_O m_D / P)$ time.

The operations that do require communications are the spread-sums, where we add the elements in each row of the virtual grid, and copy the results back to every processor of each virtual row. Technically, a spread-sum is an operation where a field $\mathit{scale}_{ij}$ is allocated in each virtual processor, and $\mathit{scale}_{ij} = \sum_{k=0}^{m_D - 1} x_{ik}$. If the transportation graph is not complete, then for each missing edge $(i,j)$, $x_{ij}$ is set to zero before performing the spread-sum. In the rest of this section, we show how a spread-sum is performed, effectively, in $O(m_O m_D / P)$ time.

Let $x_{ij,ab}$ denote the value of $x$ that lies at position $(a,b)$ in the subgrid of physical processor $P_{ij}$. In figure 1, physical processor $P_{ij}$ holds the values $x_{ij,00}, x_{ij,01}, x_{ij,02}, \ldots, x_{ij,07}$, $x_{ij,10}, x_{ij,11}, x_{ij,12}, \ldots, x_{ij,17}$, $\ldots$, $x_{ij,70}, x_{ij,71}, x_{ij,72}, \ldots, x_{ij,77}$.

A spread-sum is performed in three phases. In the first phase, each physical processor finds the subtotal for each row in its subgrid. For example, in figure 1, each physical processor performs the following 8 row summations:

$$x_{ij,00} + x_{ij,01} + x_{ij,02} + \cdots + x_{ij,07}$$
$$x_{ij,10} + x_{ij,11} + x_{ij,12} + \cdots + x_{ij,17}$$
$$\vdots$$
$$x_{ij,70} + x_{ij,71} + x_{ij,72} + \cdots + x_{ij,77}$$

Once these additions have been performed, each physical processor holds a column vector of subtotals (the shaded elements in figure 1). Obtaining these subtotals requires $O(m_O m_D / P)$ time. Let $C_{ij}$ denote the column of subtotals that lies in physical processor $P_{ij}$.
In the second phase of the spread-sum, the processors in each physical row $i$ coordinate to perform efficiently the vector sum $\mathit{SUM}_i = C_{i0} + C_{i1} + \cdots + C_{i(\sqrt{P}-1)}$. The resulting vector $\mathit{SUM}_i$ holds the spread-sums for the rows of the virtual grid that are assigned to physical row $i$. In figure 1, each vector $\mathit{SUM}_i$ would hold the spread-sums for virtual rows $8i$ through $8i + 7$.

Now note that the physical processors $P_{i0}, P_{i1}, P_{i2}, \ldots$ of physical row $i$ all lie in a physical subcube of the machine that spans $\log_2 \sqrt{P}$ dimensions. (In figure 2, the thick arcs show how each row of the physical grid spans a two-dimensional subcube.)

Let $C_{ij,k}$ denote the $k$th element in column vector $C_{ij}$, and let $\mathit{SUM}_{i,k}$ denote the $k$th element in column vector $\mathit{SUM}_i$. Given the $\log_2 \sqrt{P}$-dimensional subcube, we use the procedure listed below to obtain the first $\log_2 \sqrt{P}$ elements of vector $\mathit{SUM}_i$. (Note that the CM-2 hypercube wires are bi-directional and can be activated simultaneously.)

Let $l = \log_2 \sqrt{P}$;
For $k = 0$ through $l - 1$ do
  (S1) Let each processor $P_{ij}$ send $C_{ij,0}$ along dimension $(k + 0) \bmod l$ of the subcube, $C_{ij,1}$ along dimension $(k + 1) \bmod l$ of the subcube, $\ldots$, $C_{ij,l-1}$ along dimension $(k + l - 1) \bmod l$ of the subcube;
  (S2) Let each processor $P_{ij}$ receive a value $\mathit{temp}_0$ along dimension $(k + 0) \bmod l$ of the subcube, a value $\mathit{temp}_1$ along dimension $(k + 1) \bmod l$ of the subcube, $\ldots$, a value $\mathit{temp}_{l-1}$ along dimension $(k + l - 1) \bmod l$ of the subcube;
  (S3) For $q = 0$ through $l - 1$ do: Let $C_{ij,q} = C_{ij,q} + \mathit{temp}_q$;

| Problem | Itns. | VP ratio | Real time (seconds) | CM time (seconds) | Real GFLOPS (64K CM-2) | CM GFLOPS (64K CM-2) |
|---------|-------|----------|---------------------|-------------------|------------------------|----------------------|
| TEST1   | 5000  | 256      | 556.49              | 532.92            | 3.012                  | 3.148                |
| TEST2   | 4600  | 128      | 270.24              | 255.62            | 2.856                  | 3.019                |
| TEST2   | 4000  | 256      | 730.00              | 676.00            | 1.839                  | 1.986                |

Table 1: Performance of the quadratic transportation algorithm using problems of size 1024 × 1024. First and second rows report results with the optimal implementation of the algorithm. Third row reports results using C/Paris instructions from the CM-2 library, release 5.2.
At the end of this procedure, each value $C_{ij,k}$ for $k = 0, \ldots, l - 1$ equals $\mathit{SUM}_{i,k}$. If we repeat this procedure for consecutive groups of $l$ elements along vector $C_{ij}$, then eventually each vector $C_{ij}$ equals $\mathit{SUM}_i$, which completes phase two of the spread-sum.

Each execution of steps (S1) and (S2) requires $O(1)$ time. On a Connection Machine, the execution time for the loop in step (S3) is negligible when compared to the communication time in steps (S1) and (S2); so we treat the loop in step (S3) as an $O(1)$ operation. The outer loop of this procedure iterates $l$ times; so the whole procedure requires $O(l)$ time to execute. The procedure itself is repeated for $\frac{m_O}{l\sqrt{P}}$ groups of $l$ elements; so phase two of the spread-sum requires $O\!\left(\frac{m_O}{\sqrt{P}}\right)$ time.

If step (S3) of the above procedure was not dominated by steps (S1) and (S2), then we would have to say that phase two of the spread-sum requires $O\!\left(\frac{m_O}{\sqrt{P}} \log_2 \sqrt{P}\right)$ time. In that case, we could remove the $\log_2 \sqrt{P}$ factor by using a straightforward variant of the more complex one-to-all broadcast algorithm that is given by Johnsson and Ho [1989].

In phase three of the spread-sum, we copy the column vector $C_{ij}$ (which now equals $\mathit{SUM}_i$) across processor $P_{ij}$'s subgrid, so that each virtual processor holds the result of a spread-sum. This phase requires $O(m_O m_D / P)$ time. Adding the execution times for the three phases of the spread-sum, we get an execution time that is effectively $O(m_O m_D / P)$.

3.2 Numerical Results

We implemented the algorithm of Section 2 as explained in Section 3 in C/Paris, using CMIS instructions, on a CM-2 with 32-bit floating point accelerators. The performance of the parallel implementation was evaluated using two randomly generated test problems of dimension 1024 × 1024. The implementation uses virtual processors at a ratio of 256 and 128 virtual processors per physical processor, on a 4K and 8K CM-2 respectively. The computing rate of the algorithm under different virtual processing ratios is summarized in Table 1.
The last row of the same table provides, as a benchmark, the performance of a C/Paris implementation of the algorithm without using the techniques of Section 3. Observe that the computing rate estimated based on CM time is consistently in excess of 3 GFLOPS. Computing rates estimated based on real time exceed 3 GFLOPS for higher virtual processing ratios. This discrepancy was anticipated since the first phase of the spread-sum requires no communications, while hypercube communications are needed in the second phase of the operation. The C/Paris implementation is significantly slower than the optimal implementation. The difference in the number of iterations between the two runs for problem TEST2 is due to the difference in the terminal tolerance specified. Both implementations achieve identical final error in exactly the same number of iterations.

Figure 1: Virtual Processor Configuration.

Figure 2: Physical Processor Configuration.

4 Conclusions

Gigaflop performance has been quite common in several areas of large scale scientific computing using massively (and other) parallel architectures. Unfortunately such performance is difficult to achieve in numerical optimization. Optimization problems do not usually have nice structures at a micro level, and optimization algorithms need frequent communications. We have shown that for some well structured problems we may overcome the communication bottleneck and achieve performance in the range of 3 GFLOPS. An implementation that is effectively optimal was achieved using a simpler scheme than the one of Johnsson and Ho [1989].

References

[1] S.L. Johnsson and C.T. Ho, "Optimum Broadcasting and Personalized Communication in Hypercubes", IEEE Transactions on Computers, vol. 38, no. 9, Sept. 1989, pp. 1249–1268.

[2] S.A. Zenios and Y. Censor, Massively Parallel Row-Action Algorithms for Some Nonlinear Transportation Problems.
Report 89-09-10, Decision Sciences Department, The Wharton School, University of Pennsylvania, Philadelphia, PA, 1989.