-V .- ,-- : Il

advertisement
-V
,."".
-,,.
, ,i
:::,_- : Il
.,-I,,,,,
,,-,`
I - -,-.." - I-11I-I,II
: - .
,--,F-.I
-,.--,
"-,,,
, _a11
,-, 1,-,
-,- : I.;- --:,;-I,
I-..,-.,
, --.
: --I:.- --,l--,p,
:-"-.;
., ,.-,-.,,- 7-,-,q.
----::,
I IC ,11"; ,-,
I:I- -,. -I II -..
,,,-..1I--,.,
,71-,,- r
. ---1
.:-.,,.I---II,-,
. ,: --.
,1,
-,: ",-,,-,-,
11,-I.I,".:-,-,,i,
,-I
,',..I
1,
--,,--,"-,
:III,,,-,,
-,,il
',.,--,,-,,,-.:,:,- -,
-' , -,
,I-,:
'
, ,"--,,- --.
-, ,--,",-- -:,I l -,--,,;-,.
- ,l-.,
-,
,,--, 7-:';:-,-', ,,:'.,----,
: -,-,,-,1,,
,,-,-,
"
--.
., ,,.,,.,,,, --.
-,,:".
'--'--,`---;
, Z":f,
",,
,-, ;:, -,,,--_,, - ,K'-,
,-rII
; -,,-,,,,
,,r'-- , --2,
2,;, F,,,,,, ,.F,,-,,I
,,.--,.,',-.
,,--I" .
--",-'
--,--..',,,.-, I w1-1:.,,
,,
-.-A
-,
-, -,,fJ,.I----,-,-,
- ,-"-,
,II," , `,, -,,"
-,,
-,--,,- .......L-,,,
.- ::-I--,,,:---wII--- -,, .- --',
,--,
I,---::-,.
,7
-I'I -,,,
, !,'
,--,,-,
.-:,
,-,,,-,"
,
, -,
-,-_,
;,. ,--r"-,,-,,,--,,
:--- ,,,---,-,,,,,
-.i ,-- ,,,,
-.,4--,.,r-rr-,.:C----:-.
-'I---,- --:
,."
.7,", ,,I,,.,-----7.
1,,,,,-I,,
-,!
,,--,,,-,
,, ,,
'.-,-,,-,---,I:.-4.
-I,,:--,
-. ",---:,,.,,
,-I
,".,,-,
-,,,--' 4-, 4,, -, -,---,-I"'-,-w,,-,
----,
;'--,
"--': ,-` ",--,,,
-,------ ,--,----I-':
-, ,,,,,
-,,-:
-,,i,Z,--".-,
I .., ,-:,-.,-,"---'
it
,I-,,
, -,,, ,-C,,,,,I
-,,"'
:,--,,
,, , --.-,I---,,,,-I-,
,--L-,II
,---"."
"I"";
, ,:
-'----.,--I.--- '
,-,,.,-,-,-,
, ,,,`,'i-,
,,
---_,,,
-',--',
"
, ----,,
,i,,-,,,...- -.,-, '.
,"".
-,,:------,,,I
,
3'
-.
,o
,
,
,
-II,-,
-.-,;
-,
',,-_,
a--i
;
-.
".,
-,-,
_-,,--4,
,,,,
,,--,-4,-,,7`A
,
,
,, ,; .,
,,-.I-,-"' -,,i...
--,.,,.1i
"I,,- .-----11"
-,
-",,
,,,,-I-,
"
,; ,,,, ,,
",,
,,,
,,., : ".
;
,-Z--::-,',-',,
,
.-- ----.--' .,-,",,,
.i-;
. ,-`
-:"--.-,I--,:C,I ,',,,
':-, -: ,.,-,
,,,'
-',"
, -,---ml--, - -, , ,,' -r
,", ,--. ,,,,,,
,..,
-,
,-,,.,
,,
-,-,:,,-'.-!Z,,I-1,111`
,---,- ,-,"
,----- ,,,-,
,,-",
,-.-, ,:":,--,
, ,---':-:.---.,,-,-',--F7
",,
-I-,
I,-,,--,----,',
-:,-',',,
-,,''-,-P".,`,'-",-_
,"'-""
,,-I.
,Zi
--,,-,'-",,',-,;.
,,,"
,-,. .
6,-,-.., ,-'
,-: -.,
-,
,'--,--"
-4.,
7z ,,.,,-,,-----",
,,
,-,,',
,
-1,
,' --,
,I- I,-i- ,,",
--`J,
lr --,,-":
- ,-,,,---"----;, ',, ,,:
1,
-̀-,.',",
!I,,
,-':-,-,-,I:-"',-,-"'- -,
-- ,!-----,--,
,,-.,-,.-,
-- zt,-,'.-,,,',,
,` W,
,
--,,.,-,
,", .
---,,,---,,,,,
-,
,----,,,,-:
-,-I
--F
,,,,
-,"",
,
,
--"-,':-',,
-,
--,,,
,",--,,,-----'.
,.,-,,,,
,
-1----.'
`1,,-,,,.-F--:
--.,,,
..,
",,-,
7,
"
`-1
,,-,,
,.,-"
-,,e.,-f---,---.
:
,.-"---,-,,7.
,---.
,-,
,,
,--,,
,
--. '.,,-,-, -,
,-.',-----.,,.-,,,,-",-,,,
-,-I, -" "- .I,-,,,,--,
.',--,-,,
,.
"-",
-1
'-, ,--"
II--`",
,;' ,,.,,
-,--I',-,,C--',',
, - , ",
""'
,-:-----e-,,,-,.'",,
,_ -I-,
7,-,A'
-,-,,,,I,
`,V
---- .1:,
,-,
---,-',
----,7
';z,-,
,
-,., ,--,,-,,',,,-,
'.
,,",,
,
St',
-;',,,;,
-,-,
,-,-,.,
'.-I
-V
,,l
,',
,A
.-,-,,-.
ll'
--;
','.,,;,
',
;,",
,
---"
------:.,-,,-,7,---z
---."-,,
,
-',
,:;
-,,,-_:
:
---,,-----l---;-----F,
,,
,,
,-;
"-,,,-;,
,,,,,.-11
,
,
-,i
,-..
.,-,---,
: " ""',-,---,",,,,
,, . ",
,I---,',-,-, .-- ,,
----,-,,-",
.z,--.,,- ,,i-:!-,.
- I,-,,, ,' ,,--,--,, ,,.-,
-, ,,-,
,,F:
,,,,,,, ,,"'.,-,
,,:,,.-"-,-,-,,,-----,-,--,,.,-,
-,
,,,
,-,
--",-,z
-,
., -,-,.- ...
".,,,,,-,I
-4
,,,
,,",--,
1
,i-,.
,:", -:.
'-' , F,
,-%'-,'
,
'.,
,, --'
----,"'--,--,,::;
--,,,-,, "-,
,,,
,--,-,-,
!, -__1 ,-.
-.-.
0-,
1,I-,"",
-,,,,-?
"""
I-1"',,
"'
-'l-,"-- ,.,.
,-"
I:-,
I --.,-"-,
'--,:
,-,,-,-,-.,,--.
"
-,-,,-6,-.,
.", - -,,".
,N'
-,-,
,_
-1
, --,F!'-I.---"-4--,,,----,,
- ,
--II,
-:,.,,
-,N.T
-,,,:-,"-----. ,,, - .-,
--,,,--;
,I.,_"-"--.",...i,,,.,,,
,,-,
,,,-'.--,,,II.-,"A
-e-,-,
, ,",
-II"--.,-,,.I,,
_0
,..
J,---,,.,--,,-,C,,--',i
-I
I,,', ,
,,--,--,,,----,.,i.,,,.',",,,,'-, --,- ,
, ,,_",-"..
,-'---,. -,,--,
.,,,--,-.,,.
-,- , "!
-- ,
- -.
,,,,,,,
---.
,
.:, .-,-,,--,,-,,,-,--,-, "
-:,-.,
,; ., ---- -,,,
,
---,,
-,---"-,,-,--,,
-..,I-,,
,- ",
--- -".,
,,'l---'llll;,.
-,-,
-,-,-,.
-,
,-,,
-,i-1- ,"
6-,,,",,",
;_.
"
----,".o
:
"-,-,
"
`-W-1"-,-,'--,,`-,
-I---`,-, ,-------,,---, , ,.-,." ,-,-,,---,:",
.---.
,-,-,--- ---:...:',
,- ,,,
-.,-,_,--,,-,,,-,:-Z-----,T-.,.---" -., ,-'.
,.
--I.,,,
---.
,-, -,,-,-,
.
I,,
-,,,-,-,.-11-.-,, ..-,,-. .,, -- ,
-,J,,",-,,,,:
'.
,--"--t,----,,,.
:.- ,-,
,,.
,j,,,,,,-----,
,-I -,;'.-'--, , `,,,
-,
.,11,.1--I
,", :-,
,- , -,,,,
,"-,.,,
:.. - --,, ",
,'i,,,- -,
q-,r.,----,,."
-,-,,
-,,
,....!
-.",
'-i---,-.",
-,-,,1,--:-"-,;
,,,,
,-,j,-,,.
M,
,.-,,-, 1,
,,,,,,,
---,"
-,--,
-1
,.,-,--:
-1
",."
1,
.
-.
-,,--e
-,
-',,
-_,F,:,"',
-,.",
,,,.,,
-,,-'i',,,
--,
,,,-,-"",
,"',
"-, -vI
,-,I--,-,-,,-,,--.,,
-: ,'.-, ,i, .:-,,,,,--,, ,
,,.,.,
,Z f '.
-,-,,
,,
---"-,-,
.,-,,,-_i5
-, , -.
:
'-,.
"---L-,,----,
-, :,- ,"
",,,,---,,
,I
-;
i.
-II
-,
,
-,
"d
",,-,--------l
-:
,
-",
,--..
t',
,
,,
I,
,
,-FI-,
,;.,;
,,,
,-,
,
.I
".1
.
,, ',----,I.I.,-,,
"
-,;:z
, ,; .- --,,
---,,
-...
` ,-7," ,-; ,:, -, ,,--",,-,-.,... .,,,,-,
---,.-F-.,
- "--,- ,,,
-", ..',--,'
,,-I
--',,'.--,I,-,.-III--,
%,-----,Z
--, :,, ..
"-, ,:F,,,-,Z,- !-,
,-.,,,-"- ,, -,"
",,, ,,.,--,
,, '-' ' .,:r
' ,' I,'.,-.
7:-,:
`,. 2Y'I'
-,
,-, r.,-',, ,:i -, "
-;,1,4,,
,--,
,
.-i:,,,,,.,I-I"..
, .
I. 1.-Al
; "-.
-',,,.'- ,,,,-, o Iri -:--,, :--,,,,I-,,
---"--'-.'.,, ,,, "
.-",,, ,,--.,-,
,:i -,,-.;I-,--- --.-,,--']
,, -,
--,--,--"-,
-,"",
,.
"',_
,-- -- - i ",
,,-- ,:,I
-`,-r
,,-T1,
I '-',,','_,,"i--j,;.-,,
=,----,
-'. 4 ,!-,; -,,-.-,.I.",,4,
1
,--.,.1,,.,.
--_,--,----I
,---T,,-,:--,,,-,----I- . ...,I- I-.-,-,,II-',-- ,
-,-,' '.
.,,-,,,
,,-T,.-, ,,,,")
,,, ,- --,
,t
I-,-,
-. - "
,-, "-- , ----.
-,---,,,,,,-:T
,,,,;, -,,
-,-,--,
--,, 3.;;,411
,":, -. ,,-,,;
-X: ,,I',- I'; -,.-.-,-,
- --"-,
--,,--,
---t, ,-, _-::-,,-:,T
-1.--'-,,
-,"-, --,,;--,5 ,- ,-,:"-w--7,,-",-,,
..-,
': ,,,,,,-.,I-, -,,-"--,--.T,
, ",,.,. ,.,
.'--,--,
-',,
,,j,
,
-.
' -- ,;I -.:, ,----.
1,
4,,
, , ,-.,-F7,--,,
-.-,!,--1,
-. ;,--,.
,-,,,,.
,,, -,',-,,
,-,,Z,1,
F-,,---Z
:,
",
,-..?
,
,,.-u,,,
-:,li
1--,
,..,
,,,
,
I
,--,
-,
.
Z,,
,1,,
,, .11
-,. .,--i,',, -.:
, t,I,,., !
;;,,
,--, ,
b' ,,I .,
-k',
-j-,-!-- ,7 -,
-;,
i,,
--,--,-.--,,,,-,-,-,,,"
`-` :.,,.I,,,,t,,,",,I--,,.
-,,,-,--,--I,-1.
, ,--,,-,, ,.- 'r-,---- ----,A,-".,,lj.-, , I:,
.-- :.: ,-,,- I,.q,I
---,
-4.,t
, I-,-,"!
".,,",
-,,-- -'.`z...,.,,
--,-,i-.-,--,
, -,-,-,
",.",-:
;,,':, ",
-',--,,,,-,-,,,
,,, "-,
,
-4
-,--,-,
.---,
,
".,----,
,' "-,,--,
-'F
,,--"--,-,"
.',:,
-'-,"".,'7,,.
: _- , , ,-,..J,--- -- I---,,,W,,26
,'I-",
-,
-,7"
..--'-/-.-,
,-,, , .;Z;,
.,Vl
,..,,.,-,
,; T---:.-,--,
-,-,--,
-.
",----,
,Z:` -, -.- -, ,,--;-.'
--, -i,'
-,- 1,; , ,'.
:, "--"-,
.,,
.,,,
--,
.J,
-,--'
,,",
,- ",",
-i, -,,,. ee,
_,,-,-"
,..,,-,,-,,,,,,-.
,.-C-,:;
.Ir,
I-,.
-,-,--,
,
,-,
: -,,",,-,,
-,
-,,,,,, -,,,,11-- ,-,
,,,
------,,,,
,
"
.--,
,-"I-I
,-,:
-.
J,
,-,
,.
1-,,",
,'F,-i' I
i- IK-,,-,
--,
"',
;",.,-,
`,',"-.,
",t.
,
',
-,
,:
,
I,,
,
,
,
,,
"
-,,
,',,,
-",
--,i.
,",-: ",,, -,--z,-,,,--,-.,,11
- ,.,:-:,
-,:-," ,, -- -,
-,,
.-.
L.,.- -,.; ,-.-,.- ,,-,
.,
,';',
-,,-:,.
i.,.,,,
-I',-7-" .II,--, --'-,._
_
,_-- ",,,--,
-,-,-,
- -`-,
- --,
,,, .I-,
,I
j,,-,' '',,--,,:-,., - -,Z'
, ,
,---_,,; -I--,,.,:."
,.. i..,
,.'-- :
- ,,.,
-- ---,
F""'-Z,-I----,--,--,
,j,-,o,
V,
,.-,.,,',----,
i-10"- -',,F,,-,
,
.
,,,ie"
",
-,.--1,,.
_,,
,
:"
,.,,
-,...,--.
,
,
-,
,-,--,,,-,-,
-.
"--,
,,,
-,,
,,-,.
,-,.
,-,,...,
-,a-V-,"_--,
.
.
-,,:
,,,,,,-,-_--,
.II-,-.
--,:
.:,,-P.,-.:.
.
-,
,,:Z, ,
"
,",- -,,--,2',-,
-,-- --,. ",-,,--,
--.
- ', ';-,
- :, -,
-,,- -,;', -,I. ..--.-,--,
-, - ,;__
"7 `"-,, -':
";-,
"",,
7,t,'F--,,
j: -I-I
-,
1,
,
,,'' ,-- - ;--!--,I., F--.
,.,: -,',-,
,-, ":,
,,,--Ank',,,
",----,,
;,!,:-,- - , ",
,,',,
--,,, -I.I:,., " --.,.- , ,-"
:",,.,;.,
, ; -- --,_ _.,
., -.,I
,-_'' -,,--,,-,-, -.
-,-,
,-,,,,, -., --,
---.
-S.,-,..
--,-,p
,:- -,-- Z
--'.
-,,,
Z, -,.-,
-;-,,
,_,,;,-,,.,,,
-_,-,--, ,,I` -,
. -,- Z,, I-.
.-, ,- ,Z, e ',--,' - ,-.
-,.II-.--;,I-,
.2 -4,
,
, ,, ;-Z"",---.,
,
:----,F'-.,,--.
-,.,
I-.i ,-,'
1,t--"",
"..,-'; .,
.I
...
I ,"
,-,
-:v".1; , , 1.
m:--l`--.lll'l
II ,-,
,--.--,"--,- ,,
-1:,-I
--,
,Z.I-,
"
----,,
,.
,
'-'
,-4,-'-,
il,I,,,1
,,,---m,
,
J,
,,-!
,-,-,-.':,:-,,----,
---,-"r."!--,%
-- .1,
,,-,
--!,,,-I,
-- ,,,,
-,-,-,-.,
I.I 15'.
-,,
, :,-ol-, , ,-',
I,--A
,
.,,I:
,,
,- ,., ,
;,-,
"
t--,,-,
-I'mI..11--,-.I-,
" ---'
,.I
- ,-,,-I,1,i.
-.
-,
-,'-,II---,
I--,
,,-.,,'l
-jII
1.11-I--,.--,
I!,
"`
,,.,,--,,
1,, ,:-II'.
,II-1I
-j,
--..
,,.--f,-I-,',,
-, ---- %--I,,t-,
-- --,,,F
: :-,
,---'-I :- ----,
-I
-:I--,,,
-I-I,.4.
I-1
-''
, ,,
,,-=;
,III
"
--,,
-'t
.-,
'.,,,,
--.
` f-,---'
,, -.- ,,-,,,,,-,-,.--,, ,'-.F""I"',
- -,-,
,i,,
--,",-,,r-.----
An Optimal Parallel Implementation
of a Quadratic Transportation Algorithm *
Mike McKenna
Thinking Machines Corporation
245 First Street
Cambridge, MA 02142.
Stavros A. Zenios
Decision Sciences Department
The Wharton School
University of Pennsylvania,
Philadelphia, PA 19104.
March 7, 1990
*Research partially supported by NSF grants ECS-8718971 and CCR-8811135, AFOSR grant 89-0145
and Thinking Machines Corporation. We would like to acknowledge several useful discussions with Jill
Mesirov.
1
-1111-11---~111~I
Abstract
We discuss the implementation of a quadratic transportation algorithm on massively
parallel computer architectures with hypercube communication networks. The implementation is optimal in the sense that it requires effectively $O(m_O m_D / P)$ operations to
perform one iteration of an $m_O \times m_D$ problem using $P$ processors. Peak computing
rates of 3 GFLOPS are achieved by the algorithm on a 64K Connection Machine CM-2.
1
Introduction
We consider in this paper quadratic optimization problems over transportation constraints.
Such problems find applications in logistics, traffic management, and matrix balancing
models in economics and regional planning. The dual, row-action algorithm of Zenios and
Censor [1989] is well suited for implementation on massively parallel computer architectures.
In this report we describe an optimal implementation of this algorithm on a massively
parallel architecture with a hypercube communication network. The implementation is
optimal in the sense that it requires effectively $O(m_O m_D / P)$ operations to perform one iteration
of an $m_O \times m_D$ problem using $P$ processors. Computational experiments on a Connection
Machine CM-2 indicate that the implementation achieves a computing rate of 3 GFLOPS
when solving dense 1024 x 1024 problems on a 64K system.
2
The Quadratic Transportation Algorithm
Let $\langle m \rangle$ denote the set $\{1,2,3,\ldots,m\}$. Denote by $\mathbb{R}^m$ the $m$-dimensional Euclidean
space. A transportation graph is defined as the triplet $\mathcal{G} = (V_O, V_D, \mathcal{E})$, where $V_O = \langle m_O \rangle$,
$V_D = \langle m_D \rangle$ and $\mathcal{E} \subseteq \{(i,j) \mid i \in V_O, j \in V_D\}$. $V_O$ and $V_D$ are the sets of
origin and destination nodes of cardinality $m_O$ and $m_D$ respectively. $\mathcal{E}$ is the set of $n$
directed arcs $(i,j)$, with origin node $i$ and destination node $j$, which belong to the graph,
$n \le m_O m_D$. Let also $x = (x_{ij}) \in \mathbb{R}^n$ be the vector of flows; $u = (u_{ij}) \in \mathbb{R}^n$ be the vector
of upper bounds on the flows; $s = (s_i) \in \mathbb{R}^{m_O}$, for $i \in V_O$, be the vector of supplies;
$d = (d_j) \in \mathbb{R}^{m_D}$, for $j \in V_D$, be the vector of demands; $\pi^O = (\pi_i^O) \in \mathbb{R}^{m_O}$, for $i \in V_O$, be
the vector of dual prices for origin nodes; $\pi^D = (\pi_j^D) \in \mathbb{R}^{m_D}$, for $j \in V_D$, be the vector of
dual prices for destination nodes; $r = (r_{ij}) \in \mathbb{R}^n$ be the vector of dual prices for the bound
constraints; $\delta_i^+ = \{j \in V_D \mid (i,j) \in \mathcal{E}\}$ be the set of destination nodes which have arcs with
origin node $i$, and $\delta_j^- = \{i \in V_O \mid (i,j) \in \mathcal{E}\}$, the set of origin nodes which have arcs with
destination node $j$. With this notation we define the quadratic transportation problem as
follows:
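$$\text{Minimize} \quad F(x) = \sum_{(i,j) \in \mathcal{E}} \left[ \tfrac{1}{2} w_{ij} x_{ij}^2 + c_{ij} x_{ij} \right] \qquad (1)$$
$$\text{Subject to:} \quad \sum_{j \in \delta_i^+} x_{ij} = s_i, \quad \forall\, i \in V_O, \qquad (2)$$
$$\sum_{i \in \delta_j^-} x_{ij} = d_j, \quad \forall\, j \in V_D, \qquad (3)$$
$$0 \le x_{ij} \le u_{ij}, \quad \forall\, (i,j) \in \mathcal{E}. \qquad (4)$$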
where {wij} and {cij} are given positive real numbers. The following dual, row-action,
algorithm for this problem was developed in Zenios and Censor [1989], where its asymptotic
convergence was established.
Step 0: (Initialization) Set $k \leftarrow 0$. Get $x^0$, $(\pi^O)^0$, $(\pi^D)^0$, $r^0$ such that:
$$x_{ij}^0 = -\frac{1}{w_{ij}} \left[ c_{ij} + (\pi_i^O)^0 + (\pi_j^D)^0 + r_{ij}^0 \right]. \qquad (5)$$
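Step 1: (Iterative step over constraint set (2)).
$$\rho_i^k = \frac{1}{\sum_{j \in \delta_i^+} 1/w_{ij}} \left( s_i - \sum_{j \in \delta_i^+} x_{ij}^k \right), \qquad (6)$$
$$x_{ij}^k \leftarrow x_{ij}^k + \frac{\rho_i^k}{w_{ij}}, \qquad (7)$$
$$(\pi_i^O)^{k+1} = (\pi_i^O)^k - \rho_i^k. \qquad (8)$$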
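Step 2: (Iterative step over constraint set (3)).
$$\sigma_j^k = \frac{1}{\sum_{i \in \delta_j^-} 1/w_{ij}} \left( d_j - \sum_{i \in \delta_j^-} x_{ij}^k \right), \qquad (9)$$
$$x_{ij}^k \leftarrow x_{ij}^k + \frac{\sigma_j^k}{w_{ij}}, \qquad (10)$$
$$(\pi_j^D)^{k+1} = (\pi_j^D)^k - \sigma_j^k. \qquad (11)$$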
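Step 3: (Iterative step over constraint set (4)).
$$\Delta_{ij}^k = \operatorname{mid} \left\{ r_{ij}^k,\; w_{ij}(u_{ij} - x_{ij}^k),\; -w_{ij} x_{ij}^k \right\}, \qquad (12)$$
$$x_{ij}^{k+1} = x_{ij}^k + \frac{\Delta_{ij}^k}{w_{ij}}, \qquad (13)$$
$$r_{ij}^{k+1} = r_{ij}^k - \Delta_{ij}^k. \qquad (14)$$
Step 4: Replace $k \leftarrow k + 1$ and return to Step 1.
3
An Optimal Implementation on the Connection Machine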
The key to the implementation of the algorithm on the Connection Machine CM-2 hypercube is the configuration of the processing elements into a two-dimensional grid. If the
grid size is larger than the number of processing elements then we configure the machine
into an mo x mD grid of virtual processors (VP), and each physical processing element
performs the work of several virtual processors. One dimension of the grid is set equal to
the number of origin nodes $m_O$ rounded up to the nearest integer that is a power of two,
and the other dimension is set equal to the number of destination nodes $m_D$ rounded up
in the same fashion. Virtual processors outside the $m_O \times m_D$ grid are disabled and do not
participate in the computations. The memory of virtual processor with grid coordinates
(i,j) is partitioned into the following data fields: (1) Supply and demand, s and d, (2)
dual prices, $\pi^O$, $\pi^D$ and $r$, (3) Upper bound $u$, (The lower bound $l$ is assumed to be equal
to zero and hence it is treated as a constant.), (4) current iterate $x$, (5) a scaling factor
scale used to store $\rho$, $\sigma$, or intermediate results, and (6) two fields IW and OW hold the
terms $1/w_{ij}$ and $1/\sum 1/w_{ij}$ respectively. The algorithm is executed as follows:
Step 1: A spread-sum operation along the 1-axis of the grid computes the partial sums
$\sum_{j \in \delta_i^+} x_{ij}$ for each origin node (i.e., each row of the grid). This result is spread
to the scale memory fields of all VP in the same row. A sub-mult operation (i.e.,
an optimized implementation of $a(x - b)$) is used to compute the scaling factor $\rho$
(equation (6)). The scaling factor updates the local field $x$ by addition and the local
field $\pi^O$ by subtraction (equations (7) and (8) respectively).
Step 2: This step is similar to Step 1, with a spread-sum along the 0-axis.
Step 3: A combination of min and max operations computes the mid(.) of equation (12), which
is then used to update the local field $x$ by a mult-add operation (equation (13)) and
local field $r$ (equation (14)).
3.1
The Hypercube Implementation
In this section we describe how Steps 1-3 of the algorithm are implemented to run on $P$
physical processors of the CM-2 hypercube, so that the execution time is in effect $O(m_O m_D / P)$.
To simplify our discussion, assume that $m_O$, $m_D$ and $\sqrt{P}$ are powers of two. Let the
vertices of the hypercube be labeled with the integers 0 through $P - 1$, where vertices $i$
and j of the cube are connected with a wire when the absolute value of i - j is a power of
two. We initially configure the $m_O \times m_D$ grid of virtual processors so that each physical
processor is assigned to do the work for an $(m_O/\sqrt{P}) \times (m_D/\sqrt{P})$ subgrid of virtual processors. In figure
1 we have mo = 32, mD = 32, P = 16 and each physical processor does the work for an
8 x 8 subgrid of virtual processors.
In our configuration, we lay out the physical processors in a $\sqrt{P} \times \sqrt{P}$ two-dimensional
grid. Let $P_{ij}$ be the physical processor that is in row $i$ and column $j$ of the grid. We
place physical processor $P_{ij}$ at vertex $i\sqrt{P} + j$ of the cube. In the example of figure 1, the
16 physical processors are configured in a 4 by 4 grid, and each physical processor is placed
at vertex $4i + j$ of the hypercube. The arcs in figure 2 illustrate how the CM-2 hypercube
connects the $\sqrt{P} \times \sqrt{P}$ grid of physical processors.
Most of the operations in Steps 1-3 require no communication between virtual processors.
For these operations, each virtual processor spends 0(1) time to work on its local memory,
and each physical processor performs the work of $m_O m_D / P$ virtual processors. Therefore the
local operations require $O(m_O m_D / P)$ time.
The operations that do require communications are the spread-sums, where we add the
elements in each row of the virtual grid, and copy the results back to every processor of each
virtual row. Technically, a spread-sum is an operation where a field $scale_{ij}$ is allocated in
each virtual processor, and $scale_{ij} = \sum_{k=0}^{m_D - 1} x_{ik}$. If the transportation graph is not complete,
then for each missing edge $(i,j)$, $x_{ij}$ is set to zero before performing the spread-sum. In the
rest of this section, we show how a spread-sum is performed, effectively, in $O(m_O m_D / P)$ time.
___I_ _
_
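Let $x_{ij,ab}$ denote the value of $x$ that lies at position $(a,b)$ in the subgrid of physical
processor $P_{ij}$. In figure 1, physical processor $P_{ij}$ holds the values
$$\begin{array}{ccccc}
x_{ij,00}, & x_{ij,01}, & x_{ij,02}, & \ldots, & x_{ij,07}, \\
x_{ij,10}, & x_{ij,11}, & x_{ij,12}, & \ldots, & x_{ij,17}, \\
x_{ij,20}, & x_{ij,21}, & x_{ij,22}, & \ldots, & x_{ij,27}, \\
\vdots & \vdots & \vdots & & \vdots \\
x_{ij,70}, & x_{ij,71}, & x_{ij,72}, & \ldots, & x_{ij,77}.
\end{array}$$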
A spread-sum is performed in three phases. In the first phase, each physical processor
finds the subtotal for each row in its subgrid. For example, in figure 1, each physical
processor performs the following 8 additions:
$$\begin{array}{rcl}
x_{ij,00} & = & x_{ij,00} + x_{ij,01} + x_{ij,02} + \cdots + x_{ij,07} \\
x_{ij,10} & = & x_{ij,10} + x_{ij,11} + x_{ij,12} + \cdots + x_{ij,17} \\
& \vdots & \\
x_{ij,70} & = & x_{ij,70} + x_{ij,71} + x_{ij,72} + \cdots + x_{ij,77}
\end{array}$$
Once these additions have been performed, each physical processor holds a column vector of
subtotals (the shaded elements in figure 1). Obtaining these subtotals requires $O(m_O m_D / P)$
time.
Let $C_{ij}$ denote the column of subtotals that lies in physical processor $P_{ij}$. In the
second phase of the spread-sum, the processors in each physical row $i$ coordinate to perform
efficiently the vector sum $SUM_i = C_{i0} + C_{i1} + \cdots + C_{i(\sqrt{P}-1)}$. The resulting vector $SUM_i$
holds the spread-sums for the rows of the virtual grid that are assigned to physical row
$i$. In figure 1, each vector $SUM_i$ would hold the spread-sums for virtual rows $8i$ through
$8i + 7$.
Now note that the physical processors $P_{i0}, P_{i1}, P_{i2}, \ldots$ of physical row $i$ all lie in a
physical subcube of the machine that spans $\log_2 \sqrt{P}$ dimensions. (In figure 2, the thick arcs
show how each row of the physical grid spans a two-dimensional subcube.) Let $C_{ij,k}$ denote
the $k$th element in column vector $C_{ij}$, and let $SUM_{i,k}$ denote the $k$th element in column
vector $SUM_i$. Given the $\log_2 \sqrt{P}$-dimensional subcube, we use the procedure listed below
to obtain the first $\log_2 \sqrt{P}$ elements of vector $SUM_i$. (Note that the CM-2 hypercube wires
are bi-directional and can be activated simultaneously.)
Let $l = \log_2 \sqrt{P}$;
For $k = 0$ through $l - 1$ do
(S1) Let each processor $P_{ij}$ send
$C_{ij,0}$ along dimension $(k + 0) \bmod l$ of the subcube,
$C_{ij,1}$ along dimension $(k + 1) \bmod l$ of the subcube,
...
$C_{ij,l-1}$ along dimension $(k + l - 1) \bmod l$ of the subcube;
(S2) Let each processor $P_{ij}$ receive
a value $temp_0$ along dimension $(k + 0) \bmod l$ of the subcube,
a value $temp_1$ along dimension $(k + 1) \bmod l$ of the subcube,
...
a value $temp_{l-1}$ along dimension $(k + l - 1) \bmod l$ of the subcube;
(S3) For $q = 0$ through $l - 1$ do
Let $C_{ij,q} = C_{ij,q} + temp_q$;
_.111_1111_1
------_
I---·
I^-l
Problem | Itns. | VP ratio | Real time (seconds) | CM time (seconds) | Real GFLOPS (64K CM-2) | CM GFLOPS (64K CM-2)
TEST1   | 5000  | 256      | 556.49              | 532.92            | 3.012                  | 3.148
TEST2   | 4600  | 128      | 270.24              | 255.62            | 2.856                  | 3.019
TEST2   | 4000  | 256      | 730.00              | 676.00            | 1.839                  | 1.986
Table 1: Performance of the quadratic transportation algorithm using 1024 x 1024 problems. First
and second rows report results with the optimal implementation of the algorithm. Third
row reports results using C/Paris instructions from the CM-2 library, release 5.2.
At the end of this procedure, each value $C_{ij,k}$ for $k = 0, \ldots, l - 1$ equals $SUM_{i,k}$. If we
repeat this procedure for consecutive groups of $l$ elements along vector $C_{ij}$, then eventually
each vector $C_{ij}$ equals $SUM_i$, which completes phase two of the spread-sum.
Each execution of steps (S1) and (S2) requires $O(1)$ time. On a Connection Machine, the
execution time for the loop in step (S3) is negligible when compared to the communication
time in steps (S1) and (S2); so we treat the loop in step (S3) as an $O(1)$ operation. The
outer loop of this procedure iterates $l$ times; so the whole procedure requires $O(l)$ time to
execute. The procedure itself is repeated for $(m_O/\sqrt{P})/l$
groups of $l$ elements; so phase two of the
spread sum requires $O(m_O/\sqrt{P})$ time.
If step (S3) of the above procedure were not dominated by steps (S1) and (S2), then we
would have to say that phase two of the spread-sum requires $O((m_O/\sqrt{P}) \log_2 \sqrt{P})$ time. In that
case, we could remove the $\log_2 \sqrt{P}$ factor by using a straightforward variant of the more
complex one-to-all broadcast algorithm that is given by Johnsson and Ho [1989].
In phase three of the spread-sum, we copy the column vector $C_{ij}$ (which now equals
$SUM_i$) across processor $P_{ij}$'s subgrid, so that each virtual processor holds the result of a
spread-sum. This phase requires $O(m_O m_D / P)$ time. Adding the execution times for the three
phases of the spread-sum, we get an execution time that is effectively $O(m_O m_D / P)$.
3.2
Numerical Results
We implemented the algorithm of Section 2 as explained in Section 3 in C/Paris, using
CMIS instructions, on a CM-2 with 32-bit floating point accelerators. The performance
of the parallel implementation was evaluated using two randomly generated test problems
of dimension 1024 x 1024. The implementation uses virtual processors at a ratio of 256
and 128 virtual processors per physical processor, on a 4K and 8K CM-2 respectively. The
computing rate of the algorithm under different virtual processing ratios is summarized in
Table 1. The last row of the same table provides, as a benchmark, the performance of a
C/Paris implementation of the algorithm without using the techniques of section 3. Observe
that the computing rate estimated based on CM time is consistently in excess of 3 GFLOPS.
Computing rates estimated based on real time exceed 3 GFLOPS for higher virtual processing ratios. This discrepancy was anticipated since the first phase of spread-sum requires no
communications, while hypercube communications are needed in the second phase of the
operation. The C/Paris implementation is significantly slower than the optimal implementation. The difference in the number of iterations between the two runs for problem TEST2
is due to the difference in the terminal tolerance specified. Both implementations achieve
identical final error in exactly the same number of iterations.
-
---
-- -
--
--
:11.1 ....... I.. 1 ....
1 1 I...... . .I ...... ...... :.... .... I...... 1
i
Figure 1: Virtual Processor Configuration.
4
Figure 2: Physical Processor Configuration.
Conclusions
Gigaflop performance has been quite common in several areas of large scale scientific computing using massively (and other) parallel architectures. Unfortunately such performance
is difficult to achieve in numerical optimization. Optimization problems do not usually have
nice structures at a micro level, and optimization algorithms need frequent communications.
We have shown that for some well structured problems we may overcome the communication bottleneck and achieve performance in the range of 3 GFLOPS. An implementation
that is effectively optimal was achieved using a simpler scheme than the one of Johnsson
and Ho [1989].
References
[1] S.L. Johnsson and C.T. Ho, "Optimum Broadcasting and Personalized Communication
in Hypercubes", IEEE Transactions on Computers, vol. 38, no. 9, Sept. 1989, pp. 1249-1268.
[2] S.A. Zenios and Y. Censor. Massively Parallel Row-Action Algorithms for Some Nonlinear Transportation Problems. Report 89-09-10, Decision Sciences Department, The
Wharton School, University of Pennsylvania, Philadelphia, PA, 1989.
7
.
Download