Lecture 7

advertisement
Why Derived Data Types
j
A[100][80][50]
struct _tagStudent {
int id;
double grade;
char note[100];
};
i
struct _tagStudent Students[25];
k
Surface [i][j][0]
 Message data contains different data types
 Can use several separate messages  performance may not be
good
 Message data involves non-contiguous memory
locations
 Can copy non-contiguous data to a contiguous storage, then
communicate  additional memory copies
1
Derived Data Type
MPI’s solution: derived data type
No additional memory copy
Directly transfers data of various shapes and sizes
Idea: Specify the memory layout of data and
corresponding basic data types.
Usage:
Construct derived data type
Commit derived data type
Use it in communication routines where
MPI_Datatype argument is required.
Free derived data type
2
Type Map & Type Signature
 A general data type consists of
 A sequence of basic data types
 A sequence of byte displacements
 Type map: sequence of pairs (basic data type,
displacement) for the general data type
 E.g. double A[2]  {(MPI_DOUBLE,0),
(MPI_DOUBLE,8)}
 _tagStudent  {(MPI_INT,0), (MPI_DOUBLE,8),
(MPI_CHAR,16), …}
 Type signature: sequence of basic data types for the
general data type
 E.g. double A[2]  {MPI_DOUBLE, MPI_DOUBLE}
 _tagStudent  {MPI_INT, MPI_DOUBLE, MPI_CHAR …}
3
Communication Buffer
Given a type map
{(type0,disp0),(type1,disp1)} and
base address buf, the communication buffer:
Consists of 2 entries
1st entry at address buf+disp0, of type type0;
2nd entry at address buf+disp1, of type type1.
E.g. double A[2]  1st entry at A, of type
MPI_DOUBLE; 2nd entry at A+8, of type
MPI_DOUBLE.
If type map contains n entries  similar
semantics
4
Type Constructor
int MPI_Type_contiguous(int count, MPI_Datatype oldtype,
MPI_Datatype *newtype)
MPI_TYPE_CONTIGUOUS(COUNT, OLDTYPE, NEWTYPE, IERROR)
integer COUNT, OLDTYPE, NEWTYPE, IERROR
newtype is a concatenation of count copies of
oldtype
oldtype can be a basic data type or a derived
data type
j
A[100][80][50]
Surface: A[0][:][:]
MPI_Datatype face_jk;
MPI_Type_contiguous(80*50, MPI_DOUBLE, &face_jk);
MPI_Type_commit(&face_jk);
MPI_Send(&A[0][0][0],1,face_jk,rank,tag,comm);
// MPI_Send(&A[0][0][0],80*50,MPI_DOUBLE,rank,tag,comm);
MPI_Send(&A[99][0][0],1,face_jk,rank,tag,comm);
...
MPI_Type_free(&face_jk);
i
k
5
Type Constructor
int MPI_Type_vector(int count, int blocklength, int stride,
MPI_Datatype oldtype, MPI_Datatype *newtype)
blocklength
stride
 count – number of blocks
 blocklength – number of elements in each block, in terms of oldtype
 stride – number of elements between start of each block, in terms of
oldtype
 oldtype – old data type, can be basic or derived data type
 newtype – created new data type
 Data consists of equally spaced blocks: same oldtype, same block
length, same spacing in terms of oldtype
 Each block is a concatenation of blocklength copies of old datatype
 Spacing between blocks is stride number of oldtype.
6
Example
double A[4][4];
MPI_Datatype column;
MPI_Type_vector(4,1,4,MPI_DOUBLE, &column);
MPI_Type_commit(&column);
MPI_Send(&A[0][1],1,column,rank,tag,comm);
MPI_Send(&A[0][3],1,column, rank, tag, comm);
...
A[4][4]
Surface: A[:][0][:]
j
double A[100][80][50];
MPI_Datatype face_ik;
A[100][80][50]
MPI_Type_vector(100,50,80*50,MPI_DOUBLE,&face_ik);
MPI_Type_commit(&face_ik);
MPI_Send(&A[0][0][0],1,face_ik,rank,tag,comm);
MPI_Send(&A[0][1][0],1,face_ik,rank,tag,comm);
MPI_Send(&A[0][79][0],1,face_ik,rank,tag,comm);
...
i
k
7
Type Constructor
int MPI_Type_hvector(int count, int blocklength, MPI_Aint stride,
MPI_Datatype oldtype, MPI_Datatype *newtype)
blocklength
stride
 Same as MPI_Type_vector, except that stride is in
terms of number of bytes, not number of elements of
oldtype.
 blocklength is still in terms of number of elements of
oldtype.
 Same oldtype in different blocks; same block lengths;
same spacing between neighboring blocks, but in terms
of bytes (not in terms of oldtype)
8
Example
double A[4][4];
MPI_Datatype column;
MPI_Type_hvector(4,1,4*sizeof(double),
MPI_DOUBLE, &column);
MPI_Type_commit(&column);
MPI_Send(&A[0][1],1,column,rank,tag,comm);
...
A[4][4]
Surface: A[:][:][49]
j
double A[100][80][50];
MPI_Datatype face_ij, line_j;
A[100][80][50]
MPI_Type_vector(80,1,50,MPI_DOUBLE,&line_j);
MPI_Type_hvector(100,1,80*50*sizeof(double),
line_j, &face_ij);
MPI_Type_commit(&face_ij);
MPI_Send(&A[0][0][49],1,face_ij,rank,tag,comm);
...
i
k
9
Type Constructor
int MPI_Type_indexed(int count, int *array_blocklen, int *array_disp,
MPI_Datatype oldtype, MPI_Datatype *newtype)
blocklen[i]
disp[i]
 count – number of blocks
 array_blocklen – number of elements per block in terms of oldtype,
dimension: count.
 array_disp – displacements of each block in terms of number of elements
of oldtype, dimension: count
 oldtype – old data type
 newtype – new data type
 Data consists of count blocks of oldtype: same oldtype; different
block lengths; different spacing between blocks
 block i has length array_blocklen[i]
 Block i has displacement array_disp[i], in terms of number of oldtype
elements.
10
Example
1
2
3
4
Upper triangle of matrix A[4][4]
5
6
7
8
double A[4][4];
MPI_Datatype upper_tri;
int blocklen[4], disp[4];
int i;
for(i=0;i<4;i++) {
blocklen[i] = 4-i;
disp[i] = (4+1)*i;
}
MPI_Type_indexed(4,blocklen,disp,MPI_DOUBLE,&upper_tri);
MPI_Type_commit(&upper_tri);
MPI_Send(&A[0][0], 1, upper_tri, rank, tag, comm);
...
9
10 11 12
13 14 15 16
// Strict lower triangular
MPI_Type lower_tri;
for(i=0;i<3;i++) {
blocklen[i] = i+1;
disp[i] = (i+1)*4;
}
MPI_Type_indexed(3, blocklen, disp, MPI_DOUBLE, &lower_tri);
...
11
Type Constructor
int MPI_Type_hindexed(int count, int *array_blocklen,
MPI_Aint *array_disp,
MPI_Datatype oldtype, MPI_Datatype *newtype)
blocklen[i]
disp[i]
Same as MPI_Type_indexed, except that
array_disp is specified in terms of number of
bytes instead of number of oldtype.
Same oldtype; Different block lengths; different
spacing between blocks, displacement in terms
of bytes, instead of number of oldtype elements
12
Example
1
2
3
4
Upper triangle of matrix A[4][4]
5
6
7
8
double A[4][4];
9 10 11 12
MPI_Datatype upper_tri;
int blocklen[4];
13 14 15 16
MPI_Aint disp[4];
int i;
for(i=0;i<4;i++) {
blocklen[i] = 4-i;
disp[i] = (4+1)*i*sizeof(double);
}
MPI_Type_hindexed(4,blocklen,disp,MPI_DOUBLE,&upper_tri);
MPI_Type_commit(&upper_tri);
MPI_Send(A, 1, upper_tri, rank, tag, comm);
...
13
Address Calculation
int MPI_Address(void *location, MPI_Aint *address)
MPI_ADDRESS(location, address)
<type> location(*)
integer address
 Returns the address of the memory location (or variable)
 The difference between two addresses gives the number of bytes
between these two memory locations.
 Address is different from pointers in C/C++
 Cannot do pointer subtraction
 Pointer + (or -) an integer n  new location: n*sizeof(data-type)
// Byte displacement between two members of a struct, computed from MPI addresses.
struct _tagStudent {
    int id;
    double grade;
    char note[100];
} A_Student;
MPI_Aint addr1, addr2, disp;
MPI_Address(&A_Student.id, &addr1);
MPI_Address(&A_Student.grade, &addr2);
disp = addr2 - addr1;   // number of bytes from member id to member grade
14
Type Constructor
int MPI_Type_struct(int count, int *array_blocklen, MPI_Aint *array_disp,
MPI_Datatype *array_types, MPI_Datatype *newtype)
blocklen[i]
type[i]
disp[i]
 count – number of blocks
 array_blocklen – array, number of elements in each block, in terms of
that block's own data type (array_types[i]); dimension: count
 array_disp – array, displacements of each block, in terms of number of
bytes; dimension: count
 array_types, array, data types of each block; dimension: count
 newtype – new data type
 Different oldtype; different block lengths; different spacing between blocks,
displacement in terms of bytes
 Each block may have different data types
 Most general
15
struct _tagStudent {
int id;
double grade;
char note[100];
};
Example
struct _tagStudent Students[25];
MPI_Datatype one_student, all_students;
int block_len[3];
MPI_Datatype types[3];
MPI_Aint disp[3];
block_len[0] = block_len[1] = 1;
block_len[2] = 100;
types[0] = MPI_INT;
types[1] = MPI_DOUBLE;
types[2] = MPI_CHAR;
MPI_Address(&Students[0].id, &disp[0]); // memory address
MPI_Address(&Students[0].grade, &disp[1]);
MPI_Address(&Students[0].note[0],&disp[2]);
disp[1] = disp[1]-disp[0];
disp[2] = disp[2]-disp[0];
disp[0] = 0;
MPI_Type_struct(3, block_len, disp, types, &one_student);
MPI_Type_contiguous(25, one_student, &all_students);
MPI_Type_commit(&all_students);
MPI_Send(Students, 1, all_students, rank, tag, comm);
// MPI_Type_commit(&one_student);
// MPI_Send(Students, 25, one_student, rank, tag, comm);
...
16
Type Extent
 “Length” of a data type in terms of bytes
 E.g. double – MPI_DOUBLE – extent is 8 or sizeof(double)
 int – MPI_INT – extent is 4 (on most platforms) or sizeof(int)
 Situation more complex for derived data types; There are
two cases
 Case 1: derived data types encountered so far (no
boundary markers MPI_UB or MPI_LB)
 Distance between first byte and the last byte of data type, plus
some increment for memory alignment.
• Memory alignment: A basic data type of length n will only be
allocated in memory starting from an address of a multiple of n
{(MPI_DOUBLE,0), (MPI_CHAR, 8)}
Double – 8 bytes, byte 0-7
Char – 1 byte, byte 8
Increment – 7 bytes, to round off to next multiple of 8
Extent is: 8+1+7 = 16
17
Type Extent
 Case 2: boundary marker(s) appear in data type
definition
 Pre-defined type MPI_LB marks lower boundary of data type;
MPI_UB marks upper boundary of data type.
 Length of MPI_LB and MPI_UB is zero.
 Extent: distance between boundary markers
 If only MPI_UB appears, extent is distance between first byte and
MPI_UB
 If only MPI_LB appears, extent is distance between MPI_LB and
last byte, plus increment for memory alignment
{(MPI_DOUBLE,0) (MPI_CHAR,8) (MPI_UB,8)}
Extent of data type is 8 instead of 16.
{(MPI_LB,-8) (MPI_DOUBLE,0) (MPI_CHAR,8)}
Extent is: 8+8+1+7 = 24
{(MPI_LB,-8) (MPI_DOUBLE,0) (MPI_CHAR,8) (MPI_UB,9)}
Extent: 9+8 = 17
Can use MPI_LB and MPI_UB to modify the extent to suit one’s needs
18
Example
double A[4][4];
MPI_Datatype column;
MPI_Type_vector(4,1,4,MPI_DOUBLE, &column);
MPI_Type_commit(&column);
// Extent of column is 13*sizeof(double)=104 bytes
// Now modify extent of column to be sizeof(double)=8 using MPI_LB, MPI_UB
// Create a new type, same as column, but with extent 8
// {(column, 0) (MPI_UB, 8)}
MPI_Datatype modified_column;
MPI_Datatype types[2];
MPI_Aint disp[2];
int block_len[2];
types[0] = column;
types[1] = MPI_UB;
block_len[0] = block_len[1] = 1;
disp[0] = 0;
disp[1] = sizeof(double);
MPI_Type_struct(2, block_len, disp, types, &modified_column);
// Now modified_column is same as column, but extent is sizeof(double)=8.
19
Type Extent is Important
Concatenation of derived data types is
based on their type extent
extent
extent
A_type
B_type
MPI_Send(buf, 2, A_type, …);
or
MPI_Type_contiguous(2, A_type, &B_type);
Modify extent of A_type using MPI_UB, MPI_LB
extent
A_type
extent
B_type
20
Example
extent
extent
A_type
1.0
2.0
3.0
4.0
5.0
6.0
MPI_Send(buf,2,A_type,...)
…
Actual data send out:
4 numbers: 1.0, 3.0, 4.0, 6.0
buf
extent
extent
A_type
MPI_Send(buf,2,A_type,...)
MPI_Send(buf, 4, MPI_DOUBLE, ...)
Actual data sent out:
4 numbers: 1.0, 3.0, 2.0, 4.0
Actual data sent out:
4 numbers: 1.0, 2.0, 3.0, 4.0
21
Data arrived:
4 numbers: 1.0, 2.0, 3.0, 4.0
Example
extent
extent
A_type
1.0
2.0
3.0
4.0
MPI_Recv(buf,2,A_type,...)
…
buf
extent
extent
A_type
MPI_Recv(buf,2,A_type,...)
1.0
buf
3.0
2.0
4.0
MPI_Recv(buf, 4, MPI_DOUBLE, ...)
1.0
buf
2.0
3.0
4.0
22
Type Commit & Free
int MPI_Type_commit(MPI_Datatype *datatype)
int MPI_Type_free(MPI_Datatype *datatype)
A derived data type must be committed
before being used in communication.
Once committed, it can be used in communication
routines in the same way as pre-defined data types.
If not used any more, need to free the
derived data type
23
Type Matching
Type matching rules need to be
generalized with derived data types
New rule: the type signature of the data
sent must match the type signature of the
data type specified in the receive routine
Sequence of basic data types must match
The number of basic elements in the message sent
can be smaller than the number specified in the receive,
but the elements sent must match the leading part of the
receive's type signature.
24
Example
A
1.0
B
1.0
C
1.0
D
1.0
2.0
3.0
2.0
4.0
3.0
4.0
Cpu 0: A  cpu 1: B
Cpu 0: C  cpu 1: D
2.0
2.0
double A[4], B[8];
double C[2], D[8];
int my_rank;
...
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Datatype recv_type;
// Rank 1 receives contiguous doubles into every second element of B and D;
// rank 0 sends plain MPI_DOUBLE arrays. The type signatures match because
// both sides describe a sequence of MPI_DOUBLE.
if(my_rank==1) {
    MPI_Type_vector(4, 1, 2, MPI_DOUBLE, &recv_type);   // 4 doubles, stride 2
    MPI_Type_commit(&recv_type);                        // must commit before use
    MPI_Recv(B, 1, recv_type, 0, tag, MPI_COMM_WORLD, &stat);
    // Only 2 doubles arrive here: fewer elements than recv_type describes is allowed.
    MPI_Recv(D, 1, recv_type, 0, tag, MPI_COMM_WORLD, &stat);
    MPI_Type_free(&recv_type);
}
else if (my_rank==0) {
    MPI_Send(A, 4, MPI_DOUBLE, 1, tag, MPI_COMM_WORLD);
    MPI_Send(C, 2, MPI_DOUBLE, 1, tag, MPI_COMM_WORLD);
}
25
Example
B
A
1.0
C
1.0
2.0
3.0
2.0
3.0
double A[N][N], B[N][N], C[N];
MPI_Datatype diag;
...
MPI_Type_vector(N, 1, N+1, MPI_DOUBLE, &diag);
MPI_Type_commit(&diag);
if(my_rank==0) {
MPI_Send(&A[0][0], 1, diag, 1, tag, MPI_COMM_WORLD);
MPI_Send(&A[0][0], 1, diag, 1, tag, MPI_COMM_WORLD);
}
else if(my_rank==1) {
MPI_Recv(&B[0][0], 1, diag, 0, tag, MPI_COMM_WORLD, &stat);
MPI_Recv(&C[0], N, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &stat);
}
MPI_Type_free(&diag);
26
Example
Cpu0: A^T  cpu 1: B
B
A
1.0
2.0
3.0
1.0
4.0
7.0
4.0
5.0
6.0
2.0
5.0
8.0
7.0
8.0
9.0
3.0
6.0
9.0
double A[N][N], B[N][N];
MPI_Datatype column, mat_transpose;
...
MPI_Type_vector(N, 1, N, MPI_DOUBLE, &column);
MPI_Type_hvector(N, 1, sizeof(double), column, &mat_transpose);
// MPI_Datatype column_modified, types[2];
// int block_len[2];
// MPI_Aint disp[2];
// types[0] = column; types[1] = MPI_UB;
// block_len[0] = block_len[1] = 1;
// disp[0] = 0; disp[1] = sizeof(double);
// MPI_Type_struct(2,block_len,disp,types,&column_modified);
// MPI_Type_contiguous(N, column_modified, &mat_transpose);
MPI_Type_commit(&mat_transpose);
if(my_rank==0) {
MPI_Send(&A[0][0], N*N, MPI_DOUBLE, 1, tag, MPI_COMM_WORLD);
}
else if(my_rank==1) {
MPI_Recv(&B[0][0], 1, mat_transpose, 0, tag, MPI_COMM_WORLD, &stat);
}
MPI_Type_free(&mat_transpose);
27
Matrix Transpose Revisited
A
B
T
A11
A12
A13
A11
A21
A22
A23
A12
A31
A32
A33
A21
T
A31
T
A22
T
A32
A13
A23
A33
T
T
T
$B = A^T$
B also distributed on P cpus
Row-wise decomposition
$A_{ij}$ – (N/P)x(N/P) matrices
$B_{ij} = A_{ji}^T$
Local transpose
A11T A12T A13T
A21T A22T A23T
T
T
A – NxN matrix
Distributed on P cpus
Row-wise decomposition
All-to-all
Input:
A[i][j] = 2*i+j
A31T A32T A33T
28
Example: Matrix Transpose
0
1
2
3
0
4
0
4
4
5
6
7
1
5
1
5
0
1
2
3
2
6
2
6
4
5
6
7
3
7
3
7
Four steps:
1. Divide A into blocks;
2. Transpose each
block locally;
3. All-to-all comm;
4. Merge blocks locally;
On each cpu, A is an (N/P)xN matrix; it first needs to be re-written
as P blocks of (N/P)x(N/P) matrices, which can then be transposed
locally
A: 2x4
0
1
2
3
4
5
6
7
Two 2x2
blocks
0
1
4
5
2
3
6
7
After all-to-all comm, have P
blocks of (N/P)x(N/P) matrices;
Need to merge into a (N/P)xN
matrix
29
Transpose
A
extent
B
All-to-all
Read data column by column
Receive data block by block
Need to be careful about extent
Careful about extent
Create derived data types for send and receive; No additional local
manipulations
30
#include <stdio.h>
#include <string.h>
#include <mpi.h>
#include "dmath.h"
#define DIM 1000 // global A[DIM][DIM], B[DIM][DIM]
Matrix
Transposition
int main(int argc, char **argv)
{
int ncpus, my_rank, i, j, iblock;
int Nx, Ny; // Nx=DIM/ncpus, Ny=DIM, local array: A[Nx][Ny], B[Nx][Ny]
double **A, **B;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
if(DIM%ncpus != 0) { // make sure DIM can be divided by ncpus
if(my_rank==0)
printf("ERROR: DIM cannot be divided by ncpus!\n");
MPI_Finalize();
return -1;
}
Nx = DIM/ncpus;
Ny = DIM;
A = DMath::newD(Nx, Ny); // allocate memory
B = DMath::newD(Nx, Ny);
for(i=0;i<Nx;i++)
for(j=0;j<Ny;j++) A[i][j] = 2*(my_rank*Nx+i) + j;
memset(&B[0][0], '\0', sizeof(double)*Nx*Ny); // zero out B
31
// Create derived data types
MPI_Datatype type_send, type_recv;
MPI_Datatype type_line1, type_block;
MPI_Aint displ[2];
MPI_Datatype types[2];
int block_len[2];
MPI_Type_vector(Nx, 1, Ny, MPI_DOUBLE, &type_line1); // a column in A
types[0] = type_line1; types[1] = MPI_UB; // modify the extent of column to be 1 double
block_len[0] = block_len[1] = 1;
displ[0] = 0; displ[1] = sizeof(double);
MPI_Type_struct(2, block_len, displ, types, &type_send); // modified column
MPI_Type_commit(&type_send); // Now A is a concatenation of type_send
MPI_Type_vector(Nx, Nx, Ny, MPI_DOUBLE, &type_block); // submatrix block
types[0] = type_block; types[1] = MPI_UB; // modify extent of type_block
block_len[0] = block_len[1] = 1;
displ[0] = 0; displ[1] = Nx*sizeof(double);
MPI_Type_struct(2, block_len, displ, types, &type_recv); // modified block
MPI_Type_commit(&type_recv); // Now B is a cancatenation of type_recv
// send/recv data
MPI_Alltoall(&A[0][0], Nx, type_send, &B[0][0], 1, type_recv, MPI_COMM_WORLD);
// clean up
MPI_Type_free(&type_send);
MPI_Type_free(&type_recv);
DMath::del(A);
DMath::del(B);
MPI_Finalize();
return 0;
}
32
Download