Why Derived Data Types j A[100][80][50] struct _tagStudent { int id; double grade; char note[100]; }; i struct _tagStudent Students[25]; k Surface [i][j][0] Message data contains different data types Can use several separate messages performance may not be good Message data involves non-contiguous memory locations Can copy non-contiguous data to a contiguous storage, then communicate additional memory copies 1 Derived Data Type MPI’s solution: derived data type No additional memory copy Transfer directly of data with various shape and size Idea: Specify the memory layout of data and corresponding basic data types. Usage: Construct derived data type Commit derived data type Use it in communication routines where MPI_Datatype argument is required. Free derived data type 2 Type Map & Type Signature A general data type consists of A sequence of basic data types A sequence of byte displacements Type map: sequence of pairs (basic data type, displacement) for the general data type E.g. double A[2] {(MPI_DOUBLE,0), (MPI_DOUBLE,8)} _tagStudent {(MPI_INT,0), (MPI_DOUBLE,8), (MPI_CHAR,16), …} Type signature: sequence of basic data types for the general data type E.g. double A[2] {MPI_DOUBLE, MPI_DOUBLE} _tagStudent {MPI_INT, MPI_DOUBLE, MPI_CHAR …} 3 Communication Buffer Given a type map {(type0,disp0),(type1,disp1)} and base address buf, the communication buffer: Consists of 2 entries 1st entry at address buf+disp0, of type type0; 2nd entry at address buf+disp1, of type type1. E.g. double A[2] 1st entry at A, of type MPI_DOUBLE; 2nd entry at A+8, of type MPI_DOUBLE. 
If type map contains n entries similar semantics 4 Type Constructor int MPI_Type_contiguous(int count, MPI_Datatype oldtype, MPI_Datatype *newtype) MPI_TYPE_CONTIGUOUS(COUNT, OLDTYPE, NEWTYPE, IERROR) integer COUNT, OLDTYPE, NEWTYPE, IERROR newtype is a concatenation of count copies of oldtype oldtype can be a basic data type or a derived data type j A[100][80][50] Surface: A[0][:][:] MPI_Datatype face_jk; MPI_Type_contiguous(80*50, MPI_DOUBLE, &face_jk); MPI_Type_commit(&face_jk); MPI_Send(&A[0][0][0],1,face_jk,rank,tag,comm); // MPI_Send(&A[0][0][0],80*50,MPI_DOUBLE,rank,tag,comm); MPI_Send(&A[99][0][0],1,face_jk,rank,tag,comm); ... MPI_Type_free(&face_jk); i k 5 Type Constructor int MPI_Type_vector(int count, int blocklength, int stride, MPI_Datatype oldtype, MPI_Datatype *newtype) blocklength stride count – number of blocks blocklength – number of elements in each block, in terms of oldtype stride – number of elements between start of each block, in terms of oldtype oldtype – old data type, can be basic or derived data type newtype – created new data type Data consists of equally spaced blocks: same oldtype, same block length, same spacing in terms of oldtype Each block is a concatenation of blocklength copies of old datatype Spacing between blocks is stride number of oldtype. 6 Example double A[4][4]; MPI_Datatype column; MPI_Type_vector(4,1,4,MPI_DOUBLE, &column); MPI_Type_commit(&column); MPI_Send(&A[0][1],1,column,rank,tag,comm); MPI_Send(&A[0][3],1,column, rank, tag, comm); ... A[4][4] Surface: A[:][0][:] j double A[100][80][50]; MPI_Datatype face_ik; A[100][80][50] MPI_Type_vector(100,50,80*50,MPI_DOUBLE,&face_ik); MPI_Type_commit(&face_ik); MPI_Send(&A[0][0][0],1,face_ik,rank,tag,comm); MPI_Send(&A[0][1][0],1,face_ik,rank,tag,comm); MPI_Send(&A[0][79][0],1,face_ik,rank,tag,comm); ... 
i k 7 Type Constructor int MPI_Type_hvector(int count, int blocklength, MPI_Aint stride, MPI_Datatype oldtype, MPI_Datatype *newtype) blocklength stride Same as MPI_Type_vector, except that stride is in terms of number of bytes, not number of elements of oldtype. blocklength is still in terms of number of elements of oldtype. Same oldtype in different blocks; same block lengths; same spacing between neighboring blocks, but in terms of bytes (not in terms of oldtype) 8 Example double A[4][4]; MPI_Datatype column; MPI_Type_hvector(4,1,4*sizeof(double), MPI_DOUBLE, &column); MPI_Type_commit(&column); MPI_Send(&A[0][1],1,column,rank,tag,comm); ... A[4][4] Surface: A[:][:][49] j double A[100][80][50]; MPI_Datatype face_ij, line_j; A[100][80][50] MPI_Type_vector(80,1,50,MPI_DOUBLE,&line_j); MPI_Type_hvector(100,1,80*50*sizeof(double), line_j, &face_ij); MPI_Type_commit(&face_ij); MPI_Send(&A[0][0][49],1,face_ij,rank,tag,comm); ... i k 9 Type Constructor int MPI_Type_indexed(int count, int *array_blocklen, int *array_disp, MPI_Datatype oldtype, MPI_Datatype *newtype) blocklen[i] disp[i] count – number of blocks array_blocklen – number of elements per block in terms of oldtype, dimension: count. array_disp – displacements of each block in terms of number of elements of oldtype, dimension: count oldtype – old data type newtype – new data type Data consists of count blocks of oldtype: same oldtype; different block lengths; different spacing between blocks. Block i has length array_blocklen[i]. Block i has displacement array_disp[i], in terms of number of oldtype elements. 10 Example 1 2 3 4 Upper triangle of matrix A[4][4] 5 6 7 8 double A[4][4]; MPI_Datatype upper_tri; int blocklen[4], disp[4]; int i; for(i=0;i<4;i++) { blocklen[i] = 4-i; disp[i] = (4+1)*i; } MPI_Type_indexed(4,blocklen,disp,MPI_DOUBLE,&upper_tri); MPI_Type_commit(&upper_tri); MPI_Send(&A[0][0], 1, upper_tri, rank, tag, comm); ... 
9 10 11 12 13 14 15 16 // Strict lower triangular MPI_Datatype lower_tri; for(i=0;i<3;i++) { blocklen[i] = i+1; disp[i] = (i+1)*4; } MPI_Type_indexed(3, blocklen, disp, MPI_DOUBLE, &lower_tri); ... 11 Type Constructor int MPI_Type_hindexed(int count, int *array_blocklen, MPI_Aint *array_disp, MPI_Datatype oldtype, MPI_Datatype *newtype) blocklen[i] disp[i] Same as MPI_Type_indexed, except that array_disp is specified in terms of number of bytes instead of number of oldtype elements. Same oldtype; different block lengths; different spacing between blocks, displacement in terms of bytes, instead of number of oldtype elements 12 Example Upper triangle of matrix A[4][4]: $\begin{pmatrix} 1 & 2 & 3 & 4 \\ 5 & 6 & 7 & 8 \\ 9 & 10 & 11 & 12 \\ 13 & 14 & 15 & 16 \end{pmatrix}$ double A[4][4]; MPI_Datatype upper_tri; int blocklen[4]; MPI_Aint disp[4]; int i; for(i=0;i<4;i++) { blocklen[i] = 4-i; disp[i] = (4+1)*i*sizeof(double); } MPI_Type_hindexed(4,blocklen,disp,MPI_DOUBLE,&upper_tri); MPI_Type_commit(&upper_tri); MPI_Send(A, 1, upper_tri, rank, tag, comm); ... 13 Address Calculation int MPI_Address(void *location, MPI_Aint *address) MPI_ADDRESS(location, address) <type> location(*) integer address Returns the address of the memory location (or variable) The difference between two addresses gives the number of bytes between these two memory locations. 
Address is different from pointers in C/C++. Cannot do pointer subtraction. Pointer + (or -) an integer n gives a new location offset by n*sizeof(data-type) bytes. struct _tagStudent{ int id; double grade; char note[100]; } A_Student; MPI_Aint addr1, addr2, disp; MPI_Address(&A_Student.id, &addr1); MPI_Address(&A_Student.grade, &addr2); disp = addr2 - addr1; 14 Type Constructor int MPI_Type_struct(int count, int *array_blocklen, MPI_Aint *array_disp, MPI_Datatype *array_types, MPI_Datatype *newtype) blocklen[i] type[i] disp[i] count – number of blocks array_blocklen – array, number of elements in each block, in terms of that block's data type; dimension: count array_disp – array, displacements of each block, in terms of number of bytes; dimension: count array_types – array, data types of each block; dimension: count newtype – new data type Different oldtype; different block lengths; different spacing between blocks, displacement in terms of bytes Each block may have different data types Most general 15 Example struct _tagStudent { int id; double grade; char note[100]; }; struct _tagStudent Students[25]; MPI_Datatype one_student, all_students; int block_len[3]; MPI_Datatype types[3]; MPI_Aint disp[3]; block_len[0] = block_len[1] = 1; block_len[2] = 100; types[0] = MPI_INT; types[1] = MPI_DOUBLE; types[2] = MPI_CHAR; MPI_Address(&Students[0].id, &disp[0]); // memory address MPI_Address(&Students[0].grade, &disp[1]); MPI_Address(&Students[0].note[0],&disp[2]); disp[1] = disp[1]-disp[0]; disp[2] = disp[2]-disp[0]; disp[0] = 0; MPI_Type_struct(3, block_len, disp, types, &one_student); MPI_Type_contiguous(25, one_student, &all_students); MPI_Type_commit(&all_students); MPI_Send(Students, 1, all_students, rank, tag, comm); // MPI_Type_commit(&one_student); // MPI_Send(Students, 25, one_student, rank, tag, comm); ... 16 Type Extent “Length” of a data type in terms of bytes E.g. 
double – MPI_DOUBLE – extent is 8 or sizeof(double) int – MPI_INT – extent is 4 (typically) or sizeof(int) Situation more complex for derived data types; There are two cases Case 1: derived data types encountered so far (no boundary markers MPI_UB or MPI_LB) Distance between first byte and the last byte of data type, plus some increment for memory alignment. • Memory alignment: A basic data type of length n will only be allocated in memory starting from an address of a multiple of n {(MPI_DOUBLE,0), (MPI_CHAR,8)} double – 8 bytes, byte 0-7 char – 1 byte, byte 8 Increment – 7 bytes, to round off to next multiple of 8 Extent is: 8+1+7 = 16 17 Type Extent Case 2: boundary marker(s) appear in data type definition Pre-defined type MPI_LB marks lower boundary of data type; MPI_UB marks upper boundary of data type. Length of MPI_LB and MPI_UB is zero. Extent: distance between boundary markers If only MPI_UB appears, extent is distance between first byte and MPI_UB If only MPI_LB appears, extent is distance between MPI_LB and last byte, plus increment for memory alignment {(MPI_DOUBLE,0), (MPI_CHAR,8), (MPI_UB,8)} Extent of data type is 8 instead of 16. 
{(MPI_LB,-8), (MPI_DOUBLE,0), (MPI_CHAR,8)} Extent is: 8+8+1+7 = 24 {(MPI_LB,-8), (MPI_DOUBLE,0), (MPI_CHAR,8), (MPI_UB,9)} Extent: 9+8 = 17 Can use MPI_LB and MPI_UB to modify the extent to suit one’s needs 18 Example double A[4][4]; MPI_Datatype column; MPI_Type_vector(4,1,4,MPI_DOUBLE, &column); MPI_Type_commit(&column); // Extent of column is 13*sizeof(double)=104 bytes // Now modify extent of column to be sizeof(double)=8 using MPI_LB, MPI_UB // Create a new type, same as column, but with extent 8 // {(column, 0) (MPI_UB, 8)} MPI_Datatype modified_column; MPI_Datatype types[2]; MPI_Aint disp[2]; int block_len[2]; types[0] = column; types[1] = MPI_UB; block_len[0] = block_len[1] = 1; disp[0] = 0; disp[1] = sizeof(double); MPI_Type_struct(2, block_len, disp, types, &modified_column); // Now modified_column is same as column, but extent is sizeof(double)=8. 19 Type Extent is Important Concatenation of derived data types is based on their type extent extent extent A_type B_type MPI_Send(buf, 2, A_type, …); or MPI_Type_contiguous(2, A_type, &B_type); Modify extent of A_type using MPI_UB, MPI_LB extent A_type extent B_type 20 Example extent extent A_type 1.0 2.0 3.0 4.0 5.0 6.0 MPI_Send(buf,2,A_type,...) … Actual data sent out: 4 numbers: 1.0, 3.0, 4.0, 6.0 buf extent extent A_type MPI_Send(buf,2,A_type,...) MPI_Send(buf, 4, MPI_DOUBLE, ...) Actual data sent out: 4 numbers: 1.0, 3.0, 2.0, 4.0 Actual data sent out: 4 numbers: 1.0, 2.0, 3.0, 4.0 21 Data arrived: 4 numbers: 1.0, 2.0, 3.0, 4.0 Example extent extent A_type 1.0 2.0 3.0 4.0 MPI_Recv(buf,2,A_type,...) … buf extent extent A_type MPI_Recv(buf,2,A_type,...) 1.0 buf 3.0 2.0 4.0 MPI_Recv(buf, 4, MPI_DOUBLE, ...) 1.0 buf 2.0 3.0 4.0 22 Type Commit & Free int MPI_Type_commit(MPI_Datatype *datatype) int MPI_Type_free(MPI_Datatype *datatype) A derived data type must be committed before being used in communication. Once committed, it can be used in communication routines the same as pre-defined data types. 
If not used any more, need to free the derived data type 23 Type Matching Type matching rules need to be generalized with derived data types New rule: the type signature of the data sent must match the type signature of that specified in the receive routine Sequence of basic data types must match Number of basic elements in the message sent can be smaller than that specified in the receive, but the types of the elements sent must match. 24 Example A 1.0 B 1.0 C 1.0 D 1.0 2.0 3.0 2.0 4.0 3.0 4.0 Cpu 0: A cpu 1: B Cpu 0: C cpu 1: D 2.0 2.0 double A[4], B[8]; double C[2], D[8]; int my_rank; ... MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); MPI_Datatype recv_type; if(my_rank==1) { MPI_Type_vector(4, 1, 2, MPI_DOUBLE, &recv_type); MPI_Type_commit(&recv_type); MPI_Recv(B, 1, recv_type, 0, tag, MPI_COMM_WORLD, &stat); MPI_Recv(D, 1, recv_type, 0, tag, MPI_COMM_WORLD, &stat); MPI_Type_free(&recv_type); } else if (my_rank==0) { MPI_Send(A, 4, MPI_DOUBLE, 1, tag, MPI_COMM_WORLD); MPI_Send(C, 2, MPI_DOUBLE, 1, tag, MPI_COMM_WORLD); } 25 Example B A 1.0 C 1.0 2.0 3.0 2.0 3.0 double A[N][N], B[N][N], C[N]; MPI_Datatype diag; ... MPI_Type_vector(N, 1, N+1, MPI_DOUBLE, &diag); MPI_Type_commit(&diag); if(my_rank==0) { MPI_Send(&A[0][0], 1, diag, 1, tag, MPI_COMM_WORLD); MPI_Send(&A[0][0], 1, diag, 1, tag, MPI_COMM_WORLD); } else if(my_rank==1) { MPI_Recv(&B[0][0], 1, diag, 0, tag, MPI_COMM_WORLD, &stat); MPI_Recv(&C[0], N, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &stat); } MPI_Type_free(&diag); 26 Example Cpu0: A^T cpu 1: B B A 1.0 2.0 3.0 1.0 4.0 7.0 4.0 5.0 6.0 2.0 5.0 8.0 7.0 8.0 9.0 3.0 6.0 9.0 double A[N][N], B[N][N]; MPI_Datatype column, mat_transpose; ... 
MPI_Type_vector(N, 1, N, MPI_DOUBLE, &column); MPI_Type_hvector(N, 1, sizeof(double), column, &mat_transpose); // MPI_Datatype column_modified, types[2]; // int block_len[2]; // MPI_Aint disp[2]; // types[0] = column; types[1] = MPI_UB; // block_len[0] = block_len[1] = 1; // disp[0] = 0; disp[1] = sizeof(double); // MPI_Type_struct(2,block_len,disp,types,&column_modified); // MPI_Type_contiguous(N, column_modified, &mat_transpose); MPI_Type_commit(&mat_transpose); if(my_rank==0) { MPI_Send(&A[0][0], N*N, MPI_DOUBLE, 1, tag, MPI_COMM_WORLD); } else if(my_rank==1) { MPI_Recv(&B[0][0], 1, mat_transpose, 0, tag, MPI_COMM_WORLD, &stat); } MPI_Type_free(&mat_transpose); 27 Matrix Transpose Revisited $A = \begin{pmatrix} A_{11} & A_{12} & A_{13} \\ A_{21} & A_{22} & A_{23} \\ A_{31} & A_{32} & A_{33} \end{pmatrix}$, $B = \begin{pmatrix} A_{11}^T & A_{21}^T & A_{31}^T \\ A_{12}^T & A_{22}^T & A_{32}^T \\ A_{13}^T & A_{23}^T & A_{33}^T \end{pmatrix}$ $B = A^T$ B also distributed on P cpus Row-wise decomposition $A_{ij}$ – (N/P)x(N/P) matrices $B_{ij} = A_{ji}^T$ Local transpose: $\begin{pmatrix} A_{11}^T & A_{12}^T & A_{13}^T \\ A_{21}^T & A_{22}^T & A_{23}^T \\ A_{31}^T & A_{32}^T & A_{33}^T \end{pmatrix}$ A – NxN matrix Distributed on P cpus Row-wise decomposition All-to-all Input: A[i][j] = 2*i+j 28 Example: Matrix Transpose 0 1 2 3 0 4 0 4 4 5 6 7 1 5 1 5 0 1 2 3 2 6 2 6 4 5 6 7 3 7 3 7 Four steps: 1. Divide A into blocks; 2. Transpose each block locally; 3. All-to-all comm; 4. 
Merge blocks locally; On each cpu, A is (N/P)xN matrix; First need to re-write it as P blocks of (N/P)x(N/P) matrices, then can do local transpose A: 2x4 0 1 2 3 4 5 6 7 Two 2x2 blocks 0 1 4 5 2 3 6 7 After all-to-all comm, have P blocks of (N/P)x(N/P) matrices; Need to merge into a (N/P)xN matrix 29 Transpose A extent B All-to-all Read data column by column Receive data block by block Need to be careful about extent Careful about extent Create derived data types for send and receive; No additional local manipulations 30 Matrix Transposition #include <stdio.h> #include <string.h> #include <mpi.h> #include "dmath.h" #define DIM 1000 // global A[DIM][DIM], B[DIM][DIM] int main(int argc, char **argv) { int ncpus, my_rank, i, j, iblock; int Nx, Ny; // Nx=DIM/ncpus, Ny=DIM, local array: A[Nx][Ny], B[Nx][Ny] double **A, **B; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); MPI_Comm_size(MPI_COMM_WORLD, &ncpus); if(DIM%ncpus != 0) { // make sure DIM can be divided by ncpus if(my_rank==0) printf("ERROR: DIM cannot be divided by ncpus!\n"); MPI_Finalize(); return -1; } Nx = DIM/ncpus; Ny = DIM; A = DMath::newD(Nx, Ny); // allocate memory B = DMath::newD(Nx, Ny); for(i=0;i<Nx;i++) for(j=0;j<Ny;j++) A[i][j] = 2*(my_rank*Nx+i) + j; memset(&B[0][0], '\0', sizeof(double)*Nx*Ny); // zero out B 31 // Create derived data types MPI_Datatype type_send, type_recv; MPI_Datatype type_line1, type_block; MPI_Aint displ[2]; MPI_Datatype types[2]; int block_len[2]; MPI_Type_vector(Nx, 1, Ny, MPI_DOUBLE, &type_line1); // a column in A types[0] = type_line1; types[1] = MPI_UB; // modify the extent of column to be 1 double block_len[0] = block_len[1] = 1; displ[0] = 0; displ[1] = sizeof(double); MPI_Type_struct(2, block_len, displ, types, &type_send); // modified column MPI_Type_commit(&type_send); // Now A is a concatenation of type_send MPI_Type_vector(Nx, Nx, Ny, MPI_DOUBLE, &type_block); // submatrix block types[0] = type_block; types[1] = MPI_UB; // modify extent of type_block 
block_len[0] = block_len[1] = 1; displ[0] = 0; displ[1] = Nx*sizeof(double); MPI_Type_struct(2, block_len, displ, types, &type_recv); // modified block MPI_Type_commit(&type_recv); // Now B is a concatenation of type_recv // send/recv data MPI_Alltoall(&A[0][0], Nx, type_send, &B[0][0], 1, type_recv, MPI_COMM_WORLD); // clean up MPI_Type_free(&type_send); MPI_Type_free(&type_recv); DMath::del(A); DMath::del(B); MPI_Finalize(); return 0; } 32