GMAC: Global Memory for Accelerators
Isaac Gelado, John E. Stone, Javier Cabezas, Nacho Navarro and Wen-mei W. Hwu
GTC 2010

GMAC in a nutshell
• GMAC: Unified Virtual Address Space for CUDA
  – Simplifies the CPU code
  – Exploits advanced CUDA features for free
• Vector addition example
  – A really simple kernel:

      __global__ void vector(float *c, float *a, float *b, size_t size)
      {
          int idx = threadIdx.x + blockIdx.x * blockDim.x;
          if(idx < size) c[idx] = a[idx] + b[idx];
      }

  – But what about the CPU code?

CPU CUDA code (I)
• Read from disk, transfer to the GPU and compute

      int main(int argc, char *argv[])
      {
          float *h_a, *h_b, *h_c, *d_a, *d_b, *d_c;
          size_t size = LENGTH * sizeof(float);

          assert((h_a = malloc(size)) != NULL);
          assert((h_b = malloc(size)) != NULL);
          assert((h_c = malloc(size)) != NULL);

          assert(cudaMalloc((void **)&d_a, size) == cudaSuccess);
          assert(cudaMalloc((void **)&d_b, size) == cudaSuccess);
          assert(cudaMalloc((void **)&d_c, size) == cudaSuccess);

          read_file(argv[A], h_a);
          read_file(argv[B], h_b);

          assert(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice) == cudaSuccess);
          assert(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice) == cudaSuccess);

CPU CUDA code (and II)
• Read from disk, transfer to the GPU and compute

          dim3 Db(BLOCK_SIZE);
          dim3 Dg(LENGTH / BLOCK_SIZE);
          if(LENGTH % BLOCK_SIZE) Dg.x++;
          vector<<<Dg, Db>>>(d_c, d_a, d_b, LENGTH);
          assert(cudaThreadSynchronize() == cudaSuccess);

          assert(cudaMemcpy(h_c, d_c, LENGTH * sizeof(float),
                            cudaMemcpyDeviceToHost) == cudaSuccess);
          save_file(argv[C], h_c);

          free(h_a); cudaFree(d_a);
          free(h_b); cudaFree(d_b);
          free(h_c); cudaFree(d_c);

          return 0;
      }

CPU GMAC code

      int main(int argc, char *argv[])
      {
          float *a, *b, *c;
          size_t size = LENGTH * sizeof(float);

          assert(gmacMalloc((void **)&a, size) == gmacSuccess);
          assert(gmacMalloc((void **)&b, size) == gmacSuccess);
          assert(gmacMalloc((void **)&c, size) == gmacSuccess);

          read_file(argv[A], a);
          read_file(argv[B], b);     /* There is no memory copy */

          dim3 Db(BLOCK_SIZE);
          dim3 Dg(LENGTH / BLOCK_SIZE);
          if(LENGTH % BLOCK_SIZE) Dg.x++;
          vector<<<Dg, Db>>>(c, a, b, LENGTH);
          assert(gmacThreadSynchronize() == gmacSuccess);

          save_file(argv[C], c);     /* There is no memory copy */

          gmacFree(a); gmacFree(b); gmacFree(c);

          return 0;
      }

Getting GMAC
• GMAC is at http://adsm.googlecode.com/
• Debian / Ubuntu binary and development .deb files
• UNIX (also MacOS X) source code package
• Experimental versions from the mercurial repository

Outline
• Introduction
• GMAC Memory Model
  – Asymmetric Memory
  – Global Memory
• GMAC Execution Model
  – Multi-threading
  – Inter-thread communication
• Conclusions

GMAC Memory Model
• Unified CPU / GPU virtual address space
• Asymmetric address space accessibility
[Figure: shared data is mapped in both CPU and GPU memory; CPU-private data is visible only to the CPU]

GMAC Consistency Model
• Implicit acquire / release primitives at accelerator call / return boundaries
[Figure: ownership of shared data alternates between the CPU and the accelerator at call / return boundaries]
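• A minimal sketch of what these implicit boundaries mean in user code (not taken from the slides; `scale` is a hypothetical kernel and `N`, `Dg`, `Db` are assumed to be defined):

      float *data = NULL;
      assert(gmacMalloc((void **)&data, N * sizeof(float)) == gmacSuccess);
      for(size_t i = 0; i < N; i++) data[i] = (float)i;   /* CPU owns the shared data here */

      scale<<<Dg, Db>>>(data, N);    /* kernel call acts as an implicit release: the CPU
                                        updates become visible to the accelerator */
      gmacThreadSynchronize();       /* kernel return acts as an implicit acquire: the
                                        results become visible to the CPU again */
      printf("data[0] = %f\n", data[0]);   /* no explicit cudaMemcpy() needed */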
GMAC Memory API
• Allocate shared memory
      gmacError_t gmacMalloc(void **ptr, size_t size)
  – ptr: allocated memory address (returned by reference)
  – size: size of the data to be allocated
  – Returns an error code, gmacSuccess if no error
• Example usage

      #include <gmac.h>
      int main(int argc, char *argv[])
      {
          float *foo = NULL;
          gmacError_t error;
          if((error = gmacMalloc((void **)&foo, FOO_SIZE)) != gmacSuccess)
              FATAL("Error allocating memory %s", gmacErrorString(error));
          . . .
      }

GMAC Memory API
• Release shared memory
      gmacError_t gmacFree(void *ptr)
  – ptr: memory address to be released
  – Returns an error code, gmacSuccess if no error
• Example usage

      #include <gmac.h>
      int main(int argc, char *argv[])
      {
          float *foo = NULL;
          gmacError_t error;
          if((error = gmacMalloc((void **)&foo, FOO_SIZE)) != gmacSuccess)
              FATAL("Error allocating memory %s", gmacErrorString(error));
          . . .
          gmacFree(foo);
      }

GMAC Unified Address Space
• Use fixed-size segments to map accelerator memory
• Implement and export Accelerator Virtual Memory
[Figure: the same virtual address (e.g. 0x00100000) refers to a segment in system memory and its counterpart in accelerator memory]

GMAC Memory API
• Translate shared memory addresses (multi-GPU)
      void *gmacPtr(void *ptr)
      template<typename T> T *gmacPtr(T *ptr)
  – Receives a CPU memory address
  – Returns the corresponding GPU memory address
• Example usage

      #include <gmac.h>
      int main(int argc, char *argv[])
      {
          . . .
          kernel<<<Dg, Db>>>(gmacPtr(buffer), size);
          . . .
      }

GMAC Example Code (I)

      int fdtd(FILE *fpMat, FILE *fpMed, int N)
      {
          /* Read and create data structures */
          MaterialList materials;
          if(readMaterials(fpMat, materials) == 0) return -1;
          Media media;
          if(readMedia(fpMed, media) == 0) return -1;
          Field field;
          if(createField(media.dim, field) == 0) return -1;

          for(int n = 0; n < N; n++) {
              . . .
              updateElectric<<<Dg, Db>>>(materials, media, field);
              . . .
              n++;
              updateMagnetic<<<Dg, Db>>>(materials, media, field);
              . . .
          }
      }

GMAC Example Code (II)

      typedef struct { float ke[3][3], km[3][3]; } Material;
      typedef struct { size_t n; Material *data; } MaterialList;

      /* Read materials from disk */
      size_t readMaterials(FILE *fp, MaterialList &list)
      {
          uint16_t n = 0;
          fread(&n, sizeof(n), 1, fp);
          gmacError_t ret = gmacMalloc((void **)&list.data, n * sizeof(Material));
          if(ret != gmacSuccess) return 0;
          fread(list.data, sizeof(Material), n, fp);
          return n;
      }

      /* Read media description from file */
      typedef struct { dim3 dim; uint16_t *data; } Media;
      void readMedia(FILE *fp, Media &media);

      /* Allocate an electromagnetic field */
      typedef struct { dim3 dim; float3 *e; float3 *h; float3 *p; float3 *m; } Field;
      void allocateField(Field &f, dim3 dim);

GMAC I/O Handling
• Functions overridden (interposition) by GMAC:
  – Memory: memset(), memcpy()
  – I/O: fread(), fwrite(), read(), write()
  – MPI: MPI_Send(), MPI_Recv()
• Get advanced CUDA features for free
  – Asynchronous copies to device memory
  – Pinned memory for I/O transfers
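• A hedged illustration (not from the original deck) of what interposition buys: a GMAC pointer can be handed straight to stdio and MPI calls, with no staging copy through a separate host buffer. Here `compute` is a hypothetical kernel, `result` comes from gmacMalloc(), and `fp` / `peer` are an already-open file and an MPI rank assumed to exist:

      compute<<<Dg, Db>>>(result, n);        /* hypothetical kernel filling `result` */
      gmacThreadSynchronize();

      fwrite(result, sizeof(float), n, fp);  /* interposed: data streams from the GPU
                                                through pinned buffers, no extra copy */
      MPI_Send(result, n, MPI_FLOAT, peer,   /* interposed: same for MPI messages */
               0, MPI_COMM_WORLD);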
GMAC Example Code (III)

      __global__ void updateElectric(MaterialList mats, Media media, Field f)
      {
          int Idx = threadIdx.x + blockDim.x * blockIdx.x;
          int Idy = threadIdx.y + blockDim.y * blockIdx.y;
          for(int Idz = 0; Idz < f.dim.z; Idz++) {
              int pos = Idx + Idy * f.dim.x + Idz * f.dim.x * f.dim.y;
              float3 E = f.e[pos];
              Material m = mats.data[media.data[pos]];
              float3 P;
              P.x = E.x * m.ke[0][0] + E.y * m.ke[0][1] + E.z * m.ke[0][2];
              P.y = E.x * m.ke[1][0] + E.y * m.ke[1][1] + E.z * m.ke[1][2];
              P.z = E.x * m.ke[2][0] + E.y * m.ke[2][1] + E.z * m.ke[2][2];
              f.p[pos] = P;
          }
      }

Outline
• Introduction
• GMAC Memory Model
  – Asymmetric Memory
  – Global Memory
• GMAC Execution Model
  – Multi-threading
  – Inter-thread communication
• Conclusions

GMAC Global Memory
• For multi-GPU systems
• Data accessible by all accelerators, but owned by the CPU
[Figure: data held in CPU memory and visible from every GPU]

GMAC Global Memory API
• Allocate global shared memory
      gmacError_t gmacGlobalMalloc(void **ptr, size_t size)
  – ptr: allocated memory address (returned by reference)
  – size: size of the data to be allocated
  – Returns an error code, gmacSuccess if no error
• Example usage

      #include <gmac.h>
      int main(int argc, char *argv[])
      {
          float *foo = NULL;
          gmacError_t error;
          if((error = gmacGlobalMalloc((void **)&foo, FOO_SIZE)) != gmacSuccess)
              FATAL("Error allocating memory %s", gmacErrorString(error));
          . . .
      }

GMAC Example Code (I)
• Same reader as before; the only change is gmacGlobalMalloc() instead of gmacMalloc()

      typedef struct { float ke[3][3], km[3][3]; } Material;
      typedef struct { size_t n; Material *data; } MaterialList;

      /* Read materials from disk */
      size_t readMaterials(FILE *fp, MaterialList &list)
      {
          uint16_t n = 0;
          fread(&n, sizeof(n), 1, fp);
          gmacError_t ret = gmacGlobalMalloc((void **)&list.data, n * sizeof(Material));
          if(ret != gmacSuccess) return 0;
          fread(list.data, sizeof(Material), n, fp);
          return n;
      }

      /* Read media description from file */
      typedef struct { dim3 dim; uint16_t *data; } Media;
      void readMedia(FILE *fp, Media &media);

      /* Allocate an electromagnetic field */
      typedef struct { dim3 dim; float3 *e; float3 *h; float3 *p; float3 *m; } Field;
      void allocateField(Field &f, dim3 dim);

Outline
• Introduction
• GMAC Memory Model
  – Asymmetric Memory
  – Global Memory
• GMAC Execution Model
  – Multi-threading
  – Inter-thread communication
• Conclusions

GMAC and Multi-threading
• In the past, one host thread had one CPU
• In GMAC, each host thread has:
  – One CPU
  – One GPU
• A GMAC thread runs either on the GPU or on the CPU, but not on both at the same time
• Create threads using what you already know
  – pthread_create(...)

GMAC and Multi-threading
• Virtual memory accessibility:
  – Complete address space in CPU mode
  – Partial address space in GPU mode
[Figure: address space visibility in CPU mode vs. GPU mode]

Getting Full-duplex PCIe
• Use multi-threading to fully utilize the PCIe bus
  – One CPU thread launches kernels
  – One CPU thread writes to shared memory
  – One CPU thread reads from shared memory
[Figure: simultaneous transfers in both directions between system memory and GPU memory over PCIe]

Outline
• Introduction
• GMAC Memory Model
  – Asymmetric Memory
  – Global Memory
• GMAC Execution Model
  – Multi-threading
  – Inter-thread communication
• Conclusions

GPU Handoff and Copying
• GPU handoff:
  – Send the thread's virtual GPU to another thread
  – Do not move data, move computation
• API calls
  – Virtual GPU sending:    gmacError_t gmacSend(thread_id dest)
  – Virtual GPU receiving:  gmacError_t gmacReceive()
  – Virtual GPU copying:    gmacError_t gmacCopy(thread_id dest)
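• A minimal sketch (not in the original deck) of a handoff between two host threads; it assumes the consumer's `thread_id` has been recorded at startup, and that `stage1` / `stage2` are hypothetical kernels operating on a buffer previously allocated with gmacMalloc():

      #include <pthread.h>
      #include <gmac.h>

      static thread_id consumer_id;      /* assumed to be filled in by the consumer thread */
      static float *data;                /* shared buffer from gmacMalloc() */

      void *producer(void *arg)
      {
          stage1<<<Dg, Db>>>(data);      /* first pipeline stage on this thread's GPU */
          gmacThreadSynchronize();
          gmacSend(consumer_id);         /* hand the virtual GPU over; the data stays put */
          return NULL;
      }

      void *consumer(void *arg)
      {
          gmacReceive();                 /* wait until a virtual GPU is handed to us */
          stage2<<<Dg, Db>>>(data);      /* second stage, same GPU, same data, no copy */
          gmacThreadSynchronize();
          return NULL;
      }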
Virtual GPUs Use Case
• Exploit data locality in the CPU and GPU
• Example: MPEG-4 encoder
  – Each GMAC thread executes one stage
  – Then moves to the GPU where the input data is
[Figure: pipeline of virtual GPUs running Motion Estimation, DCT and Quantization, Motion Compensation, Dequantization and IDCT]

Outline
• Introduction
• GMAC Memory Model
  – Asymmetric Memory
  – Global Memory
• GMAC Execution Model
  – Multi-threading
  – Inter-thread communication
• Conclusions

GMAC Performance
[Figure: performance results]

GMAC on Actual Applications (I)
• Reverse Time Migration (BSC / Repsol)
  – Six months, one programmer
  – Currently in use by Repsol
• Single-GPU using the CUDA run-time
  – Can live with it: double allocations, memory consistency
  – Nightmare: overlapping GPU computation and data transfers (CUDA streams and double-buffering with pinned memory)
• Multi-GPU using the CUDA run-time
  – Can live with it: lack of an IDE for Linux
  – Nightmare: everything else

GMAC on Actual Applications (II)
• Multi-GPU using GMAC:
  – Double-buffering and pinned memory for free
    • Disk transfers
    • GPU to GPU (inter-domain) communication
    • MPI communication
  – Clean threading model
    • One task per CPU thread
    • Well-known synchronization primitives
  – Took less time to develop than the single-GPU version

Conclusions
• Single virtual address space for CPUs and GPUs
• Uses advanced CUDA features
  – Automatic overlap of data communication and computation
  – Access to any GPU from any CPU thread
• Get more performance from your application more easily
• Go: http://adsm.googlecode.com

Future Features
• OpenCL and Windows 7 support coming soon
• Data-dependence tracking:
  – Avoid transferring data to the GPU when it is not used by kernels
  – Avoid transferring data to the CPU when it is not modified by kernels
• Global shared memory partitioning between multiple GPUs

GMAC: Global Memory for Accelerators
http://adsm.googlecode.com

Backup Slides

GMAC Advanced Free Features
• Get advanced CUDA features for free
  – Asynchronous data transfers
  – Pinned memory
[Figure: asynchronous copies to device memory; pinned memory for I/O transfers]

GMAC Unified Address Space
• When allocating memory:
  1. Allocate accelerator memory
  2. Allocate CPU memory at the same virtual address
[Figure: matching virtual addresses in system memory and accelerator memory]

Lazy-Update Data Transfers
• Avoid unnecessary data copies
• Lazy-update:
  – Call: transfer modified data
  – Return: transfer when needed
[Figure: data copied to accelerator memory at kernel call and back to system memory on demand]

Rolling-Update Data Transfers
• Overlap CPU execution and data transfers
• Minimal transfer on demand
• Rolling-update:
  – Memory-block size granularity
[Figure: block-granularity transfers overlapped with CPU execution]

GMAC: Global Memory for Accelerators
http://adsm.googlecode.com