Page-Locked Memory and CUDA Streams These notes introduce the use of multiple CUDA streams to overlap memory transfers with kernel computations. First we need to introduce page-locked memory, as streams need page-locked memory. These materials come from Chapter 10 of “CUDA by Example” by Jason Sanders and Edward Kandrot. ITCS 4/5010 CUDA Programming, UNC-Charlotte, B. Wilkinson, Feb 4, 2013 Streams.pptx 1 Page-locked host memory (also called “pinned host” memory) Page-locked memory is not paged in and out of main memory by the OS through paging but will remain resident. Allows: • Concurrent host/device memory transfers with kernel operations (Compute capability 2.x) • Host memory can be mapped to device address space (Compute capability > 1.0) • Memory bandwidth is higher • Uses real addresses rather than virtual addresses • Does not need intermediate copy buffering 2 Questions What is paging? What are real and virtual addresses? 3 Paging and virtual memory recap A process is stored as one or more distributed pages Paging Hard drive (disk) One process (application) RA = 2, VA = 46 say Page Real address – the actual physical address of the location Virtual address – the address allocated to a process by the paging/virtual memory mechanism to allow the pages to reside anywhere RA = 0, VA = 45 say Real-virtual address translation done by a look-up table, partly in hardware (translation look-aside buffer, TLB) for recently used pages and partly in software Main memory Page – a block of memory used with virtual memory Pages are transferred to and from disk to make space 4 More information in undergraduate Computer Architecture and Operating Systems courses Note on using page-locked memory Using page-locked memory will reduce the memory available to the OS for paging, so one needs to be careful in allocating it 5 Allocating page-locked memory cudaMallocHost ( void ** ptr, size_t size ) Allocates page-locked host memory that is accessible to device. 
cudaHostAlloc (void ** ptr, size_t size, unsigned int flags) Allocates page-locked host memory that is accessible to device – seems to have more options Notes: “The driver tracks the virtual memory ranges allocated with this function and automatically accelerates calls to functions such as cudaMemcpy(). Since the memory can be accessed directly by the device, it can be read or written with much higher bandwidth than pageable memory obtained with functions such as malloc().” 6 http://www.clear.rice.edu/comp422/resources/cuda/html/group__CUDART__MEMORY_g9f93d9600f4504e0d637ce b43c91ebad.html Freeing page-locked memory cudaFreeHost (void * ptr) “Frees the memory space pointed to by ptr, which must have been returned by a previous call to cudaMallocHost() or cudaHostAlloc().” Parameters: ptr - Pointer to memory to free http://www.clear.rice.edu/comp422/resources/cuda/html/group__CUDART__MEMORY_geda eb2708ad3f74d5b417ee1874ec84a.html#gedaeb2708ad3f74d5b417ee1874ec84a 7 //Pinned memory test written by Barry Wilkinson, UNC-Charlotte. Feb 10, 2011. 
Test of Pinned Memory #include <stdio.h> #include <cuda.h> #include <stdlib.h> #define SIZE (10*1024*1024) // number of bytes in arrays 10 MBytes GPU memory int main(int argc, char *argv[]) { int i; int *a; int *dev_a; // loop counter cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); // using CUDA events to measure time // create events No address translation needed (no paging) float elapsed_time_ms1, elapsed_time_ms3; /* --------------------ENTER INPUT PARAMETERS AND DATA -----------------------*/ cudaMalloc((void**)&dev_a, SIZE); // allocate memory on device /* ---------------- COPY USING PINNED MEMORY -------------------- */ cudaHostAlloc((void**)&a, SIZE ,cudaHostAllocDefault); // allocate page-locked memory on host CPU memory cudaEventRecord(start, 0); for(i = 0; i < 100; i++) { // make transfer 100 times cudaMemcpy(dev_a, a , SIZE ,cudaMemcpyHostToDevice); //copy to device cudaMemcpy(a,dev_a, SIZE ,cudaMemcpyDeviceToHost); //copy back to host } cudaEventRecord(stop, 0); // instrument code to measure end time cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsed_time_ms1, start, stop ); printf("Time to copy %d bytes of data 100 times on GPU, pinned memory: %f ms\n", SIZE, elapsed_time_ms1); // exec. time Should have used cudaFreeHost() here! Pointer a re-used on next slide 8 /* ---------------- COPY USING REGULAR MEMORY-------------------- */ a = (int*) malloc(SIZE); // allocate regular memory on host cudaEventRecord(start, 0); for(i = 0; i < 100; i++) { cudaMemcpy(dev_a, a , SIZE ,cudaMemcpyHostToDevice); //copy to device cudaMemcpy(a,dev_a, SIZE ,cudaMemcpyDeviceToHost); //copy back to host } cudaEventRecord(stop, 0); // instrument code to measure end time cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsed_time_ms3, start, stop ); printf("Time to copy %d bytes of data 100 times on GPU: %f ms\n", SIZE, elapsed_time_ms3); // exec. 
time /*--------------------------SPEEDUP ---------------------------------*/ printf("Speedup of using pinned memory = %f\n", (float) elapsed_time_ms3 / (float) elapsed_time_ms1); /* -------------- clean up ---------------------------------------*/ free(a); cudaFree(dev_a); cudaEventDestroy(start); cudaEventDestroy(stop); return 0; } 9 My code 10 Using NVIDIA bandwidthTest Coit-grid06 Coit-grid07 ./bandwidthTest Starting... bandwidthTest Starting... Running on... Running on... Device 0: Tesla C2050 Quick Mode Device 0: Tesla C2050 Quick Mode Host to Device Bandwidth, 1 Device(s), Paged memory Host to Device Bandwidth, 1 Device(s), Paged memory Transfer Size (Bytes) Bandwidth(MB/s) Transfer Size (Bytes) Bandwidth(MB/s) 33554432 1026.7 33554432 4773.7 Device to Host Bandwidth, 1 Device(s), Paged memory Device to Host Bandwidth, 1 Device(s), Paged memory Transfer Size (Bytes) Bandwidth(MB/s) Transfer Size (Bytes) Bandwidth(MB/s) 33554432 1108.1 33554432 4060.4 Device to Device Bandwidth, 1 Device(s) Transfer Size (Bytes) Bandwidth(MB/s) 33554432 84097.6 Device to Device Bandwidth, 1 Device(s) Transfer Size (Bytes) Bandwidth(MB/s) 33554432 84254.9 [bandwidthTest] - Test results: PASSED [bandwidthTest] - Test results: PASSED Press <Enter> to Quit... ----------------------------------------------------------- Press <Enter> to Quit... ----------------------------------------------------------- 11 CUDA Streams A CUDA Stream is a sequence of operations (commands) that are executed in order. Multiple CUDA streams can be created and executed together and interleaved although the “program order” is always maintained within each stream. Streams provide a mechanism to overlap memory transfer and computation operations in different streams for increased performance if sufficient resources are available. 12 Creating a stream Done by creating a stream object and associating it with a series of CUDA commands that then becomes the stream. 
CUDA commands have a stream pointer as an argument: Cannot use cudaStream_t stream1; cudaStreamCreate(&stream1); Stream cudaMemcpyAsync(…, stream1); stream1 MyKernel<<< grid, block, 0, stream1>>>(…); cudaMemcpyAsync(… , stream1); regular cudaMemcpy with streams. Need asynchronous commands for concurrent operation see next 13 cudaMemcpyAsync( …, stream) Asynchronous version of cudaMemcpy that copies data to/from host and the device May return before copy complete A stream argument specified. Needs “page-locked” memory 14 #define SIZE (N*20) … int main(void) { int *a, *b, *c; int *dev_a, *dev_b, *dev_c; Code Example Page 194-95 CUDA by Example, without error detection macros cudaMalloc( (void**)&dev_a, N * sizeof(int) ); cudaMalloc( (void**)&dev_b, N * sizeof(int) ); cudaMalloc( (void**)&dev_c, N * sizeof(int) ); One stream cudaHostAlloc((void**)&a,SIZE*sizeof(int),cudaHostAllocDefault); // page-locked cudaHostAlloc((void**)&b,SIZE*sizeof(int),cudaHostAllocDefault); cudaHostAlloc((void**)&c,SIZE*sizeof(int),cudaHostAllocDefault); for(int i=0;i<SIZE;i++) { a[i] = rand(); b[i] = rand(); } // load data for(int i=0;i < SIZE;i+= N) { // loop over data in chunks cudaMemcpyAsync(dev_a,a+i,N*sizeof(int),cudaMemcpyHostToDevice,stream); cudaMemcpyAsync(dev_b,b+i,N*sizeof(int),cudaMemcpyHostToDevice,stream); kernel<<<N/256,256,0,stream>>>(dev_a,dev_b,dev_c); cudaMemcpyAsync(c+i,dev_c,N*sizeof(int),cudaMemcpyDeviceToHost,stream); } cudaStreamSynchronize(stream); // wait for stream to finish return 0; 15 Multiple streams Assuming device can support it (can check in code if needed), create two streams with: cudaStream_t stream1, stream2; cudaStreamCreate(&stream1); cudaStreamCreate(&stream2); and then duplicate stream code for each stream 16 int *dev_a1, *dev_b1, *dev_c1; // stream 1 mem ptrs int *dev_a2, *dev_b2, *dev_c2; // stream 2 mem ptrs First attempt //stream 1 described in book cudaMalloc( (void**)&dev_a1, N * sizeof(int) ); cudaMalloc( (void**)&dev_b1, N * sizeof(int) ); 
concatenate cudaMalloc( (void**)&dev_c1, N * sizeof(int) ); statements of each //stream 2 cudaMalloc( (void**)&dev_a2, N * sizeof(int) ); stream cudaMalloc( (void**)&dev_b2, N * sizeof(int) ); cudaMalloc( (void**)&dev_c2, N * sizeof(int) ); … for(int i=0;i < SIZE;i+= N*2) { // loop over data in chunks // stream 1 cudaMemcpyAsync(dev_a1,a+i,N*sizeof(int),cudaMemcpyHostToDevice,stream1); cudaMemcpyAsync(dev_b1,b+i,N*sizeof(int),cudaMemcpyHostToDevice,stream1); kernel<<<N/256,256,0,stream1>>>(dev_a1,dev_b1,dev_c1); cudaMemcpyAsync(c+i,dev_c1,N*sizeof(int),cudaMemcpyDeviceToHost,stream1); //stream 2 cudaMemcpyAsync(dev_a2,a+i+N,N*sizeof(int),cudaMemcpyHostToDevice,stream2); cudaMemcpyAsync(dev_b2,b+i+N,N*sizeof(int),cudaMemcpyHostToDevice,stream2); kernel<<<N/256,256,0,stream2>>>(dev_a2,dev_b2,dev_c2); cudaMemcpyAsync(c+i+N,dev_c2,N*sizeof(int),cudaMemcpyDeviceToHost,stream2); } cudaStreamSynchronize(stream1); // wait for stream to finish cudaStreamSynchronize(stream2); // wait for stream to finish 17 Simply concatenating statements does not work well because of the way the GPU schedules work 18 Page 206 CUDA by Example, 19 Page 207 CUDA by Example, 20 Page 208 CUDA by Example Second attempt described in book Interleave statements of each stream for(int i=0;i < SIZE;i+= N*2) { // loop over data in chunks // interleave stream 1 and stream 2 cudaMemcpyAsync(dev_a1,a+i,N*sizeof(int),cudaMemcpyHostToDevice,stream1); cudaMemcpyAsync(dev_a2,a+i+N,N*sizeof(int),cudaMemcpyHostToDevice,stream2); cudaMemcpyAsync(dev_b1,b+i,N*sizeof(int),cudaMemcpyHostToDevice,stream1); cudaMemcpyAsync(dev_b2,b+i+N,N*sizeof(int),cudaMemcpyHostToDevice,stream2); kernel<<<N/256,256,0,stream1>>>(dev_a1,dev_b1,dev_c1); kernel<<<N/256,256,0,stream2>>>(dev_a2,dev_b2,dev_c2); cudaMemcpyAsync(c+i,dev_c1,N*sizeof(int),cudaMemcpyDeviceToHost,stream1); cudaMemcpyAsync(c+i+N,dev_c2,N*sizeof(int),cudaMemcpyDeviceToHost,stream2); } 21 Page 210 CUDA by Example 22 Questions