Додаток А Лістинг програми для порівняння швидкодії CPU та GPU

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <conio.h>
#include <Windows.h>

// Capacity of the kernel's statically sized shared-memory buffers; the host
// wrapper refuses to launch more threads than this.
static const unsigned int kMaxElements = 1000;

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

// Computes c[i] = (a[i] / b[i]) * 2 for i = threadIdx.x.
//
// Launch layout: a single 1-D block, one thread per element (blockIdx is not
// used). Uses 3 * kMaxElements * sizeof(int) bytes of static shared memory.
// The buffers are int so the result matches the integer arithmetic of the CPU
// reference loop in main().
// Every thread touches only its own shared-memory slot; the barriers are kept
// from the original listing and sit outside the guarded statements so that all
// threads of the block reach them.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    __shared__ int as[kMaxElements];
    __shared__ int bs[kMaxElements];
    __shared__ int cs[kMaxElements];

    const unsigned int tx = threadIdx.x;
    // int ty=threadIdx.y;
    // int tz=threadIdx.z;

    // Threads beyond the shared-buffer capacity must not index the buffers.
    const bool inRange = tx < kMaxElements;

    if (inRange) {
        as[tx] = a[tx];
        bs[tx] = b[tx];
    }
    __syncthreads();

    if (inRange)
        cs[tx] = as[tx] / bs[tx];
    __syncthreads();

    if (inRange)
        cs[tx] = cs[tx] * 2;
    __syncthreads();

    if (inRange)
        c[tx] = cs[tx];
}

// Times the computation on the CPU, runs the same computation on the GPU via
// addWithCuda(), and warns if the two results differ.
// Returns 0 on success, 1 if any CUDA step fails.
int main()
{
    const int arraySize = 1000;
    int a[arraySize];
    int b[arraySize];
    int c[arraySize];   // GPU result
    int cc[arraySize];  // CPU reference result
    int j;

    for (j = 0; j < arraySize; j++) {
        a[j] = 8;
        b[j] = 3;
    }

    // GetTickCount() returns a DWORD (milliseconds since system start).
    const DWORD start = GetTickCount();
    for (j = 0; j < arraySize; j++) {
        cc[j] = a[j] / b[j];
        cc[j] = cc[j] * 2;
    }
    printf("CPU compute time: %lu\n", (unsigned long)(GetTickCount() - start));

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    // Compare the GPU output with the CPU reference; report the first mismatch.
    for (j = 0; j < arraySize; j++) {
        if (c[j] != cc[j]) {
            fprintf(stderr, "Result mismatch at index %d: GPU %d, CPU %d\n", j, c[j], cc[j]);
            break;
        }
    }

    /* for(j=0; j<arraySize; j++)
           printf("%3d",c[j]);*/

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    _getch();
    return 0;
}

// Helper function for using CUDA to run addKernel on `size` elements.
//
// c    - host output buffer, `size` ints
// a, b - host input buffers, `size` ints each
// size - element count; must be in 1..kMaxElements because the kernel runs as
//        one block with fixed-size shared buffers
//
// Prints the kernel time and the device-to-host copy time measured with CUDA
// events, and waits for a key press after the kernel time is shown.
// Returns cudaSuccess, or the first CUDA error encountered
// (cudaErrorInvalidValue for an out-of-range size).
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;
    float gpuTime = 0.0f;
    // Null-initialised so the cleanup path can tell whether they were created.
    cudaEvent_t start = 0;
    cudaEvent_t stop = 0;

    if (size == 0 || size > kMaxElements) {
        fprintf(stderr, "addWithCuda: size must be between 1 and %u\n", kMaxElements);
        return cudaErrorInvalidValue;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaEventCreate(&start);
    if (cudaStatus == cudaSuccess)
        cudaStatus = cudaEventCreate(&stop);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaEventCreate failed!");
        goto Error;
    }

    cudaStatus = cudaEventRecord(start, 0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaEventRecord failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    cudaStatus = cudaEventRecord(stop, 0);
    if (cudaStatus == cudaSuccess)
        cudaStatus = cudaEventSynchronize(stop);
    if (cudaStatus == cudaSuccess)
        cudaStatus = cudaEventElapsedTime(&gpuTime, start, stop);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "GPU timing failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    printf("GPU compute time %.4f miliseconds\n", gpuTime);
    _getch();

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    cudaStatus = cudaEventRecord(start, 0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaEventRecord failed!");
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaEventRecord(stop, 0);
    if (cudaStatus == cudaSuccess)
        cudaStatus = cudaEventSynchronize(stop);
    if (cudaStatus == cudaSuccess)
        cudaStatus = cudaEventElapsedTime(&gpuTime, start, stop);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "GPU timing failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    // This interval covers only the device-to-host copy, not CPU work.
    printf("GPU copy-back time: %.4f milliseconds\n", gpuTime);

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    // Destroy only the events that were actually created.
    if (start)
        cudaEventDestroy(start);
    if (stop)
        cudaEventDestroy(stop);

    return cudaStatus;
}

Додаток Б Лістинг програми, що виводить інформацію про обчислювальні характеристики відеокарти

#include <stdio.h>
#include <cuda_runtime_api.h>

// Prints the properties of every CUDA device reported by the runtime, then
// waits for Enter. Returns 0 on success, 1 if the device count cannot be read.
int main()
{
    int deviceCount = 0;
    cudaDeviceProp deviceProp;

    // How many CUDA devices are installed on this PC.
    cudaError_t status = cudaGetDeviceCount(&deviceCount);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(status));
        getchar();
        return 1;
    }
    printf("Device count: %d\n\n", deviceCount);

    for (int i = 0; i < deviceCount; i++) {
        // Fetch the device information.
        status = cudaGetDeviceProperties(&deviceProp, i);
        if (status != cudaSuccess) {
            fprintf(stderr, "cudaGetDeviceProperties failed for device %d: %s\n",
                    i, cudaGetErrorString(status));
            continue;
        }

        // Print the device information. The size_t fields are cast to
        // unsigned long long and printed with %llu; %d would truncate them.
        printf("Device name: %s\n", deviceProp.name);
        printf("Total global memory: %llu\n", (unsigned long long)deviceProp.totalGlobalMem);
        printf("Shared memory per block: %llu\n", (unsigned long long)deviceProp.sharedMemPerBlock);
        printf("Registers per block: %d\n", deviceProp.regsPerBlock);
        printf("Warp size: %d\n", deviceProp.warpSize);
        printf("Memory pitch: %llu\n", (unsigned long long)deviceProp.memPitch);
        printf("Max threads per block: %d\n", deviceProp.maxThreadsPerBlock);
        printf("Max threads dimensions: x = %d, y = %d, z = %d\n",
               deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);
        printf("Max grid size: x = %d, y = %d, z = %d\n",
               deviceProp.maxGridSize[0],
               deviceProp.maxGridSize[1],
               deviceProp.maxGridSize[2]);
        printf("Clock rate: %d\n", deviceProp.clockRate);
        printf("Total constant memory: %llu\n", (unsigned long long)deviceProp.totalConstMem);
        printf("Compute capability: %d.%d\n", deviceProp.major, deviceProp.minor);
        printf("Texture alignment: %llu\n", (unsigned long long)deviceProp.textureAlignment);
        printf("Device overlap: %d\n", deviceProp.deviceOverlap);
        printf("Multiprocessor count: %d\n", deviceProp.multiProcessorCount);
        printf("Kernel execution timeout enabled: %s\n",
               deviceProp.kernelExecTimeoutEnabled ? "true" : "false");
    }

    getchar();
    return 0;
}

Додаток В Лістинг програми з розгалуженнями

// Branching variant of the kernel: computes c[i] = a[i] + b[i] + 2 for
// i = threadIdx.x, with the same work repeated in three branches selected by
// tx % 3. The three bodies are intentionally identical; only the branch
// condition differs.
// NOTE(review): aSize is not defined in this listing; it must be a
// compile-time constant provided by the surrounding program and at least as
// large as the block size -- confirm.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    __shared__ float as[aSize];
    __shared__ float bs[aSize];
    __shared__ float cs[aSize];

    int tx = threadIdx.x;

    if (tx % 3 == 0) {
        as[tx] = a[tx];
        bs[tx] = b[tx];
        cs[tx] = as[tx] + bs[tx];
        cs[tx] = cs[tx] + 2;
        c[tx] = cs[tx];
    }
    if (tx % 3 == 1) {
        as[tx] = a[tx];
        bs[tx] = b[tx];
        cs[tx] = as[tx] + bs[tx];
        cs[tx] = cs[tx] + 2;
        c[tx] = cs[tx];
    }
    if (tx % 3 == 2) {
        as[tx] = a[tx];
        bs[tx] = b[tx];
        cs[tx] = as[tx] + bs[tx];
        cs[tx] = cs[tx] + 2;
        c[tx] = cs[tx];
    }
}

Додаток З Програмна реалізація методу

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cstdlib>
#include <iostream>
#include <iomanip>

using namespace std;

// Combines two coded values: returns a*b, except that the products 4, 9 and 6
// are mapped to 2, 3 and 0 respectively.
// main() encodes 'x' as 1, '0' as 2 and '1' as 3.
int intersection(int a, int b)
{
    const int product = a * b;
    switch (product) {
    case 4:  return 2;
    case 9:  return 3;
    case 6:  return 0;
    default: return product;
    }
}

/*__global__ void MyKernel(int **results, int ncubes, int var)
{
    int cubes_number = pow(var,ncubes);
    __shared__ int buffer[100][100];
    int tx=threadIdx.x;
    if (tx<=cubes_number)
    {
        for(int indx=0; indx<var; indx++)
        {
            buffer[tx][indx]=results[tx][indx];
        }
    }
}*/

// Reads a coverage (rows = cubes, columns = variables, symbols 'x'/'0'/'1'),
// copies the encoded matrix to the GPU and back, builds a per-cube coverage
// table, intersects the cubes into `results`, and prints whether the first
// column of `results` sums to zero.
// Returns 0 normally, 1 when the entered dimensions do not fit the fixed-size
// tables.
int main()
{
    const int width = 100;
    const int height = 100;
    const int maxCubes = 10;  // first dimension of the coverage table

    int *devPtr = 0;
    // static: zero-initialised and kept off the stack (the tables total
    // roughly half a megabyte).
    static int a[height][width];
    static char array[height][width];
    int col = 0, row = 0;

    cout << "Enter number of cubes in coverage: ";
    cin >> row;
    cout << "Enter number of variables in cube: ";
    cin >> col;

    // Reject dimensions that would index outside the fixed-size tables.
    if (!cin || row < 1 || row > maxCubes || col < 1 || col > width) {
        cerr << "Invalid input: expected 1.." << maxCubes << " cubes and 1.."
             << width << " variables" << endl;
        system("pause");
        return 1;
    }

    // res_row = col^row, computed in integers and bounded by the number of
    // rows available in `results`.
    int res_row = 1;
    for (int i = 0; i < row; ++i) {
        res_row *= col;
        if (res_row > height) {
            cerr << "Invalid input: variables^cubes must not exceed " << height << endl;
            system("pause");
            return 1;
        }
    }

    for (int i = 0; i < row; ++i) {
        for (int j = 0; j < col; ++j) {
            cout << "Enter A[" << i+1 << "][" << j+1 << "]: ";
            cin >> array[i][j];
        }
    }

    cout << endl << "Coverage: " << endl;
    for (int i = 0; i < row; ++i) {
        for (int j = 0; j < col; ++j) {
            cout << setw(3) << array[i][j];
        }
        cout << endl;
    }

    // load and display input array; unrecognised symbols stay 0
    cout << "a array: "<< endl;
    for (int i = 0; i < row; i++) {
        for (int j = 0; j < col; j++) {
            if (array[i][j] == 'x') a[i][j] = 1;
            if (array[i][j] == '0') a[i][j] = 2;
            if (array[i][j] == '1') a[i][j] = 3;
            cout << a[i][j] << " ";
        }
        cout << endl;
    }
    cout << endl;

    // Allocating Device memory for 2D array using pitch
    size_t host_orig_pitch = width * sizeof(int);  // host original array pitch in bytes
    size_t pitch = 0;                              // pitch for the device array
    cudaError_t cudaStatus = cudaMallocPitch((void**)&devPtr, &pitch, width * sizeof(int), height);

    /*cout << "host_orig_pitch: " << host_orig_pitch << endl;
    cout << "sizeof(float): " << sizeof(float)<< endl;
    cout << "width: " << width << endl;
    cout << "height: " << height << endl;
    cout << "pitch: " << pitch << endl;
    cout << endl;*/

    if (cudaStatus == cudaSuccess)
        cudaStatus = cudaMemcpy2D(devPtr, pitch, a, host_orig_pitch, width * sizeof(int), height, cudaMemcpyHostToDevice);

    static int b[height][width];
    // load b and display array
    /* cout << "b array: "<< endl;
    for (int i = 0 ; i < row; i ++)
    {
        for (int j = 0 ; j < col; j ++)
        {
            b[i][j] = 0;
            cout << b[i][j] << " ";
        }
        cout << endl;
    }
    cout<< endl;*/

    //MyKernel<<<1, width>>>(devPtr, row, col);
    //cudaThreadSynchronize();

    // cudaMemcpy2D(dst, dPitch, src, sPitch, width, height, typeOfCopy)
    if (cudaStatus == cudaSuccess)
        cudaStatus = cudaMemcpy2D(b, host_orig_pitch, devPtr, pitch, width * sizeof(int), height, cudaMemcpyDeviceToHost);
    // b should be filled in with the values of array a.

    // The computation below does not read b, so a failed GPU round-trip is
    // reported but does not stop the program.
    if (cudaStatus != cudaSuccess)
        cerr << "CUDA round-trip of the input matrix failed: " << cudaGetErrorString(cudaStatus) << endl;
    cudaFree(devPtr);
    devPtr = 0;

    // cout << "returned array" << endl;
    /* for(int i = 0 ; i < row ; i++){
        for (int j = 0 ; j < col ; j++){
            cout<< b[i][j] << " " ;
        }
        cout<<endl;
    }
    cout<<endl;*/

    static int coverage[maxCubes][100][100];
    for (int i = 0; i < maxCubes; i++)
        for (int j = 0; j < 100; j++)
            for (int k = 0; k < 100; k++)
                coverage[i][j][k] = 1;

    for (int i = 0; i < row; i++) {
        for (int j = 0; j < col; j++) {
            coverage[i][j][j] = a[i][j];
            // A value of 1 (input symbol 'x') zeroes the whole row j.
            if (a[i][j] == 1) {
                for (int n = 0; n < col; n++) {
                    coverage[i][j][n] = 0;
                }
            }
        }
    }

    cout << "starting coverage" << endl;
    for (int i = 0; i < row; i++) {
        for (int j = 0; j < col; j++) {
            for (int k = 0; k < col; k++) {
                cout << coverage[i][j][k] << " ";
            }
            cout << endl;
        }
        cout << endl;
    }

    // cout << "starting results" << endl;
    static int results[100][100];
    for (int i = 0; i < res_row; i++) {
        for (int j = 0; j < col; j++) {
            results[i][j] = 0;
        }
    }

    // Seed the results with the table of the first cube.
    for (int i = 0; i < col; i++) {
        for (int j = 0; j < col; j++) {
            results[i][j] = coverage[0][i][j];
            // cout<< results[i][j]<< " " ;
        }
        // cout<<endl;
    }
    system("pause");

    // Intersect every further cube into the results.
    for (int ccube = 1; ccube < row; ccube++) {
        for (int cube = 0; cube < col; cube++) {
            for (int rcube = 0; rcube < res_row; rcube++) {
                for (int coll = 0; coll < col; coll++) {
                    results[rcube][coll] = intersection(coverage[ccube][cube][coll], results[rcube][coll]);
                    if (results[rcube][coll] == 1) {
                        // Clear the row and continue with the next result row.
                        for (int n = 0; n < col; n++)
                            results[rcube][n] = 0;
                        rcube++;
                        // Do not step past the last valid result row.
                        if (rcube >= res_row)
                            break;
                    }
                }
            }
        }
    }

    /* cout << "result array: "<< endl;
    for (int i=0; i<res_row; i++)
    {
        for (int j=0; j<col; j++)
        {
            cout<< results[i][j] << " " ;
        }
        cout<<endl;
    }
    system("pause");*/

    int sum = 0;
    for (int i = 0; i < res_row; i++) {
        sum += results[i][0];
    }

    if (sum == 0) cout << endl << "Conclusion is right" << endl;
    if (sum != 0) cout << endl << "Conclusion is wrong" << endl;

    system("pause");
    return 0;
}

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size) { int *dev_a = 0; int *dev_b = 0; int *dev_c = 0; cudaError_t cudaStatus; // Choose which GPU to run on, change this on a multi-GPU system. cudaStatus = cudaSetDevice(0); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaSetDevice failed! 