Додаток А
Лістинг програми для порівняння швидкодії CPU та GPU
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <conio.h>
#include <Windows.h>
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
// Element-wise demo kernel: c[i] = (a[i] / b[i]) * 2, staged through shared memory.
// Contract: launched as a single block with one thread per element
// (<<<1, size>>>, size <= 1000); threadIdx.x is used directly as the index.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    // Shared-memory staging buffers, sized for the fixed 1000-element launch.
    // Use int (not float) so the arithmetic matches the integer CPU reference
    // loop in main(): the original float buffers computed 8/3*2 = 5 on the
    // GPU versus the CPU's integer 8/3*2 = 4.
    __shared__ int as[1000];
    __shared__ int bs[1000];
    __shared__ int cs[1000];
    int tx = threadIdx.x;
    as[tx] = a[tx];
    bs[tx] = b[tx];
    __syncthreads();
    cs[tx] = as[tx] / bs[tx];
    __syncthreads();
    cs[tx] = cs[tx] * 2;
    __syncthreads();
    c[tx] = cs[tx];
}
// Host driver: times an integer divide-and-double pass over 1000 elements on
// the CPU, then runs the same computation on the GPU via addWithCuda().
int main()
{
    const int arraySize = 1000;
    int a[arraySize];
    int b[arraySize];
    int c[arraySize];
    int cc[arraySize];
    int j;
    for (j = 0; j < arraySize; j++)
    {
        a[j] = 8; b[j] = 3;
    }
    // GetTickCount() returns an unsigned DWORD; keep the timestamp unsigned so
    // the elapsed-time subtraction cannot truncate or go negative (the
    // original stored it in int and printed it with %i).
    DWORD start = GetTickCount();
    for (j = 0; j < arraySize; j++)
    {
        cc[j] = a[j] / b[j];
        cc[j] = cc[j] * 2;
    }
    printf("CPU compute time: %lu\n", (unsigned long)(GetTickCount() - start));
    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }
    /*
    for (j = 0; j < arraySize; j++)
        printf("%3d", c[j]);*/
    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    // _getch is the ISO-conformant conio.h name; plain getch is the deprecated
    // alias (addWithCuda below already uses _getch).
    _getch();
    return 0;
}
// Helper function for using CUDA to add vectors in parallel.
// Copies a and b to the device, launches addKernel with one thread per
// element (single block), times the kernel and the device->host result copy
// with CUDA events, and copies the result back into c.
// Returns the first CUDA error encountered, cudaSuccess on a clean run.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;
    float gpuTime = 0.0f;
    cudaEvent_t start, stop;
    // Create the timing events before any "goto Error" site so the cleanup
    // path always destroys valid handles; the original created them only
    // after the mallocs/memcpys, leaving them uninitialized on early failure.
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }
    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    // Time the kernel with events. cudaEventSynchronize(stop) blocks until
    // the (asynchronous) launch has completed, so the elapsed time covers
    // actual kernel execution.
    cudaEventRecord(start, 0);
    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&gpuTime, start, stop);
    printf("GPU compute time %.4f milliseconds\n", gpuTime);
    _getch();
    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }
    // Time the device-to-host copy of the result.
    cudaEventRecord(start, 0);
    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&gpuTime, start, stop);
    // The original mislabeled this line "CPU compute time": the interval
    // measured here is the device-to-host transfer of the result vector.
    printf("GPU->host copy time: %.4f milliseconds\n", gpuTime);
Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return cudaStatus;
}
Додаток Б
Лістинг програми, що виводить інформацію про обчислювальні характеристики
відеокарти
#include <stdio.h>
#include <cuda_runtime_api.h>
// Enumerates the CUDA devices on this machine and prints each device's
// compute characteristics (memory sizes, limits, compute capability).
int main()
{
    int deviceCount;
    cudaDeviceProp deviceProp;
    // How many CUDA devices are installed in this PC.
    cudaGetDeviceCount(&deviceCount);
    printf("Device count: %d\n\n", deviceCount);
    for (int i = 0; i < deviceCount; i++)
    {
        // Query the properties of device i.
        cudaGetDeviceProperties(&deviceProp, i);
        // Print the device information. The memory-size fields of
        // cudaDeviceProp are size_t, so they must be printed with %zu;
        // the original's %d truncates/misprints them on 64-bit builds.
        printf("Device name: %s\n", deviceProp.name);
        printf("Total global memory: %zu\n", deviceProp.totalGlobalMem);
        printf("Shared memory per block: %zu\n", deviceProp.sharedMemPerBlock);
        printf("Registers per block: %d\n", deviceProp.regsPerBlock);
        printf("Warp size: %d\n", deviceProp.warpSize);
        printf("Memory pitch: %zu\n", deviceProp.memPitch);
        printf("Max threads per block: %d\n", deviceProp.maxThreadsPerBlock);
        printf("Max threads dimensions: x = %d, y = %d, z = %d\n",
               deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);
        printf("Max grid size: x = %d, y = %d, z = %d\n",
               deviceProp.maxGridSize[0],
               deviceProp.maxGridSize[1],
               deviceProp.maxGridSize[2]);
        printf("Clock rate: %d\n", deviceProp.clockRate);
        printf("Total constant memory: %zu\n", deviceProp.totalConstMem);
        printf("Compute capability: %d.%d\n", deviceProp.major, deviceProp.minor);
        printf("Texture alignment: %zu\n", deviceProp.textureAlignment);
        printf("Device overlap: %d\n", deviceProp.deviceOverlap);
        printf("Multiprocessor count: %d\n", deviceProp.multiProcessorCount);
        printf("Kernel execution timeout enabled: %s\n",
               deviceProp.kernelExecTimeoutEnabled ? "true" : "false");
    }
    getchar();
    return 0;
}
Додаток В
Лістинг програми з розгалуженнями
// Branch-divergence demonstration kernel: every thread computes the same
// c[tx] = (a[tx] + b[tx]) + 2, but the identical work is split across three
// if-branches selected by tx % 3. Within a warp the predicate differs from
// lane to lane, so the warp executes all three masked paths in turn — the
// divergence cost this listing is meant to measure. Functionally it is
// equivalent to one unconditional body.
// NOTE(review): aSize is not defined in this listing — presumably a
// compile-time constant >= blockDim.x declared elsewhere; confirm before reuse.
__global__ void addKernel(int *c, const int *a, const int *b)
{
// Shared staging buffers; each thread only touches its own index tx, so no
// __syncthreads() is required (no cross-thread sharing occurs).
__shared__ float as[aSize];
__shared__ float bs[aSize];
__shared__ float cs[aSize];
int tx=threadIdx.x;
// Branch 1 of 3: lanes with tx % 3 == 0.
if(tx%3==0)
{
as[tx]=a[tx];
bs[tx]=b[tx];
cs[tx] = as[tx] + bs[tx];
cs[tx]=cs[tx]+2;
c[tx]=cs[tx];
}
// Branch 2 of 3: lanes with tx % 3 == 1 (same computation, deliberately).
if(tx%3==1)
{
as[tx]=a[tx];
bs[tx]=b[tx];
cs[tx] = as[tx] + bs[tx];
cs[tx]=cs[tx]+2;
c[tx]=cs[tx];
}
// Branch 3 of 3: lanes with tx % 3 == 2 (same computation, deliberately).
if(tx%3==2)
{
as[tx]=a[tx];
bs[tx]=b[tx];
cs[tx] = as[tx] + bs[tx];
cs[tx]=cs[tx]+2;
c[tx]=cs[tx];
}
}
Додаток З
Програмна реалізація методу
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <iomanip>
using namespace std;
// Intersection of two cube coordinates encoded as 1 = 'x', 2 = '0', 3 = '1'.
// The product of the two codes uniquely identifies the pair:
//   4 (0*0) -> '0' (code 2), 9 (1*1) -> '1' (code 3),
//   6 (0*1) -> empty intersection (0);
// any other product involves an 'x' (code 1), which simply keeps the other
// operand's code, so the product itself is the answer.
int intersection(int a, int b)
{
    int code = a * b;
    switch (code)
    {
    case 4:  return 2;    // '0' with '0' stays '0'
    case 9:  return 3;    // '1' with '1' stays '1'
    case 6:  return 0;    // '0' with '1' is empty
    default: return code; // 'x' absorbs the other operand
    }
}
/*__global__ void MyKernel(int **results, int ncubes, int var)
{
int cubes_number = pow(var,ncubes);
__shared__ int buffer[100][100];
int tx=threadIdx.x;
if (tx<=cubes_number)
{
for(int indx=0; indx<var; indx++)
{
buffer[tx][indx]=results[tx][indx];
}
}
}*/
// Interactive driver for the cube-intersection method: reads a cube coverage
// from stdin ('x'/'0'/'1' per position), encodes it as integers
// (x -> 1, 0 -> 2, 1 -> 3), round-trips the encoded matrix through pitched
// device memory, expands the coverage, intersects the result rows with each
// remaining cube via intersection(), and reports whether the conclusion holds
// (every surviving result row annihilated to 0).
int main()
{
    const int width = 100;
    const int height = 100;
    float* devPtr;
    int a[height][width];
    char array[height][width];
    int col = 0, row = 0;
    cout << "Enter number of cubes in coverage: ";
    cin >> row;
    cout << "Enter number of variables in cube: ";
    cin >> col;
    // Number of result rows is col^row. pow() returns double; round explicitly
    // rather than truncating through the implicit conversion.
    int res_row = (int)(pow((double)col, (double)row) + 0.5);
    // Guard the fixed-size buffers used below (coverage[10][100][100],
    // results[100][100]); the original performed no check and could overflow
    // them for larger inputs.
    if (row <= 0 || col <= 0 || row > 10 || col > width || res_row > 100)
    {
        cout << "Unsupported problem size" << endl;
        return 1;
    }
    for (int i = 0; i < row; ++i)
    {
        for (int j = 0; j < col; ++j)
        {
            cout << "Enter A[" << i+1 << "][" << j+1 << "]: ";
            cin >> array[i][j];
        }
    }
    cout << endl << "Coverage: " << endl;
    for (int i = 0; i < row; ++i) {
        for (int j = 0; j < col; ++j) {
            cout << setw(3) << array[i][j];
        }
        cout << endl;
    }
    // Encode and display the input array: 'x' -> 1, '0' -> 2, '1' -> 3.
    cout << "a array: " << endl;
    for (int i = 0; i < row; i++)
    {
        for (int j = 0; j < col; j++)
        {
            if (array[i][j] == 'x')
                a[i][j] = 1;
            if (array[i][j] == '0')
                a[i][j] = 2;
            if (array[i][j] == '1')
                a[i][j] = 3;
            cout << a[i][j] << " ";
        }
        cout << endl;
    }
    cout << endl;
    // Allocate device memory for the 2D array using a pitched allocation.
    size_t host_orig_pitch = width * sizeof(float); // host array pitch in bytes
    size_t pitch;                                   // pitch of the device array
    cudaMallocPitch(&devPtr, &pitch, width * sizeof(float), height);
    /*cout << "host_orig_pitch: " << host_orig_pitch << endl;
    cout << "sizeof(float): " << sizeof(float) << endl;
    cout << "width: " << width << endl;
    cout << "height: " << height << endl;
    cout << "pitch: " << pitch << endl;
    cout << endl;*/
    cudaMemcpy2D(devPtr, pitch, a, host_orig_pitch, width * sizeof(float), height,
        cudaMemcpyHostToDevice);
    int b[height][width];
    // Load and display b (disabled debug output).
    /*
    cout << "b array: " << endl;
    for (int i = 0; i < row; i++)
    {
        for (int j = 0; j < col; j++)
        {
            b[i][j] = 0;
            cout << b[i][j] << " ";
        }
        cout << endl;
    }
    cout << endl;*/
    //MyKernel<<<1, width>>>(devPtr, row, col);
    //cudaThreadSynchronize();
    //cudaMemcpy2d(dst, dPitch, src, sPitch, width, height, typeOfCopy)
    cudaMemcpy2D(b, host_orig_pitch, devPtr, pitch, width * sizeof(float), height,
        cudaMemcpyDeviceToHost);
    // b should now be filled in with the values of array a.
    cout << "returned array" << endl;
    /*
    for (int i = 0; i < row; i++) {
        for (int j = 0; j < col; j++) {
            cout << b[i][j] << " ";
        }
        cout << endl;
    }
    cout << endl;*/
    // coverage[i] holds the expansion of cube i: one col x col block per cube,
    // initialized to 1 ('x') everywhere.
    int coverage[10][100][100];
    for (int i = 0; i < 10; i++)
        for (int j = 0; j < 100; j++)
            for (int k = 0; k < 100; k++)
                coverage[i][j][k] = 1;
    for (int i = 0; i < row; i++)
    {
        for (int j = 0; j < col; j++)
        {
            coverage[i][j][j] = a[i][j];
            // An 'x' coordinate (code 1) annihilates its row of the expansion.
            if (a[i][j] == 1)
            {
                for (int n = 0; n < col; n++)
                {
                    coverage[i][j][n] = 0;
                }
            }
        }
    }
    cout << "starting coverage" << endl;
    for (int i = 0; i < row; i++)
    {
        for (int j = 0; j < col; j++)
        {
            for (int k = 0; k < col; k++)
            {
                cout << coverage[i][j][k] << " ";
            }
            cout << endl;
        }
        cout << endl;
    }
    // Seed the result rows from the first cube's expansion.
    cout << "starting results" << endl;
    int results[100][100];
    for (int i = 0; i < res_row; i++)
    {
        for (int j = 0; j < col; j++)
        {
            results[i][j] = 0;
        }
    }
    for (int i = 0; i < col; i++)
    {
        for (int j = 0; j < col; j++)
        {
            results[i][j] = coverage[0][i][j];
            cout << results[i][j] << " ";
        }
        cout << endl;
    }
    system("pause");
    // Intersect every result row with every row of each remaining cube.
    for (int ccube = 1; ccube < row; ccube++)
    {
        for (int cube = 0; cube < col; cube++)
        {
            for (int rcube = 0; rcube < res_row; rcube++)
            {
                for (int coll = 0; coll < col; coll++)
                {
                    // Fixed: the original referenced the undeclared identifier
                    // "ccube1" here; the loop variable is ccube.
                    results[rcube][coll] = intersection(coverage[ccube][cube][coll], results[rcube][coll]);
                    // A coordinate reduced to 'x' (code 1) annihilates the row.
                    if (results[rcube][coll] == 1)
                    {
                        for (int n = 0; n < col; n++)
                            results[rcube][n] = 0;
                        rcube++;
                    }
                }
            }
        }
    }
    /*
    cout << "result array: " << endl;
    for (int i = 0; i < res_row; i++)
    {
        for (int j = 0; j < col; j++)
        {
            cout << results[i][j] << " ";
        }
        cout << endl;
    }
    system("pause");*/
    // The conclusion holds iff every result row was annihilated (sum of the
    // leading coordinates is 0).
    int sum = 0;
    for (int i = 0; i < res_row; i++)
    {
        sum += results[i][0];
    }
    if (sum == 0)
        cout << endl << "Conclusion is right" << endl;
    if (sum != 0)
        cout << endl << "Conclusion is wrong" << endl;
    // Release the pitched device allocation (leaked in the original).
    cudaFree(devPtr);
    system("pause");
    return 0;
}
// Helper function for using CUDA to add vectors in parallel (variant without
// event timing). Copies a and b to the device, launches addKernel with one
// thread per element (single block), and copies the result back into c.
// Returns the first CUDA error encountered, cudaSuccess on a clean run.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;
    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }
    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);
    // Check for any errors launching the kernel.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch. (The string literal below
    // was split across two lines in the original listing — invalid C; joined.)
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }
    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    return cudaStatus;
}