CUDA Performance - Always return different values -
this code:
using namespace std; #include <iostream> #include <stdio.h> #include <stdlib.h> const int n = 8000; void fillarray(int *data, int count) { (int = 0; < count; i++) data[i] = rand() % 100; } __global__ void add(int* a, int *b, int *c) { int tid = threadidx.x + blockidx.x * blockdim.x; if (tid < n) { c[tid] = a[tid] + b[tid]; } } __global__ void subtract(int* a, int *b, int *c) { int tid = threadidx.x + blockidx.x * blockdim.x; if (tid < n) { c[tid] = a[tid] - b[tid]; } } __global__ void multiply(int* a, int *b, int *c) { int tid = threadidx.x + blockidx.x * blockdim.x; if (tid < n) { c[tid] = a[tid] * b[tid]; } } __global__ void divide(int* a, int *b, int *c) { int tid = threadidx.x + blockidx.x * blockdim.x; if (tid < n) { c[tid] = a[tid] / b[tid]; } } __global__ void modu(int* a, int *b, int *c) { int tid = threadidx.x + blockidx.x * blockdim.x; if (tid < n) { c[tid] = a[tid] % b[tid]; } } __global__ void neg(int *data, int *c) { int tid = threadidx.x + blockidx.x * blockdim.x; if (tid < n) { c[tid] = -data[tid]; } } float duration(int *deva, int *devb, int *devc, int blockspergrid, int threadsperblock) { cudaevent_t start, stop; float elapsedtime; cudaeventcreate(&start); cudaeventcreate(&stop); cudaeventrecord(start, 0); int harrayc[n]; add<<<blockspergrid, threadsperblock>>>(deva, devb,devc); cudamemcpy(harrayc,devc,n*sizeof(int),cudamemcpydevicetohost); subtract<<<blockspergrid, threadsperblock>>>(deva, devb,devc); cudamemcpy(harrayc,devc,n*sizeof(int),cudamemcpydevicetohost); multiply<<<blockspergrid, threadsperblock>>>(deva, devb,devc); cudamemcpy(harrayc,devc,n*sizeof(int),cudamemcpydevicetohost); divide<<<blockspergrid, threadsperblock>>>(deva, devb,devc); cudamemcpy(harrayc,devc,n*sizeof(int),cudamemcpydevicetohost); modu<<<blockspergrid, threadsperblock>>>(deva, devb,devc); cudamemcpy(harrayc,devc,n*sizeof(int),cudamemcpydevicetohost); neg<<<blockspergrid, threadsperblock>>>(deva,devc); cudamemcpy(harrayc,devc,n*sizeof(int),cudamemcpydevicetohost); neg<<<blockspergrid, threadsperblock>>>(devb,devc); cudamemcpy(harrayc,devc,n*sizeof(int),cudamemcpydevicetohost); cudaeventrecord(stop, 0); cudaeventsynchronize(stop); cudaeventelapsedtime(&elapsedtime, start, stop); cudaeventdestroy(start); cudaeventdestroy(stop); return elapsedtime; } int main(void) { int *a, *b; = new int[n]; b = new int [n]; float dur = 0; int *deva, *devb,*devc; cudamalloc((void**) &deva, n * sizeof(int)); cudamalloc((void**) &devb, n * sizeof(int)); cudamalloc((void**) &devc, n * sizeof(int)); fillarray(a, n); fillarray(b, n); cudamemcpy(deva, a, n * sizeof(int), cudamemcpyhosttodevice); cudamemcpy(devb, b, n * sizeof(int), cudamemcpyhosttodevice); dur = duration(deva, devb, devc,n, 1); cout << "global memory version:\n"; cout << "process completed in " << dur; cout << " data set of " << n << " integers."; cudafree(deva); cudafree(devb); delete [] a; delete [] b; return 0; }
what want know total miliseconds in duration function. miliseconds return in different values. 10 ms 0.78652 30 miliseconds.why? wrong code?
this may caused loading/unloading of nvidia drivers. think of initialization step gpu.
you can either set gpu persistence mode:
nvidia-smi -pm 1
or run dummy kernel before timing gpu code trigger loading of drivers:
__global__ void dummy() { // kernel nothing, "warm-up" } // before cudaeventrecord etc. dummy<<<blockspergrid, threadsperblock>>>();
or maybe use cudathreadsynchronize()
before timing kernels.
Comments
Post a Comment