CUDA Performance - Always return different values -


I have this code:

// Times a batch of element-wise integer kernels (add, subtract, multiply,
// divide, modulo, negate) over two n-element arrays and prints the elapsed
// time measured with CUDA events.
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <vector>

using namespace std;

// Evaluates a CUDA runtime call and aborts with file/line and the runtime's
// error string if it did not return cudaSuccess.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Number of elements in every host and device array used below.
const int n = 8000;

// Fills data[0..count) with pseudo-random values in the range [0, 99].
void fillarray(int *data, int count) {
    for (int i = 0; i < count; i++)
        data[i] = rand() % 100;
}

// c[i] = a[i] + b[i] for every i < n. One thread per element, 1D launch.
__global__ void add(int *a, int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) {
        c[tid] = a[tid] + b[tid];
    }
}

// c[i] = a[i] - b[i] for every i < n. One thread per element, 1D launch.
__global__ void subtract(int *a, int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) {
        c[tid] = a[tid] - b[tid];
    }
}

// c[i] = a[i] * b[i] for every i < n. One thread per element, 1D launch.
__global__ void multiply(int *a, int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) {
        c[tid] = a[tid] * b[tid];
    }
}

// c[i] = a[i] / b[i] for every i < n; writes 0 where b[i] == 0.
// fillarray() can produce zeros (rand() % 100), so the divisor is checked
// instead of dividing by zero.
__global__ void divide(int *a, int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) {
        int divisor = b[tid];
        c[tid] = (divisor != 0) ? a[tid] / divisor : 0;
    }
}

// c[i] = a[i] % b[i] for every i < n; writes 0 where b[i] == 0
// (same zero-divisor guard as divide()).
__global__ void modu(int *a, int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) {
        int divisor = b[tid];
        c[tid] = (divisor != 0) ? a[tid] % divisor : 0;
    }
}

// c[i] = -data[i] for every i < n. One thread per element, 1D launch.
__global__ void neg(int *data, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) {
        c[tid] = -data[tid];
    }
}

// Launches every kernel once with the given configuration and copies the
// n-element result of each launch from devc into hostc. Each launch is
// followed by cudaGetLastError() so a bad launch configuration is reported
// instead of being silently ignored.
static void runkernels(int *deva, int *devb, int *devc,
                       int blockspergrid, int threadsperblock, int *hostc) {
    const size_t bytes = n * sizeof(int);

    add<<<blockspergrid, threadsperblock>>>(deva, devb, devc);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(hostc, devc, bytes, cudaMemcpyDeviceToHost));

    subtract<<<blockspergrid, threadsperblock>>>(deva, devb, devc);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(hostc, devc, bytes, cudaMemcpyDeviceToHost));

    multiply<<<blockspergrid, threadsperblock>>>(deva, devb, devc);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(hostc, devc, bytes, cudaMemcpyDeviceToHost));

    divide<<<blockspergrid, threadsperblock>>>(deva, devb, devc);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(hostc, devc, bytes, cudaMemcpyDeviceToHost));

    modu<<<blockspergrid, threadsperblock>>>(deva, devb, devc);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(hostc, devc, bytes, cudaMemcpyDeviceToHost));

    neg<<<blockspergrid, threadsperblock>>>(deva, devc);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(hostc, devc, bytes, cudaMemcpyDeviceToHost));

    neg<<<blockspergrid, threadsperblock>>>(devb, devc);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(hostc, devc, bytes, cudaMemcpyDeviceToHost));
}

// Returns the time in milliseconds, as reported by cudaEventElapsedTime,
// taken by one pass of all kernels plus their device-to-host copies.
//
// deva, devb      device input arrays of n ints
// devc            device output array of n ints (overwritten by each kernel)
// blockspergrid   grid size used for every launch
// threadsperblock block size used for every launch
//
// An untimed warm-up pass runs first so that one-time start-up cost paid on
// the first launches is not part of the measurement.
float duration(int *deva, int *devb, int *devc, int blockspergrid, int threadsperblock) {
    // Host buffer for the copied-back results; heap-backed rather than an
    // n-element array on the stack.
    vector<int> harrayc(n);

    // Warm-up pass (not timed).
    runkernels(deva, devb, devc, blockspergrid, threadsperblock, harrayc.data());
    CUDA_CHECK(cudaDeviceSynchronize());

    cudaEvent_t start, stop;
    float elapsedtime = 0.0f;

    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaEventRecord(start, 0));
    runkernels(deva, devb, devc, blockspergrid, threadsperblock, harrayc.data());
    CUDA_CHECK(cudaEventRecord(stop, 0));

    // The stop event must have completed before the elapsed time is read.
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&elapsedtime, start, stop));

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return elapsedtime;
}

// Allocates and fills the inputs, times the kernels and prints the result.
int main(void) {
    vector<int> a(n);
    vector<int> b(n);

    float dur = 0;

    int *deva, *devb, *devc;

    CUDA_CHECK(cudaMalloc((void**) &deva, n * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**) &devb, n * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**) &devc, n * sizeof(int)));

    fillarray(a.data(), n);
    fillarray(b.data(), n);

    CUDA_CHECK(cudaMemcpy(deva, a.data(), n * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(devb, b.data(), n * sizeof(int), cudaMemcpyHostToDevice));

    // Several threads per block with a ceil-divided grid instead of n blocks
    // of a single thread; the kernels' tid < n guard covers the tail.
    const int threadsperblock = 256;
    const int blockspergrid = (n + threadsperblock - 1) / threadsperblock;

    dur = duration(deva, devb, devc, blockspergrid, threadsperblock);

    cout << "global memory version:\n";
    cout << "process completed in " << dur;
    cout << " ms for a data set of " << n << " integers.\n";

    CUDA_CHECK(cudaFree(deva));
    CUDA_CHECK(cudaFree(devb));
    CUDA_CHECK(cudaFree(devc));  // was never released in the original

    return 0;
}

What I want to know is the total number of milliseconds spent in the duration function. However, the value returned is different on every run: sometimes 10 ms, sometimes 0.78652 ms, sometimes 30 ms. Why? Is something wrong with my code?

This may be caused by the loading/unloading of the NVIDIA drivers. Think of it as an initialization step for the GPU.

You can either set the GPU to persistence mode:

# Turn persistence mode on (-pm 1). NOTE(review): typically needs root and is Linux-only — confirm against the nvidia-smi documentation for your driver.
nvidia-smi -pm 1 

Or run a dummy kernel before timing your GPU code, to trigger the loading of the drivers:

// Empty kernel used only as a "warm-up" launch before the timed section.
__global__ void dummy() {
    // Intentionally does nothing.
}

// Launch it before cudaEventRecord etc.:
dummy<<<blockspergrid, threadsperblock>>>();

Or maybe call cudaThreadSynchronize() (deprecated; use cudaDeviceSynchronize() in current CUDA versions) before timing the kernels.


Comments

Popular posts from this blog

php - cannot display multiple markers in google maps v3 from traceroute result -

c# - DetailsView in ASP.Net - How to add another column on the side/add a control in each row? -

javascript - firefox memory leak -