profiling - CUDA performance measuring - elapsed time returns zero


I wrote a few kernel functions and want to know how many milliseconds it takes to process these functions.

#include <iostream>
#include <stdio.h>
#include <stdlib.h>
using namespace std;

#define n 8000

void fillarray(int *data, int count)
{
    for (int i = 0; i < count; i++)
        data[i] = rand() % 100;
}

__global__ void add(int *a, int *b)
{
    int add = 0;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) {
        add = a[tid] + b[tid];
    }
}

__global__ void subtract(int *a, int *b)
{
    int subtract = 0;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) {
        subtract = a[tid] - b[tid];
    }
}

__global__ void multiply(int *a, int *b)
{
    int multiply = 0;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) {
        multiply = a[tid] * b[tid];
    }
}

__global__ void divide(int *a, int *b)
{
    int divide = 0;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) {
        divide = a[tid] / b[tid];
    }
}

__global__ void modu(int *a, int *b)
{
    int modulus = 0;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) {
        modulus = a[tid] % b[tid];
    }
}

__global__ void neg(int *data)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) {
        data[tid] = -data[tid];
    }
}

float duration(int *deva, int *devb, int blockspergrid, int threadsperblock)
{
    cudaEvent_t start, stop;
    float elapsedTime;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    add<<<blockspergrid, threadsperblock>>>(deva, devb);
    subtract<<<blockspergrid, threadsperblock>>>(deva, devb);
    multiply<<<blockspergrid, threadsperblock>>>(deva, devb);
    divide<<<blockspergrid, threadsperblock>>>(deva, devb);
    modu<<<blockspergrid, threadsperblock>>>(deva, devb);
    neg<<<blockspergrid, threadsperblock>>>(deva);
    neg<<<blockspergrid, threadsperblock>>>(devb);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return elapsedTime;
}

int main(void)
{
    int a[n], b[n];
    float dur = 0;

    int *deva, *devb;

    cudaMalloc((void**) &deva, n * sizeof(int));
    cudaMalloc((void**) &devb, n * sizeof(int));

    fillarray(a, n);
    fillarray(b, n);

    cudaMemcpy(deva, a, n * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(deva, b, n * sizeof(int), cudaMemcpyHostToDevice);

    dur = duration(a, b, n, 1);

    cout << "global memory version:\n";
    cout << "process completed in " << dur;
    cout << " data set of " << n << " integers.";

    return 0;
}

The elapsed time in milliseconds always returns zero. Why? What am I missing here? If I remove the neg functions from the duration function, it returns 0.15687 ms. I think that is too small a number for processing these functions. What's wrong with my program?

After the edit, I did this:

#include <iostream>
#include <stdio.h>
#include <stdlib.h>
using namespace std;

const int n = 8000;

void fillarray(int *data, int count)
{
    for (int i = 0; i < count; i++)
        data[i] = rand() % 100;
}

__global__ void add(int *a, int *b, int *c)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) {
        c[tid] = a[tid] + b[tid];
    }
}

__global__ void subtract(int *a, int *b, int *c)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) {
        c[tid] = a[tid] - b[tid];
    }
}

__global__ void multiply(int *a, int *b, int *c)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) {
        c[tid] = a[tid] * b[tid];
    }
}

__global__ void divide(int *a, int *b, int *c)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) {
        c[tid] = a[tid] / b[tid];
    }
}

__global__ void modu(int *a, int *b, int *c)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) {
        c[tid] = a[tid] % b[tid];
    }
}

__global__ void neg(int *data, int *c)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) {
        c[tid] = -data[tid];
    }
}

float duration(int *deva, int *devb, int *devc, int blockspergrid, int threadsperblock)
{
    cudaEvent_t start, stop;
    float elapsedTime;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    double harrayc[n];

    add<<<blockspergrid, threadsperblock>>>(deva, devb, devc);
    cudaMemcpy(harrayc, devc, n * sizeof(int), cudaMemcpyDeviceToHost);

    subtract<<<blockspergrid, threadsperblock>>>(deva, devb, devc);
    cudaMemcpy(harrayc, devc, n * sizeof(int), cudaMemcpyDeviceToHost);

    multiply<<<blockspergrid, threadsperblock>>>(deva, devb, devc);
    cudaMemcpy(harrayc, devc, n * sizeof(int), cudaMemcpyDeviceToHost);

    divide<<<blockspergrid, threadsperblock>>>(deva, devb, devc);
    cudaMemcpy(harrayc, devc, n * sizeof(int), cudaMemcpyDeviceToHost);

    modu<<<blockspergrid, threadsperblock>>>(deva, devb, devc);
    cudaMemcpy(harrayc, devc, n * sizeof(int), cudaMemcpyDeviceToHost);

    neg<<<blockspergrid, threadsperblock>>>(deva, devc);
    cudaMemcpy(harrayc, devc, n * sizeof(int), cudaMemcpyDeviceToHost);

    neg<<<blockspergrid, threadsperblock>>>(devb, devc);
    cudaMemcpy(harrayc, devc, n * sizeof(int), cudaMemcpyDeviceToHost);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return elapsedTime;
}

int main(void)
{
    int a[n], b[n], c[n];
    float dur = 0;

    int *deva, *devb, *devc;

    cudaMalloc((void**) &deva, n * sizeof(int));
    cudaMalloc((void**) &devb, n * sizeof(int));
    cudaMalloc((void**) &devc, n * sizeof(int));

    fillarray(a, n);
    fillarray(b, n);

    cudaMemcpy(deva, a, n * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(devb, b, n * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(devc, c, n * sizeof(int), cudaMemcpyHostToDevice);

    dur = duration(deva, devb, devc, n, 1);

    cout << "global memory version:\n";
    cout << "process completed in " << dur;
    cout << " data set of " << n << " integers.";

    cudaFree(deva);
    cudaFree(devb);
    return 0;
}

Your kernels are not actually doing anything, since they only store their results in registers. When compiling, you get warnings such as:

kernel.cu(13): warning: variable "add" was set but never used
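In other words, the compiler sees that the result is never observable and removes the kernel body entirely. A minimal sketch of the difference (the kernel names here are made up for illustration; n is your compile-time constant):

__global__ void add_dead(int *a, int *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) {
        int sum = a[tid] + b[tid];   // "set but never used": dead code, optimized away
    }
}

__global__ void add_live(int *a, int *b, int *c)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) {
        c[tid] = a[tid] + b[tid];    // written to global memory, so it cannot be removed
    }
}

That is essentially what your edited version does by writing the results into devc.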

Also, if you want to see better timings, use NVIDIA's profiler: either nvprof (command line) or nvvp (GUI).

$ nvprof ./kernel

======== NVPROF is profiling kernel...
======== Command: kernel
global memory version:
process completed in 0 data set of 8000 integers.
======== Profiling result:
 Time(%)     Time   Calls       Avg       Min       Max  Name
  100.00  18.46us       2    9.23us    6.02us   12.45us  [CUDA memcpy HtoD]
    0.00      0ns       1       0ns       0ns       0ns  multiply(int*, int*)
    0.00      0ns       1       0ns       0ns       0ns  add(int*, int*)
    0.00      0ns       1       0ns       0ns       0ns  modu(int*, int*)
    0.00      0ns       2       0ns       0ns       0ns  neg(int*)
    0.00      0ns       1       0ns       0ns       0ns  subtract(int*, int*)
    0.00      0ns       1       0ns       0ns       0ns  divide(int*, int*)
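If you prefer to stay with cudaEvent timing, the usual pattern is one pair of records per launch, so you get a separate number for each kernel. A rough sketch using the same calls as your duration function:

cudaEvent_t start, stop;
float ms = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
add<<<blockspergrid, threadsperblock>>>(deva, devb, devc);   // time just this launch
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);                                   // wait for the kernel to finish
cudaEventElapsedTime(&ms, start, stop);

cudaEventDestroy(start);
cudaEventDestroy(stop);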

You are also using n blocks per grid and 1 thread per block. You should consider reading the answer to this question.
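For example, a more common configuration (just a sketch, with an assumed block size of 256) would be:

const int threadsperblock = 256;
const int blockspergrid = (n + threadsperblock - 1) / threadsperblock;  // round up to cover all n elements

add<<<blockspergrid, threadsperblock>>>(deva, devb, devc);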

Update

Concerning vector addition (and other simple operations) in itself, you should either study the vectorAdd sample of the CUDA SDK, or use Thrust. The first option will teach you how to use CUDA, and the second option will show you the kind of high-level operations you can do with Thrust. If I were you, I would do both.
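As a rough sketch of what the Thrust version of such an element-wise operation looks like (illustrative only):

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>

int main()
{
    const int n = 8000;
    thrust::device_vector<int> a(n, 1), b(n, 2), c(n);

    // c = a + b, element-wise, on the GPU; Thrust chooses the launch configuration for you
    thrust::transform(a.begin(), a.end(), b.begin(), c.begin(), thrust::plus<int>());

    return 0;
}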

