Profiling - CUDA performance measuring - Elapsed time returns zero
I wrote a few kernel functions and wonder how many milliseconds it takes to process these functions.
// Program from the question: launches seven trivial element-wise kernels and
// times them with CUDA events. The listing is reproduced as asked, i.e. it
// still contains the problems that make the measured time come out as zero;
// they are marked with NOTE(review) comments rather than silently fixed.
#include <iostream>
#include <stdio.h>
#include <stdlib.h>

using namespace std;

// Number of elements in each input array.
#define N 8000

// Fills data[0 .. count-1] with pseudo-random values in the range [0, 99].
void fillArray(int *data, int count) {
    for (int i = 0; i < count; i++)
        data[i] = rand() % 100;
}

// NOTE(review): in the five kernels below the result is written only to a
// local variable that is never read, so no observable work is performed.

// Computes a[tid] + b[tid] for one element; the sum is discarded.
__global__ void add(int *a, int *b) {
    int add = 0;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        add = a[tid] + b[tid];
    }
}

// Computes a[tid] - b[tid] for one element; the difference is discarded.
__global__ void subtract(int *a, int *b) {
    int subtract = 0;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        subtract = a[tid] - b[tid];
    }
}

// Computes a[tid] * b[tid] for one element; the product is discarded.
__global__ void multiply(int *a, int *b) {
    int multiply = 0;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        multiply = a[tid] * b[tid];
    }
}

// Computes a[tid] / b[tid] for one element; the quotient is discarded.
__global__ void divide(int *a, int *b) {
    int divide = 0;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        divide = a[tid] / b[tid];
    }
}

// Computes a[tid] % b[tid] for one element; the remainder is discarded.
__global__ void modu(int *a, int *b) {
    int modulus = 0;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        modulus = a[tid] % b[tid];
    }
}

// Negates data[tid] in place. This is the only kernel that writes to memory.
__global__ void neg(int *data) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        data[tid] = -data[tid];
    }
}

// Launches all kernels between two CUDA events recorded on stream 0 and
// returns the time between the events in milliseconds.
//
// devA, devB       - pointers handed to the kernels (expected to be device memory)
// blocksPerGrid    - grid size used for every launch
// threadsPerBlock  - block size used for every launch
//
// NOTE(review): no CUDA return code is checked and elapsedTime is not
// initialized, so if cudaEventElapsedTime fails the returned value is
// whatever happened to be in the variable.
float duration(int *devA, int *devB, int blocksPerGrid, int threadsPerBlock) {
    cudaEvent_t start, stop;
    float elapsedTime;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    add<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    subtract<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    multiply<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    divide<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    modu<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    neg<<<blocksPerGrid, threadsPerBlock>>>(devA);
    neg<<<blocksPerGrid, threadsPerBlock>>>(devB);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return elapsedTime;
}

int main(void) {
    int a[N], b[N];
    float dur = 0;

    int *devA, *devB;

    cudaMalloc((void**) &devA, N * sizeof(int));
    cudaMalloc((void**) &devB, N * sizeof(int));

    fillArray(a, N);
    fillArray(b, N);

    cudaMemcpy(devA, a, N * sizeof(int), cudaMemcpyHostToDevice);
    // NOTE(review): this copies b into devA a second time; devB is never filled.
    cudaMemcpy(devA, b, N * sizeof(int), cudaMemcpyHostToDevice);

    // NOTE(review): the host arrays a and b are passed here instead of devA
    // and devB, so the neg kernels write through host pointers on the device.
    // That is presumably why removing the neg launches yields a non-zero time.
    // The launch configuration is N blocks of a single thread each.
    dur = duration(a, b, N, 1);

    cout << "global memory version:\n";
    cout << "process completed in " << dur;
    cout << " data set of " << N << " integers.";

    // NOTE(review): devA and devB are never released with cudaFree.
    return 0;
}
The elapsed milliseconds come back as zero. Why? What am I missing here? If I remove the neg functions from the duration function, it returns 0.15687 ms. I think that is a small number for processing these functions. What's wrong with the program?
After the edit, I did this:
// Edited version of the program: every kernel now writes its result to a
// third device array so the work is observable, and each result is copied
// back to the host inside the timed region.
#include <iostream>
#include <stdio.h>
#include <stdlib.h>

using namespace std;

// Number of elements in each array.
const int N = 8000;

// Aborts with file/line and the CUDA error string when a runtime call fails.
// Without this, an early failure silently poisons every later call and the
// reported timing is meaningless.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Fills data[0 .. count-1] with pseudo-random values in the range [0, 99].
void fillArray(int *data, int count) {
    for (int i = 0; i < count; i++)
        data[i] = rand() % 100;
}

// c[tid] = a[tid] + b[tid] for every tid < N; one element per thread.
__global__ void add(const int *a, const int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] + b[tid];
    }
}

// c[tid] = a[tid] - b[tid] for every tid < N; one element per thread.
__global__ void subtract(const int *a, const int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] - b[tid];
    }
}

// c[tid] = a[tid] * b[tid] for every tid < N; one element per thread.
__global__ void multiply(const int *a, const int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] * b[tid];
    }
}

// c[tid] = a[tid] / b[tid] for every tid < N; one element per thread.
// fillArray can produce zeros, so a zero divisor yields 0 instead of an
// undefined result.
__global__ void divide(const int *a, const int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = (b[tid] != 0) ? a[tid] / b[tid] : 0;
    }
}

// c[tid] = a[tid] % b[tid] for every tid < N; one element per thread.
// A zero divisor yields 0 instead of an undefined result.
__global__ void modu(const int *a, const int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = (b[tid] != 0) ? a[tid] % b[tid] : 0;
    }
}

// c[tid] = -data[tid] for every tid < N; one element per thread.
__global__ void neg(const int *data, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = -data[tid];
    }
}

// Runs all seven kernels, copying each result back to the host, and returns
// the time in milliseconds between CUDA events recorded on stream 0 before
// the first launch and after the last copy. The returned time therefore
// includes the seven device-to-host transfers as well as the kernels.
//
// devA, devB       - device input arrays of N ints
// devC             - device output array of N ints, overwritten by each kernel
// blocksPerGrid    - grid size used for every launch
// threadsPerBlock  - block size used for every launch
float duration(int *devA, int *devB, int *devC, int blocksPerGrid, int threadsPerBlock) {
    cudaEvent_t start, stop;
    float elapsedTime = 0.0f;
    const size_t bytes = N * sizeof(int);

    // Host staging buffer; int to match the element type and byte count copied.
    int hostC[N];

    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start, 0));

    add<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    CUDA_CHECK(cudaGetLastError());  // launches report errors only through this
    CUDA_CHECK(cudaMemcpy(hostC, devC, bytes, cudaMemcpyDeviceToHost));

    subtract<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(hostC, devC, bytes, cudaMemcpyDeviceToHost));

    multiply<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(hostC, devC, bytes, cudaMemcpyDeviceToHost));

    divide<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(hostC, devC, bytes, cudaMemcpyDeviceToHost));

    modu<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(hostC, devC, bytes, cudaMemcpyDeviceToHost));

    neg<<<blocksPerGrid, threadsPerBlock>>>(devA, devC);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(hostC, devC, bytes, cudaMemcpyDeviceToHost));

    neg<<<blocksPerGrid, threadsPerBlock>>>(devB, devC);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(hostC, devC, bytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return elapsedTime;
}

int main(void) {
    int a[N], b[N];
    float dur = 0;

    int *devA, *devB, *devC;

    CUDA_CHECK(cudaMalloc((void**) &devA, N * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**) &devB, N * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**) &devC, N * sizeof(int)));

    fillArray(a, N);
    fillArray(b, N);

    // devC is output-only (every kernel overwrites all N elements), so it
    // needs no initial upload.
    CUDA_CHECK(cudaMemcpy(devA, a, N * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(devB, b, N * sizeof(int), cudaMemcpyHostToDevice));

    // Many threads per block instead of N blocks of one thread; the block
    // count is rounded up so that all N elements are covered.
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    dur = duration(devA, devB, devC, blocksPerGrid, threadsPerBlock);

    cout << "global memory version:\n";
    cout << "process completed in " << dur;
    cout << " data set of " << N << " integers.";

    CUDA_CHECK(cudaFree(devA));
    CUDA_CHECK(cudaFree(devB));
    CUDA_CHECK(cudaFree(devC));

    return 0;
}
Your kernels are not doing anything, since you only store the results in registers. When compiling, you get some warnings:
kernel.cu(13): warning: variable "add" was set but never used
Also, if you want to see better timings, use NVIDIA's profiler: either nvprof (CLI) or nvvp (GUI).
$ nvprof ./kernel
======== NVPROF is profiling kernel...
======== Command: kernel
global memory version:
process completed in 0 data set of 8000 integers.
======== Profiling result:
 Time(%)      Time   Calls       Avg       Min       Max  Name
  100.00   18.46us       2    9.23us    6.02us   12.45us  [CUDA memcpy HtoD]
    0.00       0ns       1       0ns       0ns       0ns  multiply(int*, int*)
    0.00       0ns       1       0ns       0ns       0ns  add(int*, int*)
    0.00       0ns       1       0ns       0ns       0ns  modu(int*, int*)
    0.00       0ns       2       0ns       0ns       0ns  neg(int*)
    0.00       0ns       1       0ns       0ns       0ns  subtract(int*, int*)
    0.00       0ns       1       0ns       0ns       0ns  divide(int*, int*)
You are using N blocks per grid, and 1 thread per block. You should consider reading the answer to this question.
Update
Concerning the vector addition (and the other simple operations) in itself, you should either study the vectorAdd sample of the CUDA SDK, or use Thrust. The first option will teach you how to use CUDA, and the second option will show you the kind of high-level operations you can do with Thrust. If I were you, I would do both.
Comments
Post a Comment