Recursion - Convert from a recursive to an iterative function in CUDA C++
I'm working on a genetic program and am porting some of the heavy lifting to CUDA (it previously used OpenMP).
It's not running fast, and I'm getting an error related to the recursion:
Stack size for entry function '_Z9kscoreonePdPiS_S_P9cppgpnode' cannot be statically determined
I've added the lump of logic that runs on CUDA; I believe it is enough to show how it works. I'd be happy to hear about any other optimizations I could add, but I'd like to take the recursion out if that will speed things up.
Examples of how this could be achieved are welcome.
// Binary arithmetic primitives dispatched by action(). fmultiply and fdivide
// are referenced below but their definitions are not part of this excerpt.
__device__ double fadd(double a, double b) { return a + b; }
__device__ double fsubtract(double a, double b) { return a - b; }

// Applies the function selected by `fno` to its inputs.
//   fno 0..3 -> add, subtract, multiply, divide of (aa, bb)
//   any other fno -> 0.0
// cc and dd are accepted so that every function shares one call shape; none of
// the functions shown here reads them.
__device__ double action(int fno, double aa, double bb, double cc, double dd)
{
    switch (fno) {
        case 0:  return fadd(aa, bb);
        case 1:  return fsubtract(aa, bb);
        case 2:  return fmultiply(aa, bb);
        case 3:  return fdivide(aa, bb);
        default: return 0.0;
    }
}

// Recursively evaluates the expression tree rooted at index `node` of
// dev_m_items.
//   - A terminal node yields var_set[tno].
//   - A function node evaluates its first `finputs` children (at most 4),
//     pads the remaining inputs with 0.0, and applies action(fno, ...).
// NOTE(review): because this function calls itself, the compiler cannot bound
// the stack depth at compile time; very deep trees may need a larger device
// stack (cudaLimitStackSize) or an iterative evaluation -- confirm against the
// tree sizes actually used.
__device__ double solve(int node, cppgpnode *dev_m_items, double *var_set)
{
    if (dev_m_items[node].is_terminal) {
        return var_set[dev_m_items[node].tno];
    } else {
        double values[4];
        for (unsigned int x = 0; x < 4; x++) {
            if (x < dev_m_items[node].finputs) {
                values[x] = solve(dev_m_items[node].children[x], dev_m_items, var_set);
            } else {
                values[x] = 0.0;  // unused input slot
            }
        }
        return action(dev_m_items[node].fno, values[0], values[1], values[2], values[3]);
    }
}

// Scores one program per block: block index `pid` selects root_nodes[pid].
// For each of the `fitness_cases` cases, the tree is evaluated with
// &cases[case_no] as its variable set and |result - targets[case_no]| is added
// to scores[pid]. Entries whose root is -1 are skipped.
// Only blockIdx.x is used for indexing, so every thread in a block would
// perform the same accumulation.
// NOTE(review): presumably launched with one thread per block, and scores[]
// is presumably zeroed by the caller -- confirm both.
__global__ void kscoreone(double *scores, int *root_nodes, double *targets, double *cases, cppgpnode *dev_m_items)
{
    int pid = blockIdx.x;

    // only do work if the node needs to be calculated
    if (root_nodes[pid] != -1) {
        for (unsigned int case_no = 0; case_no < fitness_cases; case_no++) {
            double result = solve(root_nodes[pid], dev_m_items, &cases[case_no]);
            double target = targets[case_no];
            // fabs is the double-precision absolute value; it avoids any
            // chance of resolving to an integer abs overload.
            scores[pid] += fabs(result - target);
        }
    }
}

I'm having trouble making the stack examples work for the large tree structure that this solves.
I've solved this issue now. It was not quite a case of placing the recursive arguments onto a stack, but it was a similar system.
As part of the creation of the node tree, I append each node to a vector. I then solve the problem in reverse using Reverse Polish notation (http://en.wikipedia.org/wiki/Reverse_Polish_notation), which fits nicely because each node contains either a value or a function to perform.
It's ~20% faster than the recursive version, so I'm pleased!
Comments
Post a Comment