I have the following nested for loop and I would like to port it to CUDA to be run on a GPU:
// Visit every beamlet of every beam. `current` is the flat (global) index of
// the beamlet being processed; `ptr` is the flat index of the first beamlet
// of beam i, i.e. the number of beamlets in all earlier beams.
int current = 0;
int ptr = 0;
for (int i = 0; i < Nbeans; i++) {
  for (int j = 0; j < NbeamletsPerbeam[i]; j++) {
    current = j + ptr;
    for (int k = 0; k < Nmax; k++) {
      ......
    }
  }
  // Advance the offset once per beam, after all of its beamlets are done.
  // Doing this inside the j loop would add NbeamletsPerbeam[i] for every
  // beamlet and make `current` skip past the end of the flat index range.
  ptr += NbeamletsPerbeam[i];
}
I would be very happy if anybody has an idea of how to do it or how it can be done. We are talking about Nbeams=5, with NbeamletsPerbeam around 200 each.
This is what I currently have but I am not sure it is right...
 // Kernel-body fragment; the loop bodies and closing braces are not shown.
 // Beams are distributed over blocks: each block starts at beam blockIdx.x
 // and advances by gridDim.x.
 for (int i= blockIdx.x; i < d_params->Nbeams; i += gridDim.x){
            // Beamlets of beam i are distributed over the block's y threads:
            // start at threadIdx.y, advance by blockDim.y.
            for (int j= threadIdx.y; j < d_beamletsPerBeam[i]; j+= blockDim.y){
                 // NOTE(review): `k` is not declared anywhere in this fragment.
                 // Presumably it is meant to be the flat index of beam i's first
                 // beamlet (sum of d_beamletsPerBeam over earlier beams) - confirm.
                 currentBeamlet= j+k;
                 // Voxels are distributed over the block's x threads:
                 // start at threadIdx.x, advance by blockDim.x.
                 for (int ivoxel= threadIdx.x; ivoxel < totalVoxels; ivoxel += blockDim.x){
I would suggest the following approach, although you might need to make some minor modifications to fit your code.
// Launch configuration: one block per beam (Nbeans blocks), each block with
// NoOfThreads threads along x.
dim3 blocks(NoOfThreads, 1);
dim3 grid(Nbeans, 1);
// The execution configuration is written with triple angle brackets:
// <<<grid, block>>>. No dynamic shared memory is requested.
kernel<<<grid, blocks>>>();
// Processes the beamlets of one beam per block.
//
// Launch layout: 1D grid with one block per beam (blockIdx.x selects the
// beam) and a 1D block; the block's threads share that beam's beamlets,
// thread t handling beamlets t, t + blockDim.x, t + 2*blockDim.x, ...
//
// NOTE(review): NbeamletsPerbeam and Nmax are not declared in this snippet;
// they must be readable from device code (e.g. device/constant memory).
__global__ void kernel()
{
   const int beam = blockIdx.x;
   const int nBeamlets = NbeamletsPerbeam[beam];

   // Flat index of this beam's first beamlet = number of beamlets in all
   // earlier beams. Every block recomputes it locally because blocks run
   // independently and cannot share a running counter.
   int ptr = 0;
   for (int b = 0; b < beam; b++) {
     ptr += NbeamletsPerbeam[b];
   }

   // The loop bound doubles as the range check, so any block size works.
   for (int j = threadIdx.x; j < nBeamlets; j += blockDim.x) {
     // Per-thread flat beamlet index.
     int current = j + ptr;
     for (int k = 0; k < Nmax; k++) {
        ......
     }
   }
}
This should do the trick and give you better parallelization.
 
         
                                         
                                         
                                         
                                        ![Interactive visualization of a graph in python [closed]](https://www.devze.com/res/2023/04-10/09/92d32fe8c0d22fb96bd6f6e8b7d1f457.gif) 
                                         
                                         
                                         
                                         加载中,请稍侯......
 加载中,请稍侯......
      
精彩评论