Skip to content
This repository was archived by the owner on Mar 20, 2023. It is now read-only.
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 21 additions & 14 deletions coreneuron/permute/cellorder.cu
Original file line number Diff line number Diff line change
Expand Up @@ -72,25 +72,32 @@ __global__ void solve_interleaved2_kernel(NrnThread* nt, InterleaveInfo* ii, int
int* rootbegin = ii->firstnode; // nwarp+1 of these
int* nodebegin = ii->lastnode; // nwarp+1 of these

int iwarp = icore / warpsize; // figure out the >> value
int ic = icore & (warpsize - 1); // figure out the & mask
int ncycle = ncycles[iwarp];
int* stride = strides + stridedispl[iwarp];
int root = rootbegin[iwarp];
int lastroot = rootbegin[iwarp + 1];
int firstnode = nodebegin[iwarp];
int lastnode = nodebegin[iwarp + 1];

triang_interleaved2_device(nt, ic, ncycle, stride, lastnode);
bksub_interleaved2_device(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
while (icore < ncore) {
int iwarp = icore / warpsize; // figure out the >> value
int ic = icore & (warpsize - 1); // figure out the & mask
int ncycle = ncycles[iwarp];
int* stride = strides + stridedispl[iwarp];
int root = rootbegin[iwarp];
int lastroot = rootbegin[iwarp + 1];
int firstnode = nodebegin[iwarp];
int lastnode = nodebegin[iwarp + 1];

triang_interleaved2_device(nt, ic, ncycle, stride, lastnode);
bksub_interleaved2_device(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);

icore += blockDim.x * gridDim.x;
}
}

void solve_interleaved2_launcher(NrnThread* nt, InterleaveInfo* info, int ncore, void* stream) {
auto cuda_stream = static_cast<cudaStream_t>(stream);

int threadsPerBlock = warpsize;
// TODO: Should blocksPerGrid be a fixed number and have a while block inside the kernel?
int blocksPerGrid = (ncore + threadsPerBlock - 1) / threadsPerBlock;
// the selection of these parameters has been done after running the channel-benchmark for typical production runs, i.e.
// 1 MPI task with 1440 cells & 6 MPI tasks with 8800 cells.
// The main idea is to have multiple warps per SM and sufficient blocks to fill the GPU.
// In our case, given that multiple threads share the available GPUs, we "guarantee" a sufficient occupancy of the GPUs.
int threadsPerBlock = 128;
int blocksPerGrid = 512;

solve_interleaved2_kernel<<<blocksPerGrid, threadsPerBlock, 0, cuda_stream>>>(nt, info, ncore);

Expand Down