Skip to content

Instantly share code, notes, and snippets.

@troelsy
Created November 4, 2025 11:39
Show Gist options
  • Select an option

  • Save troelsy/c7dc3ebab1a4d6d8d82159e32f012f58 to your computer and use it in GitHub Desktop.

Select an option

Save troelsy/c7dc3ebab1a4d6d8d82159e32f012f58 to your computer and use it in GitHub Desktop.
#include <cstdio>
#include <iostream>
#include <stdint.h>
#include <cuda_runtime_api.h>
#include <cub/cub.cuh>
template <uint32_t block_size>
// Grid-stride sum reduction: each thread accumulates a private partial sum,
// the block combines partials with cub::BlockReduce, and thread 0 writes one
// result per block to tmp_array[blockIdx.x].
//
// Launch contract: blockDim.x must equal the block_size template parameter,
// and tmp_array must hold at least gridDim.x elements.
__global__ void grid_stride_reduce(uint32_t *array, uint32_t *tmp_array, uint32_t length){
    uint32_t thread_index = threadIdx.x;
    // Each block consumes a tile of 2*block_size elements per grid step
    // (two loads per thread to halve the number of blocks needed).
    uint32_t global_index = blockIdx.x * (block_size * 2) + thread_index;
    uint32_t grid_size = block_size * 2 * gridDim.x;
    uint32_t value = 0;
    while (global_index < length){
        value += array[global_index];
        // Guard the second load: without it, any length that is not a multiple
        // of the 2*block_size tile reads past the end of the array (e.g. the
        // second pass over the gridDim.x per-block partials).
        if (global_index + block_size < length){
            value += array[global_index + block_size];
        }
        global_index += grid_size;
    }
    // Block-wide sum of the per-thread partials; CUB handles the shared-memory
    // tree and all required synchronization internally.
    typedef cub::BlockReduce<uint32_t, block_size> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    value = BlockReduce(temp_storage).Sum(value);
    if (thread_index == 0){
        tmp_array[blockIdx.x] = value;
    }
}
template <unsigned int blockSize>
// Reduces the final partials held in shared memory down to sdata[0].
// Must be called by threads with tid < 32 only, after a __syncthreads().
//
// Each step reads both operands, syncs the warp, then writes: on Volta+
// (independent thread scheduling) warp lanes are NOT implicitly in lockstep,
// so the classic volatile-only version races — e.g. lane 16 may overwrite
// sdata[16] before lane 0 has read it within the same step.
// NOTE(review): __syncwarp() uses the full-warp mask; this assumes all 32
// lanes of the warp reach it (blockSize >= 32) — confirm before instantiating
// with smaller block sizes.
__device__ void warpReduce(volatile int *sdata, unsigned int tid)
{
    int v;
    if (blockSize >= 64)
        // No intra-warp race at this offset: lanes >= 32 are inactive here,
        // so nobody writes sdata[32..63] while we read them.
        sdata[tid] += sdata[tid + 32];
    __syncwarp();
    if (blockSize >= 32) {
        v = sdata[tid] + sdata[tid + 16];
        __syncwarp();   // all reads complete before any lane writes
        sdata[tid] = v;
        __syncwarp();   // all writes visible before the next step reads
    }
    if (blockSize >= 16) {
        v = sdata[tid] + sdata[tid + 8];
        __syncwarp();
        sdata[tid] = v;
        __syncwarp();
    }
    if (blockSize >= 8) {
        v = sdata[tid] + sdata[tid + 4];
        __syncwarp();
        sdata[tid] = v;
        __syncwarp();
    }
    if (blockSize >= 4) {
        v = sdata[tid] + sdata[tid + 2];
        __syncwarp();
        sdata[tid] = v;
        __syncwarp();
    }
    if (blockSize >= 2) {
        v = sdata[tid] + sdata[tid + 1];
        __syncwarp();
        sdata[tid] = v;
        __syncwarp();
    }
}
template <unsigned int blockSize>
// Classic shared-memory tree reduction ("reduce6" from the NVIDIA reduction
// slides): grid-stride accumulation into shared memory, a compile-time
// unrolled tree, then warpReduce for the last 32 partials. Writes one partial
// sum per block to g_odata[blockIdx.x].
//
// Launch contract: blockDim.x must equal the blockSize template parameter
// (supported values up to 512 — there is no blockSize >= 1024 stage), and the
// kernel needs blockSize * sizeof(int) bytes of dynamic shared memory.
__global__ void reduce6(int *g_idata, int *g_odata, unsigned int n)
{
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    // Each block consumes a tile of 2*blockSize elements per grid step.
    unsigned int i = blockIdx.x * (blockSize * 2) + tid;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    sdata[tid] = 0;   // shared memory is uninitialized; zero before accumulating
    while (i < n)
    {
        sdata[tid] += g_idata[i];
        // Guard the second load: without it, any n that is not a multiple of
        // the 2*blockSize tile reads past the end of g_idata (e.g. the second
        // pass over the gridDim.x per-block partials).
        if (i + blockSize < n)
        {
            sdata[tid] += g_idata[i + blockSize];
        }
        i += gridSize;
    }
    __syncthreads();
    // Compile-time unrolled tree; each stage halves the number of live
    // partials. The barriers are unconditional, so all threads reach them.
    if (blockSize >= 512)
    {
        if (tid < 256)
        {
            sdata[tid] += sdata[tid + 256];
        }
        __syncthreads();
    }
    if (blockSize >= 256)
    {
        if (tid < 128)
        {
            sdata[tid] += sdata[tid + 128];
        }
        __syncthreads();
    }
    if (blockSize >= 128)
    {
        if (tid < 64)
        {
            sdata[tid] += sdata[tid + 64];
        }
        __syncthreads();
    }
    // Last 32 partials: handled at warp scope, no block barrier needed.
    if (tid < 32)
        warpReduce<blockSize>(sdata, tid);
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}
// Driver: sums 96 Mi ones two ways (CUB BlockReduce kernel, then the classic
// reduce6 kernel) and prints both results; each should equal `length`.
int32_t main(){
    const uint32_t length = 96 * 1024 * 1024;
    uint32_t *host_array = (uint32_t *)malloc(length * sizeof(uint32_t));
    if (!host_array) {
        fprintf(stderr, "Failed to allocate host_array\n");
        return 1;
    }
    // All ones, so the expected reduction result is exactly `length`.
    for (uint32_t i = 0; i < length; ++i){
        host_array[i] = 1;
    }
    uint32_t *array;
    cudaMalloc(&array, length * sizeof(uint32_t));
    cudaMemcpy(array, host_array, length * sizeof(uint32_t), cudaMemcpyHostToDevice);
    free(host_array);   // device copy made; host buffer is no longer needed

    int32_t devId = 0;
    int32_t n_sm;
    cudaDeviceGetAttribute(&n_sm, cudaDevAttrMultiProcessorCount, devId);
    // 32 blocks per SM keeps the device saturated for the first pass.
    uint32_t grid_size = 32 * n_sm;
    const uint32_t block_size = 256;
    uint32_t *tmp_array;   // one partial sum per block
    cudaMalloc(&tmp_array, grid_size * sizeof(uint32_t));

    // Pass 1: one partial per block. Pass 2: a single block folds the
    // partials in place (reads all of tmp_array before thread 0 writes [0]).
    grid_stride_reduce<block_size><<<grid_size, block_size>>>(array, tmp_array, length);
    grid_stride_reduce<block_size><<<1, block_size>>>(tmp_array, tmp_array, grid_size);
    uint32_t result;
    // Blocking copy: also synchronizes with the kernels above.
    cudaMemcpy(&result, tmp_array, sizeof(uint32_t), cudaMemcpyDeviceToHost);
    std::cout << "Result: " << result << std::endl;

    // reduce6 needs blockDim.x ints of dynamic shared memory per block.
    // (This size was previously an undeclared identifier `shared_memory`,
    // which failed to compile.)
    const size_t shared_memory = block_size * sizeof(int);
    reduce6<block_size><<<grid_size, block_size, shared_memory>>>((int *) array, (int *) tmp_array, length);
    reduce6<block_size><<<1, block_size, shared_memory>>>((int *) tmp_array, (int *) tmp_array, grid_size);
    cudaMemcpy(&result, tmp_array, sizeof(uint32_t), cudaMemcpyDeviceToHost);
    std::cout << "Result: " << result << std::endl;

    // Kernel launches do not return errors directly; surface any sticky
    // launch/execution error before exiting.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    cudaFree(array);
    cudaFree(tmp_array);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment