Created
November 4, 2025 11:39
-
-
Save troelsy/c7dc3ebab1a4d6d8d82159e32f012f58 to your computer and use it in GitHub Desktop.
Implementation based on https://gist.github.com/troelsy/fff6aac2226e080dcebf05531a11d44e, rewritten to use CUDA CUB.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdio>
#include <cstdlib>
#include <iostream>

#include <stdint.h>

#include <cuda_runtime_api.h>
#include <cub/cub.cuh>
// Two-pass sum reduction, pass kernel: each block computes one partial sum of
// `array[0..length)` and writes it to tmp_array[blockIdx.x].
//
// Launch as <<<grid, block_size>>> (no dynamic shared memory needed; CUB's
// TempStorage is statically allocated below). Each thread walks the input with
// a grid-stride loop unrolled by 2, then the block folds its per-thread
// partials with cub::BlockReduce.
template <uint32_t block_size>
__global__ void grid_stride_reduce(uint32_t *array, uint32_t *tmp_array, uint32_t length){
    uint32_t thread_index = threadIdx.x;
    uint32_t global_index = blockIdx.x * (block_size * 2) + thread_index;
    uint32_t grid_size = block_size * 2 * gridDim.x;

    uint32_t value = 0;
    while (global_index < length){
        value += array[global_index];
        // Guard the unrolled second read. Without this check the kernel reads
        // past the end of `array` whenever `length` is not a multiple of the
        // stride — in particular in the second pass, which launches a single
        // block over only `grid_size` partials (possibly fewer than
        // block_size elements).
        if (global_index + block_size < length){
            value += array[global_index + block_size];
        }
        global_index += grid_size;
    }

    // Block-wide reduction via CUB; only thread 0 receives the valid total.
    typedef cub::BlockReduce<uint32_t, block_size> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    value = BlockReduce(temp_storage).Sum(value);

    if (thread_index == 0){
        tmp_array[blockIdx.x] = value;
    }
}
// Final-warp reduction for reduce6: folds sdata[0..63] down into sdata[0].
// Must be called by all 32 lanes of the first warp (i.e. under `if (tid < 32)`
// with blockDim.x >= 32), since __syncwarp() below uses the full-warp mask.
//
// The original relied on `volatile` plus implicit warp-lockstep execution,
// which is no longer guaranteed on Volta+ (independent thread scheduling).
// Each step is therefore split into a read phase and a write phase separated
// by __syncwarp(), per NVIDIA's documented pattern, so no lane reads a slot
// while another lane is writing it.  (requires SM30+ / CUDA 9 for __syncwarp)
template <unsigned int blockSize>
__device__ void warpReduce(volatile int *sdata, unsigned int tid)
{
    int v = sdata[tid];
    if (blockSize >= 64) { v += sdata[tid + 32]; __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if (blockSize >= 32) { v += sdata[tid + 16]; __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if (blockSize >= 16) { v += sdata[tid +  8]; __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if (blockSize >=  8) { v += sdata[tid +  4]; __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if (blockSize >=  4) { v += sdata[tid +  2]; __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if (blockSize >=  2) { v += sdata[tid +  1]; __syncwarp(); sdata[tid] = v; __syncwarp(); }
}
// Classic hand-rolled two-pass sum reduction (Harris "reduce6" variant):
// each block writes one partial sum of g_idata[0..n) to g_odata[blockIdx.x].
//
// Launch as <<<grid, blockSize, blockSize * sizeof(int)>>> — the kernel needs
// blockSize ints of dynamic shared memory. blockSize is a compile-time
// power of two (<= 512 for the unrolled tree below).
template <unsigned int blockSize>
__global__ void reduce6(int *g_idata, int *g_odata, unsigned int n)
{
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockSize * 2) + tid;
    unsigned int gridSize = blockSize * 2 * gridDim.x;

    // Grid-stride accumulation, unrolled by 2.
    sdata[tid] = 0;
    while (i < n)
    {
        sdata[tid] += g_idata[i];
        // Guard the unrolled second read: the original read
        // g_idata[i + blockSize] unconditionally, which runs past the end of
        // the input whenever n is not a multiple of the stride (e.g. the
        // second pass, which reduces only gridDim.x partials in one block).
        if (i + blockSize < n)
        {
            sdata[tid] += g_idata[i + blockSize];
        }
        i += gridSize;
    }
    __syncthreads();

    // Shared-memory tree reduction; each step halves the active thread count.
    // The dead branches are removed at compile time since blockSize is a
    // template constant.
    if (blockSize >= 512)
    {
        if (tid < 256)
        {
            sdata[tid] += sdata[tid + 256];
        }
        __syncthreads();
    }
    if (blockSize >= 256)
    {
        if (tid < 128)
        {
            sdata[tid] += sdata[tid + 128];
        }
        __syncthreads();
    }
    if (blockSize >= 128)
    {
        if (tid < 64)
        {
            sdata[tid] += sdata[tid + 64];
        }
        __syncthreads();
    }

    // Last 64 -> 1 elements are folded within the first warp.
    if (tid < 32)
        warpReduce<blockSize>(sdata, tid);
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}
// Aborts with file/line context if a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            std::abort();                                                     \
        }                                                                     \
    } while (0)

// Fills a device array with 96M ones, then sums it twice — once with the
// CUB-based kernel and once with the hand-rolled reduce6 — using a two-pass
// reduction (per-block partials, then one block folds the partials).
// Both printed results should equal length (100663296).
int32_t main(){
    uint32_t length = 96 * 1024 * 1024;
    uint32_t *host_array = (uint32_t *)malloc(length * sizeof(uint32_t));
    if (!host_array) {
        fprintf(stderr, "Failed to allocate host_array\n");
        return 1;
    }
    // uint32_t index avoids the signed/unsigned comparison of the original.
    for (uint32_t i = 0; i < length; ++i){
        host_array[i] = 1;
    }

    uint32_t *array;
    CUDA_CHECK(cudaMalloc(&array, length * sizeof(uint32_t)));
    CUDA_CHECK(cudaMemcpy(array, host_array, length * sizeof(uint32_t), cudaMemcpyHostToDevice));
    free(host_array);  // fixed leak: host copy is no longer needed after H2D copy

    // Size the grid from the SM count so the GPU stays saturated.
    int32_t devId = 0;
    int32_t n_sm;
    CUDA_CHECK(cudaDeviceGetAttribute(&n_sm, cudaDevAttrMultiProcessorCount, devId));
    uint32_t grid_size = 32 * n_sm;
    const uint32_t block_size = 256;

    uint32_t *tmp_array;
    CUDA_CHECK(cudaMalloc(&tmp_array, grid_size * sizeof(uint32_t)));

    // Pass 1: one partial per block. Pass 2: a single block folds the
    // grid_size partials (in place — it only writes tmp_array[0]).
    grid_stride_reduce<block_size><<<grid_size, block_size>>>(array, tmp_array, length);
    CUDA_CHECK(cudaGetLastError());
    grid_stride_reduce<block_size><<<1, block_size>>>(tmp_array, tmp_array, grid_size);
    CUDA_CHECK(cudaGetLastError());

    uint32_t result;
    // Blocking D2H copy also synchronizes with the kernels above.
    CUDA_CHECK(cudaMemcpy(&result, tmp_array, sizeof(uint32_t), cudaMemcpyDeviceToHost));
    std::cout << "Result: " << result << std::endl;

    // reduce6 needs blockDim.x ints of dynamic shared memory per block.
    // Fixed: `shared_memory` was used without ever being defined, which did
    // not compile.
    const size_t shared_memory = block_size * sizeof(int);
    reduce6<block_size><<<grid_size, block_size, shared_memory>>>((int *) array, (int *) tmp_array, length);
    CUDA_CHECK(cudaGetLastError());
    reduce6<block_size><<<1, block_size, shared_memory>>>((int *) tmp_array, (int *) tmp_array, grid_size);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpy(&result, tmp_array, sizeof(uint32_t), cudaMemcpyDeviceToHost));
    std::cout << "Result: " << result << std::endl;

    CUDA_CHECK(cudaFree(array));
    CUDA_CHECK(cudaFree(tmp_array));
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment