This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <iostream> | |
| #include <cuda_runtime_api.h> | |
| #include <stdint.h> | |
| #include <cub/cub.cuh> | |
| template <uint32_t block_size> | |
| __global__ void grid_stride_reduce(uint32_t *array, uint32_t *tmp_array, uint32_t length){ | |
| uint32_t thread_index = threadIdx.x; | |
| uint32_t global_index = blockIdx.x * (block_size * 2) + thread_index; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <iostream> | |
| #include <cuda_runtime_api.h> | |
| #include <stdint.h> | |
| #define FULL_MASK 0xFFFFFFFF /* all 32 bits set; presumably the full-warp participant mask for *_sync warp intrinsics -- use sites not visible here, confirm */ | |
| #define WARP_SIZE 32 /* number of threads (lanes) in one warp */ | |
| __device__ __forceinline__ uint32_t lane_id(uint32_t tid){ | |
| // https://stackoverflow.com/q/44337309 |