Skip to content

Instantly share code, notes, and snippets.

@weedge
Forked from mre/bitonic_sort.cu
Created October 25, 2023 02:57
Show Gist options
  • Save weedge/a524555100fb365d06ffba38a07b8950 to your computer and use it in GitHub Desktop.
Save weedge/a524555100fb365d06ffba38a07b8950 to your computer and use it in GitHub Desktop.

Revisions

  1. @mre mre revised this gist May 6, 2016. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions bitonic_sort.cu
    Original file line number Diff line number Diff line change
    @@ -3,6 +3,7 @@
    * Compile with
    * nvcc -arch=sm_11 bitonic_sort.cu
    * Based on http://www.tools-of-computing.com/tc/CS/Sorts/bitonic_sort.htm
    * License: BSD 3
    */

    #include <stdlib.h>
  2. @mre mre created this gist Nov 24, 2011.
    113 changes: 113 additions & 0 deletions bitonic_sort.cu
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,113 @@
    /*
    * Parallel bitonic sort using CUDA.
    * Compile with
    * nvcc -arch=sm_11 bitonic_sort.cu
    * Based on http://www.tools-of-computing.com/tc/CS/Sorts/bitonic_sort.htm
    */

    #include <stdlib.h>
    #include <stdio.h>
    #include <time.h>

    /* Every thread gets exactly one value in the unsorted array. */
    #define THREADS 512 // 2^9
    #define BLOCKS 32768 // 2^15
    #define NUM_VALS THREADS*BLOCKS

    void print_elapsed(clock_t start, clock_t stop)
    {
    double elapsed = ((double) (stop - start)) / CLOCKS_PER_SEC;
    printf("Elapsed time: %.3fs\n", elapsed);
    }

    float random_float()
    {
    return (float)rand()/(float)RAND_MAX;
    }

    void array_print(float *arr, int length)
    {
    int i;
    for (i = 0; i < length; ++i) {
    printf("%1.3f ", arr[i]);
    }
    printf("\n");
    }

    void array_fill(float *arr, int length)
    {
    srand(time(NULL));
    int i;
    for (i = 0; i < length; ++i) {
    arr[i] = random_float();
    }
    }

    __global__ void bitonic_sort_step(float *dev_values, int j, int k)
    {
    unsigned int i, ixj; /* Sorting partners: i and ixj */
    i = threadIdx.x + blockDim.x * blockIdx.x;
    ixj = i^j;

    /* The threads with the lowest ids sort the array. */
    if ((ixj)>i) {
    if ((i&k)==0) {
    /* Sort ascending */
    if (dev_values[i]>dev_values[ixj]) {
    /* exchange(i,ixj); */
    float temp = dev_values[i];
    dev_values[i] = dev_values[ixj];
    dev_values[ixj] = temp;
    }
    }
    if ((i&k)!=0) {
    /* Sort descending */
    if (dev_values[i]<dev_values[ixj]) {
    /* exchange(i,ixj); */
    float temp = dev_values[i];
    dev_values[i] = dev_values[ixj];
    dev_values[ixj] = temp;
    }
    }
    }
    }

    /**
    * Inplace bitonic sort using CUDA.
    */
    void bitonic_sort(float *values)
    {
    float *dev_values;
    size_t size = NUM_VALS * sizeof(float);

    cudaMalloc((void**) &dev_values, size);
    cudaMemcpy(dev_values, values, size, cudaMemcpyHostToDevice);

    dim3 blocks(BLOCKS,1); /* Number of blocks */
    dim3 threads(THREADS,1); /* Number of threads */

    int j, k;
    /* Major step */
    for (k = 2; k <= NUM_VALS; k <<= 1) {
    /* Minor step */
    for (j=k>>1; j>0; j=j>>1) {
    bitonic_sort_step<<<blocks, threads>>>(dev_values, j, k);
    }
    }
    cudaMemcpy(values, dev_values, size, cudaMemcpyDeviceToHost);
    cudaFree(dev_values);
    }

    int main(void)
    {
    clock_t start, stop;

    float *values = (float*) malloc( NUM_VALS * sizeof(float));
    array_fill(values, NUM_VALS);

    start = clock();
    bitonic_sort(values); /* Inplace */
    stop = clock();

    print_elapsed(start, stop);
    }