Skip to content

Instantly share code, notes, and snippets.

@zchothia
Created July 9, 2012 21:13
Show Gist options
  • Select an option

  • Save zchothia/3078968 to your computer and use it in GitHub Desktop.

Select an option

Save zchothia/3078968 to your computer and use it in GitHub Desktop.

Revisions

  1. zchothia created this gist Jul 9, 2012.
    258 changes: 258 additions & 0 deletions avx_dispatch_example.c
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,258 @@
    // AVX CPU dispatching - based on Agner Fog's C++ vector class library:
    // http://www.agner.org/optimize/vectorclass.zip

    #include <stdio.h>
    #include <stdbool.h>

    //------------------------------------------------------------------------------
    //>> BEGIN <instrset.h>

    // Detect 64 bit mode
    #if (defined(_M_AMD64) || defined(_M_X64) || defined(__amd64) ) && ! defined(__x86_64__)
    # define __x86_64__ 1 // There are many different macros for this, decide on only one
    #endif

    // Find instruction set from compiler macros if INSTRSET not defined
    // Note: Not all compilers define these macros automatically
    #ifndef INSTRSET
    #if defined(__AVX2__)
    # define INSTRSET 8
    #elif defined(__AVX__)
    # define INSTRSET 7
    #elif defined(__SSE4_2__)
    # define INSTRSET 6
    #elif defined(__SSE4_1__)
    # define INSTRSET 5
    #elif defined(__SSSE3__)
    # define INSTRSET 4
    #elif defined(__SSE3__)
    # define INSTRSET 3
    #elif defined(__SSE2__) || defined(__x86_64__)
    # define INSTRSET 2
    #elif defined(__SSE__)
    # define INSTRSET 1
    #elif defined(_M_IX86_FP) // Defined in MS compiler. 1: SSE, 2: SSE2
    # define INSTRSET _M_IX86_FP
    #else
    # define INSTRSET 0
    #endif // instruction set defines
    #endif // INSTRSET

    // Include the appropriate header file for intrinsic functions
    #if INSTRSET > 7 // AVX2 and later
    # ifdef __GNUC__
    # include <x86intrin.h> // x86intrin.h includes header files for whatever instruction
    // sets are specified on the compiler command line, such as:
    // xopintrin.h, fma4intrin.h
    # else
    # include <immintrin.h> // MS version of immintrin.h covers AVX, AVX2 and FMA3
    # endif // __GNUC__
    #elif INSTRSET == 7
    # include <immintrin.h> // AVX
    #elif INSTRSET == 6
    # include <nmmintrin.h> // SSE4.2
    #elif INSTRSET == 5
    # include <smmintrin.h> // SSE4.1
    #elif INSTRSET == 4
    # include <tmmintrin.h> // SSSE3
    #elif INSTRSET == 3
    # include <pmmintrin.h> // SSE3
    #elif INSTRSET == 2
    # include <emmintrin.h> // SSE2
    #elif INSTRSET == 1
    # include <xmmintrin.h> // SSE
    #endif // INSTRSET

    // AMD instruction sets
    #ifdef __XOP__
    # ifdef __GNUC__
    # include <x86intrin.h> // AMD XOP (Gnu)
    # else
    # include <ammintrin.h> // AMD XOP (Microsoft)
    # endif // __GNUC__
    #elif defined (__SSE4A__) // AMD SSE4A
    # include <ammintrin.h>
    #endif // __XOP__


    // Define integer types with known size
    #if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER >= 1600)
    // Compilers supporting C99 or C++0x have stdint.h defining these integer types
    #include <stdint.h>
    #elif defined(_MSC_VER)
    // Older Microsoft compilers have their own definitions
    typedef signed __int8 int8_t;
    typedef unsigned __int8 uint8_t;
    typedef signed __int16 int16_t;
    typedef unsigned __int16 uint16_t;
    typedef signed __int32 int32_t;
    typedef unsigned __int32 uint32_t;
    typedef signed __int64 int64_t;
    typedef unsigned __int64 uint64_t;
    #ifndef _INTPTR_T_DEFINED
    #define _INTPTR_T_DEFINED
    #ifdef __x86_64__
    typedef int64_t intptr_t;
    #else
    typedef int32_t intptr_t;
    #endif
    #endif
    #else
    // This works with most compilers
    typedef signed char int8_t;
    typedef unsigned char uint8_t;
    typedef signed short int int16_t;
    typedef unsigned short int uint16_t;
    typedef signed int int32_t;
    typedef unsigned int uint32_t;
    typedef long long int64_t;
    typedef unsigned long long uint64_t;
    #ifdef __x86_64__
    typedef int64_t intptr_t;
    #else
    typedef int32_t intptr_t;
    #endif
    #endif

    #include <stdlib.h> // define abs(int)

    #ifdef _MSC_VER // Microsoft compiler or compatible Intel compiler
    #include <intrin.h> // define _BitScanReverse(int), __cpuid(int[4],int), _xgetbv(int)
    #endif // _MSC_VER

    // functions in instrset_detect.cpp
    int instrset_detect(void); // tells which instruction sets are supported
    bool hasFMA3(void); // true if FMA3 instructions supported
    bool hasFMA4(void); // true if FMA4 instructions supported
    bool hasXOP (void); // true if XOP instructions supported

    // GCC version
    #if defined(__GNUC__) && ! defined (GCC_VERSION)
    #define GCC_VERSION ((__GNUC__) * 10000 + (__GNUC_MINOR__) * 100 + (__GNUC_PATCHLEVEL__))
    #endif

    //<< END <instrset.h>
    //------------------------------------------------------------------------------
    //>> BEGIN <instrset_detect.cpp>

    // Define interface to cpuid instruction.
    // input: eax = functionnumber, ecx = 0
    // output: eax = output[0], ebx = output[1], ecx = output[2], edx = output[3]
    static inline void cpuid (int output[4], int functionnumber) {
    #if defined (_MSC_VER) || defined (__INTEL_COMPILER) // Microsoft or Intel compiler, intrin.h included

    __cpuidex(output, functionnumber, 0); // intrinsic function for CPUID

    #elif defined(__GNUC__) // use inline assembly, Gnu/AT&T syntax

    int a, b, c, d;
    __asm("cpuid" : "=a"(a),"=b"(b),"=c"(c),"=d"(d) : "a"(functionnumber),"c"(0) : );
    output[0] = a;
    output[1] = b;
    output[2] = c;
    output[3] = d;

    #else // unknown platform. try inline assembly with masm/intel syntax

    __asm {
    mov eax, functionnumber
    xor ecx, ecx
    cpuid;
    mov esi, output
    mov [esi], eax
    mov [esi+4], ebx
    mov [esi+8], ecx
    mov [esi+12], edx
    }

    #endif
    }

    // Define interface to xgetbv instruction
    static inline int64_t xgetbv (int ctr) {
    #if (defined (_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined (__INTEL_COMPILER) && __INTEL_COMPILER >= 1200) // Microsoft or Intel compiler supporting _xgetbv intrinsic

    return _xgetbv(ctr); // intrinsic function for XGETBV

    #elif defined(__GNUC__) // use inline assembly, Gnu/AT&T syntax

    uint32_t a, d;
    __asm("xgetbv" : "=a"(a),"=d"(d) : "c"(ctr) : );
    return a | (((uint64_t) d) << 32);

    #else // #elif defined (_WIN32) // other compiler. try inline assembly with masm/intel/MS syntax

    uint32_t a, d;
    __asm {
    mov ecx, ctr
    _emit 0x0f
    _emit 0x01
    _emit 0xd0 ; // xgetbv
    mov a, eax
    mov d, edx
    }
    return a | (uint64_t(d) << 32);

    #endif
    }


    /* find supported instruction set
    return value:
    0 = 80386 instruction set
    1 or above = SSE (XMM) supported by CPU (not testing for O.S. support)
    2 or above = SSE2
    3 or above = SSE3
    4 or above = Supplementary SSE3 (SSSE3)
    5 or above = SSE4.1
    6 or above = SSE4.2
    7 or above = AVX supported by CPU and operating system
    8 or above = AVX2
    */
    int instrset_detect(void) {

    static int iset = -1; // remember value for next call
    if (iset >= 0) {
    return iset; // called before
    }
    iset = 0; // default value
    int abcd[4] = {0,0,0,0}; // cpuid results
    cpuid(abcd, 0); // call cpuid function 0
    if (abcd[0] == 0) return iset; // no further cpuid function supported
    cpuid(abcd, 1); // call cpuid function 1 for feature flags
    if ((abcd[3] & (1 << 0)) == 0) return iset; // no floating point
    if ((abcd[3] & (1 << 23)) == 0) return iset; // no MMX
    if ((abcd[3] & (1 << 15)) == 0) return iset; // no conditional move
    if ((abcd[3] & (1 << 24)) == 0) return iset; // no FXSAVE
    if ((abcd[3] & (1 << 25)) == 0) return iset; // no SSE
    iset = 1; // 1: SSE supported
    if ((abcd[3] & (1 << 26)) == 0) return iset; // no SSE2
    iset = 2; // 2: SSE2 supported
    if ((abcd[2] & (1 << 0)) == 0) return iset; // no SSE3
    iset = 3; // 3: SSE3 supported
    if ((abcd[2] & (1 << 9)) == 0) return iset; // no SSSE3
    iset = 4; // 4: SSSE3 supported
    if ((abcd[2] & (1 << 19)) == 0) return iset; // no SSE4.1
    iset = 5; // 5: SSE4.1 supported
    if ((abcd[2] & (1 << 23)) == 0) return iset; // no POPCNT
    if ((abcd[2] & (1 << 20)) == 0) return iset; // no SSE4.2
    iset = 6; // 6: SSE4.2 supported
    if ((abcd[2] & (1 << 27)) == 0) return iset; // no OSXSAVE
    if ((xgetbv(0) & 6) != 6) return iset; // AVX not enabled in O.S.
    if ((abcd[2] & (1 << 28)) == 0) return iset; // no AVX
    iset = 7; // 7: AVX supported
    cpuid(abcd, 7); // call cpuid leaf 7 for feature flags
    if ((abcd[1] & (1 << 5)) == 0) return iset; // no AVX2
    iset = 8; // 8: AVX2 supported
    return iset;
    }

    //<< END <instrset_detect.cpp>
    //------------------------------------------------------------------------------

    int main() {
    int iset = instrset_detect();
    bool hasAVX = iset >= 7;
    printf("Has AVX? %s (iset = %d)\n", hasAVX ? "Yes!" : "No.", iset);
    return 0;
    }