// half->float variants.
// by Fabian "ryg" Giesen.
//
// I hereby place this code in the public domain.
//
// half_to_float_fast: table based
// tables could be done in a more compact fashion (in particular, can store tab2 in low word of tab1!)
// but something of a dead end since not very SIMD-friendly. pretty much abandoned at this point.
//
// half_to_float_fast2: use FP adder hardware to deal with denormals.
// now this one has potential! (but needs some polish)
//
// half_to_float_fast3: same, but without bitfields. low number of constants involved.
// looking pretty good. need to check how it comes out in SSE2.
//
// half_to_float_fast4: again the same, but written to be easy to translate to SSE2
// intrinsics.
//
// half_to_float4_SSE2: initial port of half_to_float_fast4 to SSE2. should do a second
// pass + clean-up, but not today. :)
//
// half_to_float4b_SSE2: some tweaks. should generate fewer ops and needs fewer constants;
// about 11% faster on my Sandy Bridge i7 using VS2010. YMMV.
//
// half_to_float_fast5: slightly different approach, turns FP16 denormals into FP32 denormals.
// it's very slick and short but will be slower if denormals actually occur.
//
// half_to_float5_SSE2: SSE2-ified version of "fast5" variant. as said, in the presence of
// denormals, this will be noticeably slower than variants 4/4b. use the included benchmarking
// code to find out by how much :). it's kinda serial, which means that even though it has a lot
// fewer instructions than variants 4/4b, it's not all that much faster even in the best case.

#include <stdio.h>
#include <emmintrin.h>
#include <intrin.h>

typedef unsigned int uint;

union FP32
{
    uint u;
    float f;
    struct
    {
        uint Mantissa : 23;
        uint Exponent : 8;
        uint Sign : 1;
    };
};

union FP16
{
    unsigned short u;
    struct
    {
        uint Mantissa : 10;
        uint Exponent : 5;
        uint Sign : 1;
    };
};

static FP32 half_to_float_full(FP16 h)
{
    FP32 o = { 0 };

    // From ISPC ref code
    if (h.Exponent == 0 && h.Mantissa == 0) // (Signed) zero
        o.Sign = h.Sign;
    else
    {
        if (h.Exponent == 0) // Denormal (will convert to normalized)
        {
            // Adjust mantissa so it's normalized (and keep track of exp adjust)
            int e = -1;
            uint m = h.Mantissa;
            do
            {
                e++;
                m <<= 1;
            } while ((m & 0x400) == 0);

            o.Mantissa = (m & 0x3ff) << 13;
            o.Exponent = 127 - 15 - e;
            o.Sign = h.Sign;
        }
        else if (h.Exponent == 0x1f) // Inf/NaN
        {
            // NOTE: It's safe to treat both with the same code path by just truncating
            // lower Mantissa bits in NaNs (this is valid).
            o.Mantissa = h.Mantissa << 13;
            o.Exponent = 255;
            o.Sign = h.Sign;
        }
        else // Normalized number
        {
            o.Mantissa = h.Mantissa << 13;
            o.Exponent = 127 - 15 + h.Exponent;
            o.Sign = h.Sign;
        }
    }

    return o;
}
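// Example usage (illustrative, not part of the original code): converting a single value.
//
//   FP16 h;
//   h.u = 0x3c00;                    // 1.0 in half precision
//   FP32 f = half_to_float_full(h);  // f.f == 1.0f, f.u == 0x3f800000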
// Conversion tables
static uint tab1[256], tab2[256], tab3[256];

static void init_tables()
{
    FP16 f16;
    FP32 f32;
    for (int i=0; i < 256; i++)
    {
        f16.u = i << 8;
        f32 = half_to_float_full(f16);
        tab1[i] = f32.u;
        tab2[i] = 1 << 13;

        f16.u = i;
        f32 = half_to_float_full(f16);
        tab3[i] = f32.u;
    }

    // Lower exponent end has some denormals: the float bit pattern still changes linearly
    // with the low byte there, but with a larger step, since the denormal mantissa gets
    // normalized with a bigger shift.
    tab2[0x03] = 1 << 14;
    tab2[0x02] = 1 << 14;
    tab2[0x01] = 1 << 15;
    tab2[0x83] = 1 << 14;
    tab2[0x82] = 1 << 14;
    tab2[0x81] = 1 << 15;
}

static FP32 half_to_float_fast(FP16 h)
{
    FP32 o;
    if (h.u & 0x7f00)
        o.u = tab1[h.u >> 8] + tab2[h.u >> 8] * (h.u & 0xff);
    else
        o.u = ((h.u & 0x8000) << 16) | tab3[h.u & 0xff];

    return o;
}

static FP32 half_to_float_fast2(FP16 h)
{
    static const FP32 magic = { 126 << 23 };
    FP32 o;

    if (h.Exponent == 0) // Zero / Denormal
    {
        o.u = magic.u + h.Mantissa;
        o.f -= magic.f;
    }
    else
    {
        o.Mantissa = h.Mantissa << 13;
        if (h.Exponent == 0x1f) // Inf/NaN
            o.Exponent = 255;
        else
            o.Exponent = 127 - 15 + h.Exponent;
    }

    o.Sign = h.Sign;
    return o;
}

static FP32 half_to_float_fast3(FP16 h)
{
    static const FP32 magic = { 113 << 23 };
    static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift

    FP32 o;

    // mantissa+exponent
    uint shifted = (h.u & 0x7fff) << 13;
    uint exponent = shifted & shifted_exp;

    // exponent cases
    o.u = shifted;
    if (exponent == 0) // Zero / Denormal
    {
        o.u += magic.u;
        o.f -= magic.f;
    }
    else if (exponent == shifted_exp) // Inf/NaN
        o.u += (255 - 31) << 23;
    else
        o.u += (127 - 15) << 23;

    o.u |= (h.u & 0x8000) << 16; // copy sign bit
    return o;
}
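// Why the denormal path in fast2/fast3 (and fast4 below) works, as a worked example (added
// for clarity): a half denormal encodes the value m * 2^-24, where m is its 10-bit mantissa.
// fast3 drops m into the float mantissa field with the exponent field forced to 113, i.e. the
// float (1 + m/2^10) * 2^-14 = 2^-14 + m * 2^-24. Subtracting magic.f = 2^-14 (exponent field
// 113, mantissa 0) leaves exactly m * 2^-24, with the FP adder doing the renormalization.
// For h.u = 0x0001: (2^-14 + 2^-24) - 2^-14 = 2^-24, the correct result.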
static FP32 half_to_float_fast4(FP16 h)
{
    static const FP32 magic = { 113 << 23 };
    static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift
    FP32 o;

    o.u = (h.u & 0x7fff) << 13;   // exponent/mantissa bits
    uint exp = shifted_exp & o.u; // just the exponent
    o.u += (127 - 15) << 23;      // exponent adjust

    // handle exponent special cases
    if (exp == shifted_exp) // Inf/NaN?
        o.u += (128 - 16) << 23;  // extra exp adjust
    else if (exp == 0) // Zero/Denormal?
    {
        o.u += 1 << 23;           // extra exp adjust
        o.f -= magic.f;           // renormalize
    }

    o.u |= (h.u & 0x8000) << 16;  // sign bit
    return o;
}

static FP32 half_to_float_fast5(FP16 h)
{
    static const FP32 magic = { (254 - 15) << 23 };
    static const FP32 was_infnan = { (127 + 16) << 23 };
    FP32 o;

    o.u = (h.u & 0x7fff) << 13;   // exponent/mantissa bits
    o.f *= magic.f;               // exponent adjust
    if (o.f >= was_infnan.f)      // make sure Inf/NaN survive
        o.u |= 255 << 23;
    o.u |= (h.u & 0x8000) << 16;  // sign bit
    return o;
}
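// Why the fast5 scaling works (added for clarity): (h.u & 0x7fff) << 13 moves the half's
// exponent e into the float exponent field, so a normalized half reads back as
// (1 + m/2^10) * 2^(e-127) instead of (1 + m/2^10) * 2^(e-15). Multiplying by
// magic.f = 2^(239-127) = 2^112 fixes the bias in one go. FP16 denormals show up as FP32
// denormal inputs to that multiply (hence the slowdown when they occur), and Inf/NaN end up
// at or above was_infnan.f = 2^16 = 65536, just past the largest finite half (65504), which
// is how they get detected and patched to exponent 255.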
static __m128 half_to_float4_SSE2(__m128i h)
{
#define SSE_CONST4(name, val) static const __declspec(align(16)) uint name[4] = { (val), (val), (val), (val) }
#define CONST(name) *(const __m128i *)&name

    SSE_CONST4(mask_nosign, 0x7fff);
    SSE_CONST4(mask_justsign, 0x8000);
    SSE_CONST4(mask_shifted_exp, 0x7c00 << 13);
    SSE_CONST4(expadjust_normal, (127 - 15) << 23);
    SSE_CONST4(expadjust_infnan, (128 - 16) << 23);
    SSE_CONST4(expadjust_denorm, 1 << 23);
    SSE_CONST4(magic_denorm, 113 << 23);

    __m128i mnosign = CONST(mask_nosign);
    __m128i expmant = _mm_and_si128(mnosign, h);
    __m128i justsign = _mm_and_si128(h, CONST(mask_justsign));
    __m128i mshiftexp = CONST(mask_shifted_exp);
    __m128i eadjust = CONST(expadjust_normal);
    __m128i shifted = _mm_slli_epi32(expmant, 13);
    __m128i adjusted = _mm_add_epi32(eadjust, shifted);
    __m128i justexp = _mm_and_si128(shifted, mshiftexp);

    __m128i zero = _mm_setzero_si128();
    __m128i b_isinfnan = _mm_cmpeq_epi32(mshiftexp, justexp);
    __m128i b_isdenorm = _mm_cmpeq_epi32(zero, justexp);

    __m128i adj_infnan = _mm_and_si128(b_isinfnan, CONST(expadjust_infnan));
    __m128i adjusted2 = _mm_add_epi32(adjusted, adj_infnan);

    __m128i adj_den = CONST(expadjust_denorm);
    __m128i den1 = _mm_add_epi32(adj_den, adjusted2);
    __m128 den2 = _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm);
    __m128 adjusted3 = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm));
    __m128 adjusted4 = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2));
    __m128 adjusted5 = _mm_or_ps(adjusted3, adjusted4);

    __m128i sign = _mm_slli_epi32(justsign, 16);
    __m128 final = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign));

    // ~21 SSE2 ops.
    return final;

#undef SSE_CONST4
#undef CONST
}

static __m128 half_to_float4b_SSE2(__m128i h)
{
#define SSE_CONST4(name, val) static const __declspec(align(16)) uint name[4] = { (val), (val), (val), (val) }
#define CONST(name) *(const __m128i *)&name

    SSE_CONST4(mask_nosign, 0x7fff);
    SSE_CONST4(smallest_normal, 0x0400);
    SSE_CONST4(infinity, 0x7c00);
    SSE_CONST4(expadjust_normal, (127 - 15) << 23);
    SSE_CONST4(magic_denorm, 113 << 23);

    __m128i mnosign = CONST(mask_nosign);
    __m128i eadjust = CONST(expadjust_normal);
    __m128i smallest = CONST(smallest_normal);
    __m128i infty = CONST(infinity);

    __m128i expmant = _mm_and_si128(mnosign, h);
    __m128i justsign = _mm_xor_si128(h, expmant);
    __m128i b_notinfnan = _mm_cmpgt_epi32(infty, expmant);
    __m128i b_isdenorm = _mm_cmpgt_epi32(smallest, expmant);
    __m128i shifted = _mm_slli_epi32(expmant, 13);
    __m128i adj_infnan = _mm_andnot_si128(b_notinfnan, eadjust);
    __m128i adjusted = _mm_add_epi32(eadjust, shifted);
    __m128i den1 = _mm_add_epi32(shifted, CONST(magic_denorm));
    __m128i adjusted2 = _mm_add_epi32(adjusted, adj_infnan);
    __m128 den2 = _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm);
    __m128 adjusted3 = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm));
    __m128 adjusted4 = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2));
    __m128 adjusted5 = _mm_or_ps(adjusted3, adjusted4);
    __m128i sign = _mm_slli_epi32(justsign, 16);
    __m128 final = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign));

    // ~19 SSE2 ops.
    return final;

#undef SSE_CONST4
#undef CONST
}

static __m128 half_to_float5_SSE2(__m128i h)
{
#define SSE_CONST4(name, val) static const __declspec(align(16)) uint name[4] = { (val), (val), (val), (val) }
#define CONST(name) *(const __m128i *)&name
#define CONSTF(name) *(const __m128 *)&name

    SSE_CONST4(mask_nosign, 0x7fff);
    SSE_CONST4(magic, (254 - 15) << 23);
    SSE_CONST4(was_infnan, 0x7bff);
    SSE_CONST4(exp_infnan, 255 << 23);

    __m128i mnosign = CONST(mask_nosign);
    __m128i expmant = _mm_and_si128(mnosign, h);
    __m128i justsign = _mm_xor_si128(h, expmant);
    __m128i expmant2 = expmant; // copy (just here for counting purposes)
    __m128i shifted = _mm_slli_epi32(expmant, 13);
    __m128 scaled = _mm_mul_ps(_mm_castsi128_ps(shifted), *(const __m128 *)&magic);
    __m128i b_wasinfnan = _mm_cmpgt_epi32(expmant2, CONST(was_infnan));
    __m128i sign = _mm_slli_epi32(justsign, 16);
    __m128 infnanexp = _mm_and_ps(_mm_castsi128_ps(b_wasinfnan), CONSTF(exp_infnan));
    __m128 sign_inf = _mm_or_ps(_mm_castsi128_ps(sign), infnanexp);
    __m128 final = _mm_or_ps(scaled, sign_inf);

    // ~11 SSE2 ops.
    return final;

#undef SSE_CONST4
#undef CONST
#undef CONSTF
}
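// Usage sketch (added, not part of the original code): the SSE2 variants above take one half
// per 32-bit lane, so packed 16-bit halves need to be widened first. The helper name below is
// made up for illustration; _mm_loadl_epi64 plus a zero unpack is one way to do it.
static __m128 half4_from_packed_example(const unsigned short *src)
{
    __m128i packed = _mm_loadl_epi64((const __m128i *)src);          // 4 halves in the low 64 bits
    __m128i lanes = _mm_unpacklo_epi16(packed, _mm_setzero_si128()); // zero-extend each half to a 32-bit lane
    return half_to_float4b_SSE2(lanes);
}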
// make sure we don't get DCE on SSE code
__declspec(align(16)) float output[1024*4];

int main(int argc, char **argv)
{
    FP16 h;
    FP32 full, fast, fast2, fast3, fast4, fast5;

    init_tables();

    // check the scalar variants against the reference conversion for all 2^16 inputs
    for (int i=0; i < 0x10000; i++)
    {
        h.u = i;
        full = half_to_float_full(h);
        fast = half_to_float_fast(h);
        fast2 = half_to_float_fast2(h);
        fast3 = half_to_float_fast3(h);
        fast4 = half_to_float_fast4(h);
        fast5 = half_to_float_fast5(h);

        if (full.u != fast.u || full.u != fast2.u || full.u != fast3.u || full.u != fast4.u || full.u != fast5.u)
        {
            printf("mismatch! val=%04x full=%08x fast=%08x fast2=%08x fast3=%08x fast4=%08x fast5=%08x\n",
                i, full.u, fast.u, fast2.u, fast3.u, fast4.u, fast5.u);
            return 1;
        }
    }

    // check the SSE2 variant against the reference conversion
    for (int i=0; i < 0x10000; i += 4)
    {
        uint ref[4];
        uint ssein[4], sseout[4];

        for (int j=0; j < 4; j++)
        {
            ssein[j] = i + j;
            h.u = i + j;
            full = half_to_float_full(h);
            ref[j] = full.u;
        }

        __m128i in = _mm_loadu_si128((const __m128i *)ssein);
        __m128 out = half_to_float4b_SSE2(in);
        _mm_storeu_ps((float *)sseout, out);

        for (int j=0; j < 4; j++)
        {
            if (sseout[j] != ref[j])
            {
                printf("mismatch! val=%04x full=%08x fast4b_SSE2=%08x\n", i+j, ref[j], sseout[j]);
                return 1;
            }
        }
    }

    // benchmark: best-of-15000 runs over the full input range
    uint best = ~0u;
    int start = 0, end = 0x10000;

    for (int runs=0; runs < 15000; runs++)
    {
        __m128i vals = _mm_set_epi32(start + 3, start + 2, start + 1, start + 0);
        __m128i incr = _mm_set1_epi32(4);

        uint tstart = (uint) __rdtsc();
        for (int i=start; i < end; i += 4)
        {
            __m128 out = half_to_float4b_SSE2(vals);
            _mm_store_ps(&output[i & 1023], out);
            vals = _mm_add_epi32(vals, incr);
        }
        uint time = (uint) __rdtsc() - tstart;
        if (time < best)
            best = time;
    }

    printf("best: %d cycles = %.2f / vec\n", best, 4.0f * best / (end - start));
    printf("all ok.\n");
    return 0;
}