/*
 * NOTE(review): the header names on the four #include lines were missing from
 * the file as received. The set below is reconstructed from the types and
 * intrinsics this file uses -- confirm against the original.
 */
#include <stdint.h>
#include <emmintrin.h> /* SSE2: __m128i, loads, xor, byte shifts */
#include <smmintrin.h> /* SSE4.1 */
#include <wmmintrin.h> /* PCLMULQDQ: _mm_clmulepi64_si128 */

/*
 * vext8(x, y): build a 128-bit value from the two "inner" 64-bit halves of
 * the pair (x, y):
 *
 *        hi      lo
 *   x = [x.hi | x.lo]
 *   y = [y.hi | y.lo]
 *   result = [y.lo | x.hi]      (low lane = x.hi, high lane = y.lo)
 *
 * The original comment named the operands "Vd, Vn, Vm, #imm", so this
 * presumably mirrors a NEON-style EXT with an 8-byte offset -- confirm.
 *
 * Implemented as (x >> 64) | (y << 64) using SSE2 byte shifts only: no GNU
 * statement expression, no __m64 casts, and each argument is evaluated once.
 */
#define vext8(x, y) \
    _mm_or_si128(_mm_srli_si128((x), 8), _mm_slli_si128((y), 8))

/*
 * Let GCC/Clang compile polymul() with PCLMULQDQ enabled even when the
 * translation unit is not built with -mpclmul. The CPU must still support
 * the instruction at run time.
 */
#if defined(__GNUC__)
#define POLYMUL_TARGET __attribute__((target("pclmul")))
#else
#define POLYMUL_TARGET
#endif

/*
 * polymul - carry-less multiply two 128-bit values and fold the 256-bit
 * product back to 128 bits.
 *
 * @acc: 16 bytes, read with an unaligned load; not modified.
 * @key: 16 bytes, read with an unaligned load; not modified.
 *
 * The 128x128 product is formed from three 64x64 carry-less multiplies
 * (Karatsuba). Its low 128 bits are then folded into the high 128 bits with
 * two multiplies by the constant 0xc200000000000000.
 * NOTE(review): this looks like a GHASH-style GF(2^128) reduction; the exact
 * field/bit-order convention is not stated in this file -- confirm with the
 * caller.
 *
 * Returns the folded 128-bit result.
 */
POLYMUL_TARGET
__m128i polymul(uint8_t acc[16], uint8_t key[16])
{
    const __m128i x = _mm_loadu_si128((const __m128i *)acc);
    const __m128i y = _mm_loadu_si128((const __m128i *)key);

    /*
     * Karatsuba, step 1: three 64x64 carry-less products.
     * After the xor, the low lane of tmp0/tmp1 holds (lo ^ hi) of x/y; only
     * that lane is consumed by the 0x00 multiply below.
     */
    __m128i tmp0 = _mm_xor_si128(vext8(x, x), x);
    __m128i tmp1 = _mm_xor_si128(vext8(y, y), y);

    __m128i m = _mm_clmulepi64_si128(tmp0, tmp1, 0x00); /* (x.lo^x.hi)*(y.lo^y.hi) */
    __m128i h = _mm_clmulepi64_si128(x, y, 0x11);       /* x.hi * y.hi */
    __m128i l = _mm_clmulepi64_si128(x, y, 0x00);       /* x.lo * y.lo */

    /*
     * Karatsuba, step 2: combine into a 256-bit product.
     * mid = m ^ h ^ l ^ [l.hi, h.lo] holds the two middle 64-bit words.
     */
    __m128i mid = _mm_xor_si128(m, vext8(l, h));
    mid = _mm_xor_si128(mid, _mm_xor_si128(h, l));

    /* Swap halves so vext8 can stitch the outer words onto the middle ones. */
    h = vext8(h, h);
    l = vext8(l, l);
    const __m128i x23 = vext8(mid, h); /* upper 128 bits of the product */
    const __m128i x01 = vext8(l, mid); /* lower 128 bits of the product */

    /* Reduce: fold x01 into x23 using two multiplies by the constant. */
    const __m128i poly = _mm_set1_epi64x((long long)0xc200000000000000ULL);

    __m128i a = _mm_clmulepi64_si128(poly, x01, 0x00);
    __m128i b = _mm_xor_si128(vext8(a, a), x01);
    __m128i c = _mm_clmulepi64_si128(poly, b, 0x11);

    return _mm_xor_si128(_mm_xor_si128(c, b), x23);
}