Your task is to convert a given C++ function that uses ARM NEON SIMD intrinsics to WASM SIMD. Here is an example of another function: ```cpp void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; int ib = 0; float sumf = 0; assert(n % qk == 0); assert(qk == QK5_0); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); const block_q5_0 * restrict x = vx; const block_q8_0 * restrict y = vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); uint32_t qh0; uint32_t qh1; uint64_t tmp0[4]; uint64_t tmp1[4]; for (; ib + 1 < nb; ib += 2) { const block_q5_0 * restrict x0 = &x[ib]; const block_q5_0 * restrict x1 = &x[ib + 1]; const block_q8_0 * restrict y0 = &y[ib]; const block_q8_0 * restrict y1 = &y[ib + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); // extract the 5th bit via lookup table ((!b) << 4) memcpy(&qh0, x0->qh, sizeof(qh0)); memcpy(&qh1, x1->qh, sizeof(qh1)); tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF]; tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF]; tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF]; tmp0[3] = table_b2b_1[(qh0 >> 24) ]; tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF]; tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF]; tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF]; tmp1[3] = table_b2b_1[(qh1 >> 24) ]; const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); const uint8x16_t v0_0 = vld1q_u8(x0->qs); const uint8x16_t v0_1 = vld1q_u8(x1->qs); // 4-bit -> 8-bit int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); // add high bit and sub 16 
(equivalent to sub 0x10 when bit is zero) const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0); const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0); const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1); const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1); // load y const int8x16_t v1_0l = vld1q_s8(y0->qs); const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); const int8x16_t v1_1l = vld1q_s8(y1->qs); const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); } sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); #elif defined(__wasm_simd128__) v128_t sumv = wasm_f32x4_splat(0.0f); uint32_t qh; uint64_t tmp[4]; // TODO: check if unrolling this is better for (; ib < nb; ++ib) { const block_q5_0 * restrict x0 = &x[ib]; const block_q8_0 * restrict y0 = &y[ib]; const v128_t m4b = wasm_i8x16_splat(0x0F); // extract the 5th bit memcpy(&qh, x0->qh, sizeof(qh)); tmp[0] = table_b2b_1[(qh >> 0) & 0xFF]; tmp[1] = table_b2b_1[(qh >> 8) & 0xFF]; tmp[2] = table_b2b_1[(qh >> 16) & 0xFF]; tmp[3] = table_b2b_1[(qh >> 24) ]; const v128_t qhl = wasm_v128_load(tmp + 0); const v128_t qhh = wasm_v128_load(tmp + 2); const v128_t v0 = wasm_v128_load(x0->qs); // 4-bit -> 8-bit const v128_t v0l = wasm_v128_and (v0, m4b); const v128_t v0h = wasm_u8x16_shr(v0, 4); // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) const v128_t v0lf = wasm_i8x16_sub(v0l, qhl); const v128_t v0hf = wasm_i8x16_sub(v0h, qhh); // load y const v128_t v1l = wasm_v128_load(y0->qs); const v128_t v1h = wasm_v128_load(y0->qs + 16); // int8x16 -> int16x8 const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); const v128_t v0lfh = 
wasm_i16x8_extend_high_i8x16(v0lf); const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); // dot product sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( wasm_i32x4_add( wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), wasm_i32x4_dot_i16x8(v0lfh, v1lh)), wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d)))); } sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); #endif for (; ib < nb; ++ib) { uint32_t qh; memcpy(&qh, x[ib].qh, sizeof(qh)); int sumi0 = 0; int sumi1 = 0; for (int j = 0; j < qk/2; ++j) { const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16); const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16); sumi0 += (x0 * y[ib].qs[j]); sumi1 += (x1 * y[ib].qs[j + qk/2]); } int sumi = sumi0 + sumi1; sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi; } *s = sumf; } ``` Here is the function that you need to convert: ```cpp void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); const block_q8_0 * restrict x = vx; const block_q8_0 * restrict y = vy; int ib = 0; float sumf = 0; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); for (; ib + 1 
< nb; ib += 2) { const block_q8_0 * restrict x0 = &x[ib + 0]; const block_q8_0 * restrict x1 = &x[ib + 1]; const block_q8_0 * restrict y0 = &y[ib + 0]; const block_q8_0 * restrict y1 = &y[ib + 1]; const int8x16_t x0_0 = vld1q_s8(x0->qs); const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); const int8x16_t x1_0 = vld1q_s8(x1->qs); const int8x16_t x1_1 = vld1q_s8(x1->qs + 16); // load y const int8x16_t y0_0 = vld1q_s8(y0->qs); const int8x16_t y0_1 = vld1q_s8(y0->qs + 16); const int8x16_t y1_0 = vld1q_s8(y1->qs); const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); } sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); #endif for (; ib < nb; ++ib) { int sumi = 0; for (int j = 0; j < qk; j++) { sumi += x[ib].qs[j]*y[ib].qs[j]; } sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)); } *s = sumf; } ``` You must start your code with `#elif defined(__wasm_simd128__)`