ngxson/FAQ.md

Last active August 5, 2025 17:29

Star (129) You must be signed in to star a gist
Fork (22) You must be signed in to fork a gist

Learn more about clone URLs
Clone this repository at <script src="https://gist.github.com/ngxson/307140d24d80748bd683b396ba13be07.js"></script>
Save ngxson/307140d24d80748bd683b396ba13be07 to your computer and use it in GitHub Desktop.

Download ZIP

convert ARM NEON to WASM SIMD prompt

Raw

prompt.md

Your task is to convert a given C++ ARM NEON SIMD to WASM SIMD. Here is an example of another function:

void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

    int ib = 0;
    float sumf = 0;

    assert(n % qk == 0);
    assert(qk == QK5_0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q5_0 * restrict x = vx;
    const block_q8_0 * restrict y = vy;

#if defined(__ARM_NEON)
    float32x4_t sumv0 = vdupq_n_f32(0.0f);
    float32x4_t sumv1 = vdupq_n_f32(0.0f);

    uint32_t qh0;
    uint32_t qh1;

    uint64_t tmp0[4];
    uint64_t tmp1[4];

    for (; ib + 1 < nb; ib += 2) {
        const block_q5_0 * restrict x0 = &x[ib];
        const block_q5_0 * restrict x1 = &x[ib + 1];
        const block_q8_0 * restrict y0 = &y[ib];
        const block_q8_0 * restrict y1 = &y[ib + 1];

        const uint8x16_t m4b = vdupq_n_u8(0x0F);

        // extract the 5th bit via lookup table ((!b) << 4)
        memcpy(&qh0, x0->qh, sizeof(qh0));
        memcpy(&qh1, x1->qh, sizeof(qh1));

        tmp0[0] = table_b2b_1[(qh0 >>  0) & 0xFF];
        tmp0[1] = table_b2b_1[(qh0 >>  8) & 0xFF];
        tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF];
        tmp0[3] = table_b2b_1[(qh0 >> 24)       ];

        tmp1[0] = table_b2b_1[(qh1 >>  0) & 0xFF];
        tmp1[1] = table_b2b_1[(qh1 >>  8) & 0xFF];
        tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF];
        tmp1[3] = table_b2b_1[(qh1 >> 24)       ];

        const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0));
        const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2));
        const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0));
        const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2));

        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
        const uint8x16_t v0_1 = vld1q_u8(x1->qs);

        // 4-bit -> 8-bit
        int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
        int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
        int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
        int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));

        // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero)
        const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0);
        const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0);
        const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1);
        const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1);

        // load y
        const int8x16_t v1_0l = vld1q_s8(y0->qs);
        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
        const int8x16_t v1_1l = vld1q_s8(y1->qs);
        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);

        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
    }

    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
#elif defined(__wasm_simd128__)
    v128_t sumv = wasm_f32x4_splat(0.0f);

    uint32_t qh;
    uint64_t tmp[4];

    // TODO: check if unrolling this is better
    for (; ib < nb; ++ib) {
        const block_q5_0 * restrict x0 = &x[ib];
        const block_q8_0 * restrict y0 = &y[ib];

        const v128_t m4b  = wasm_i8x16_splat(0x0F);

        // extract the 5th bit
        memcpy(&qh, x0->qh, sizeof(qh));

        tmp[0] = table_b2b_1[(qh >>  0) & 0xFF];
        tmp[1] = table_b2b_1[(qh >>  8) & 0xFF];
        tmp[2] = table_b2b_1[(qh >> 16) & 0xFF];
        tmp[3] = table_b2b_1[(qh >> 24)       ];

        const v128_t qhl = wasm_v128_load(tmp + 0);
        const v128_t qhh = wasm_v128_load(tmp + 2);

        const v128_t v0 = wasm_v128_load(x0->qs);

        // 4-bit -> 8-bit
        const v128_t v0l = wasm_v128_and (v0, m4b);
        const v128_t v0h = wasm_u8x16_shr(v0, 4);

        // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero)
        const v128_t v0lf = wasm_i8x16_sub(v0l, qhl);
        const v128_t v0hf = wasm_i8x16_sub(v0h, qhh);

        // load y
        const v128_t v1l = wasm_v128_load(y0->qs);
        const v128_t v1h = wasm_v128_load(y0->qs + 16);

        // int8x16 -> int16x8
        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);

        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);

        // dot product
        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
                        wasm_i32x4_add(
                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
                    wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
    }

    sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
           wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
#endif

    for (; ib < nb; ++ib) {
        uint32_t qh;
        memcpy(&qh, x[ib].qh, sizeof(qh));

        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));

            const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
            const int32_t x1 = (int8_t)(((x[ib].qs[j] >>   4) | xh_1) - 16);

            sumi0 += (x0 * y[ib].qs[j]);
            sumi1 += (x1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
    }

    *s = sumf;
}

Here is the function that you need to convert:

void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q8_0 * restrict x = vx;
    const block_q8_0 * restrict y = vy;

    int ib = 0;
    float sumf = 0;

#if defined(__ARM_NEON)
    float32x4_t sumv0 = vdupq_n_f32(0.0f);
    float32x4_t sumv1 = vdupq_n_f32(0.0f);

    for (; ib + 1 < nb; ib += 2) {
        const block_q8_0 * restrict x0 = &x[ib + 0];
        const block_q8_0 * restrict x1 = &x[ib + 1];
        const block_q8_0 * restrict y0 = &y[ib + 0];
        const block_q8_0 * restrict y1 = &y[ib + 1];

        const int8x16_t x0_0 = vld1q_s8(x0->qs);
        const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
        const int8x16_t x1_0 = vld1q_s8(x1->qs);
        const int8x16_t x1_1 = vld1q_s8(x1->qs + 16);

        // load y
        const int8x16_t y0_0 = vld1q_s8(y0->qs);
        const int8x16_t y0_1 = vld1q_s8(y0->qs + 16);
        const int8x16_t y1_0 = vld1q_s8(y1->qs);
        const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);

        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
                        ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
                        ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));

        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
                        ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
                        ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
    }

    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
#endif
    for (; ib < nb; ++ib) {
        int sumi = 0;

        for (int j = 0; j < qk; j++) {
            sumi += x[ib].qs[j]*y[ib].qs[j];
        }

        sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
    }

    *s = sumf;
}

You must start your code with #elif defined(__wasm_simd128__)

Naman-ntc commented Jan 27, 2025

Thanks for sharing the prompts! How much effort was required to build the tests to enable this optimization?

And how many iterations/attempts were needed to generate the correct code?

Author

ngxson commented Jan 27, 2025

For the qX_0 variants, they are actually quite straight-forward so deepseek can come up with a correct result in 1 shot

For the qX_K it's more complicated, I would say most of the time I need to re-prompt it 4 to 8 more times.

The most difficult was q6_K, the code never works until I ask it to only optimize one part, but not all (so it does not mess up everything)

henryclw commented Jan 27, 2025

Thank you so much for sharing this

timwu-ipevo commented Jan 27, 2025

Could you share the thought process R1 outputted? I'd like to understand how it approached this complex problem with just a single example.

Author

ngxson commented Jan 27, 2025 •

edited

Loading

@timwu-ipevo It's too long so I don't think it's worth sharing here. You can try it yourself on chat.deepseek.com, it's free

Manamama commented Jan 28, 2025

Amazing results, ta too. FYI, I have landed here from a Reddit llama thread discussing some Deepseek's geopolitics...

0xCA commented Jan 28, 2025 •

edited

Loading

Which qX_0 exactly you used? I can only see q8_0 here or here
You also said q6_K was the worst. Does it mean anything lower than 6bit is even worse?

Author

ngxson commented Jan 28, 2025

@0xCA All the qX_0_q8_0 functions were generated by that one prompt, just replace the data.

q6_K is an exception case because I need to ask it to reinvent the whole thing from scratch. Not sure why, but other qX_K requires less trials to get it right. For example, q4_K just work one-shot IIRC

Author

ngxson commented Jan 28, 2025

q6_K is an exception case because I need to ask it to reinvent the whole thing from scratch

And btw, for this exact reason, you will see that the code for q6_K looks different from the rest. For example, that's the only place it uses __attribute__((aligned(16)))

0xCA commented Jan 28, 2025

@ngxson ah, seems like I misunderstood you! You meant functions, and I thought you used quantized models of R1.

Author

ngxson commented Jan 28, 2025

No, as stated in the FAQ, I used chat.deepseek.com

And ironically, I came up with this whole idea while optimizing wllama to run deepseek-r1-distilled-qwen-1.5B faster. So the bigger deepseek helping optimize code to run the smaller deepseek.

hjortholm commented Jan 29, 2025

The code is out of my league, but I might have an improvement for the prompt. Try this:

Goal: Translate the ggml_vec_dot_q8_0_q8_0 function from ARM NEON to WASM SIMD, matching the style shown in the provided ggml_vec_dot_q5_0_q8_0 conversion.

Instructions:

Provide a WASM SIMD version of the function.
Begin the code block with #elif defined(wasm_simd128).
Preserve key operations:
Loading data into 128-bit registers (v128_t in WASM).
Performing dot products using WASM intrinsics analogous to vmlaq_n_f32 and vcvtq_f32_s32.
Summation in float32 format, as in the example.
Ensure the final WASM SIMD code is self-contained and references the same arrays/structs (block_q8_0, qs, d) as the ARM version.
Clearly comment any lines where NEON intrinsics have a direct WASM SIMD equivalent.
Retain the function signature and the final accumulation (sumf) approach.
If loops are unrolled similarly to the example, replicate that strategy. If performance trade-offs exist, note them in the comments.
Output Criteria:

A complete WASM SIMD function ready to place in the code as the counterpart to the NEON block.
Comments highlighting any differences or special considerations for WebAssembly.
Maintenance of the same input parameters and usage patterns, ensuring that the rest of the code remains compatible.

Here is an example of another function:

void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

    int ib = 0;
    float sumf = 0;

    assert(n % qk == 0);
    assert(qk == QK5_0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q5_0 * restrict x = vx;
    const block_q8_0 * restrict y = vy;

#if defined(__ARM_NEON)
    float32x4_t sumv0 = vdupq_n_f32(0.0f);
    float32x4_t sumv1 = vdupq_n_f32(0.0f);

    uint32_t qh0;
    uint32_t qh1;

    uint64_t tmp0[4];
    uint64_t tmp1[4];

    for (; ib + 1 < nb; ib += 2) {
        const block_q5_0 * restrict x0 = &x[ib];
        const block_q5_0 * restrict x1 = &x[ib + 1];
        const block_q8_0 * restrict y0 = &y[ib];
        const block_q8_0 * restrict y1 = &y[ib + 1];

        const uint8x16_t m4b = vdupq_n_u8(0x0F);

        // extract the 5th bit via lookup table ((!b) << 4)
        memcpy(&qh0, x0->qh, sizeof(qh0));
        memcpy(&qh1, x1->qh, sizeof(qh1));

        tmp0[0] = table_b2b_1[(qh0 >>  0) & 0xFF];
        tmp0[1] = table_b2b_1[(qh0 >>  8) & 0xFF];
        tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF];
        tmp0[3] = table_b2b_1[(qh0 >> 24)       ];

        tmp1[0] = table_b2b_1[(qh1 >>  0) & 0xFF];
        tmp1[1] = table_b2b_1[(qh1 >>  8) & 0xFF];
        tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF];
        tmp1[3] = table_b2b_1[(qh1 >> 24)       ];

        const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0));
        const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2));
        const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0));
        const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2));

        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
        const uint8x16_t v0_1 = vld1q_u8(x1->qs);

        // 4-bit -> 8-bit
        int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
        int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
        int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
        int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));

        // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero)
        const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0);
        const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0);
        const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1);
        const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1);

        // load y
        const int8x16_t v1_0l = vld1q_s8(y0->qs);
        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
        const int8x16_t v1_1l = vld1q_s8(y1->qs);
        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);

        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
    }

    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
#elif defined(__wasm_simd128__)
    v128_t sumv = wasm_f32x4_splat(0.0f);

    uint32_t qh;
    uint64_t tmp[4];

    // TODO: check if unrolling this is better
    for (; ib < nb; ++ib) {
        const block_q5_0 * restrict x0 = &x[ib];
        const block_q8_0 * restrict y0 = &y[ib];

        const v128_t m4b  = wasm_i8x16_splat(0x0F);

        // extract the 5th bit
        memcpy(&qh, x0->qh, sizeof(qh));

        tmp[0] = table_b2b_1[(qh >>  0) & 0xFF];
        tmp[1] = table_b2b_1[(qh >>  8) & 0xFF];
        tmp[2] = table_b2b_1[(qh >> 16) & 0xFF];
        tmp[3] = table_b2b_1[(qh >> 24)       ];

        const v128_t qhl = wasm_v128_load(tmp + 0);
        const v128_t qhh = wasm_v128_load(tmp + 2);

        const v128_t v0 = wasm_v128_load(x0->qs);

        // 4-bit -> 8-bit
        const v128_t v0l = wasm_v128_and (v0, m4b);
        const v128_t v0h = wasm_u8x16_shr(v0, 4);

        // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero)
        const v128_t v0lf = wasm_i8x16_sub(v0l, qhl);
        const v128_t v0hf = wasm_i8x16_sub(v0h, qhh);

        // load y
        const v128_t v1l = wasm_v128_load(y0->qs);
        const v128_t v1h = wasm_v128_load(y0->qs + 16);

        // int8x16 -> int16x8
        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);

        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);

        // dot product
        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
                        wasm_i32x4_add(
                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
                    wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
    }

    sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
           wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
#endif

    for (; ib < nb; ++ib) {
        uint32_t qh;
        memcpy(&qh, x[ib].qh, sizeof(qh));

        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));

            const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
            const int32_t x1 = (int8_t)(((x[ib].qs[j] >>   4) | xh_1) - 16);

            sumi0 += (x0 * y[ib].qs[j]);
            sumi1 += (x1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
    }

    *s = sumf;
}

Here is the function that you need to convert:

void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q8_0 * restrict x = vx;
    const block_q8_0 * restrict y = vy;

    int ib = 0;
    float sumf = 0;

#if defined(__ARM_NEON)
    float32x4_t sumv0 = vdupq_n_f32(0.0f);
    float32x4_t sumv1 = vdupq_n_f32(0.0f);

    for (; ib + 1 < nb; ib += 2) {
        const block_q8_0 * restrict x0 = &x[ib + 0];
        const block_q8_0 * restrict x1 = &x[ib + 1];
        const block_q8_0 * restrict y0 = &y[ib + 0];
        const block_q8_0 * restrict y1 = &y[ib + 1];

        const int8x16_t x0_0 = vld1q_s8(x0->qs);
        const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
        const int8x16_t x1_0 = vld1q_s8(x1->qs);
        const int8x16_t x1_1 = vld1q_s8(x1->qs + 16);

        // load y
        const int8x16_t y0_0 = vld1q_s8(y0->qs);
        const int8x16_t y0_1 = vld1q_s8(y0->qs + 16);
        const int8x16_t y1_0 = vld1q_s8(y1->qs);
        const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);

        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
                        ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
                        ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));

        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
                        ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
                        ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
    }

    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
#endif
    for (; ib < nb; ++ib) {
        int sumi = 0;

        for (int j = 0; j < qk; j++) {
            sumi += x[ib].qs[j]*y[ib].qs[j];
        }

        sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
    }

    *s = sumf;
}

hjortholm commented Jan 29, 2025

Optimized prompt for converting code

hjortholm commented Jan 29, 2025

DeepSeek evaluation of the converted code.

The performance gains from converting the ggml_vec_dot_q8_0_q8_0 function to use WASM SIMD (versus a scalar fallback in WebAssembly) are expected to be significant, with potential improvements of 4–8× for the SIMD-optimized portion of the code. Here’s a breakdown of the factors and trade-offs:

Key Performance Drivers
SIMD Parallelism:

Each 128-bit WASM SIMD register processes 16 int8 elements at once, reducing the number of instructions needed for arithmetic.

The i32x4.dot_i16x8 instruction computes 4 accumulated 32-bit dot products in parallel, replacing 32 scalar operations per block of 16 elements.

Memory Efficiency:

Contiguous 128-bit loads (wasm_v128_load) align with SIMD-friendly memory access patterns, minimizing overhead compared to scalar byte-wise loads.

Reduced Loop Overhead:

The SIMD version processes 32 elements per iteration (two 128-bit registers for x and y), drastically reducing loop iterations compared to scalar code (which processes 1 element per iteration).

Comparison to ARM NEON
NEON (original code): Uses vdotq_s32 for direct 8-bit dot products and unrolls loops (2 blocks per iteration), achieving high throughput.

WASM SIMD (converted code):

Requires 8-bit → 16-bit extension (extend_low/high_i8x16) before using dot_i16x8, adding slight overhead.

Processes 1 block per iteration (vs. 2 in NEON), increasing loop overhead slightly.

Despite this, WASM SIMD still achieves close-to-NEON performance in practice for WebAssembly targets.

Expected Gains
Scalar Fallback (WebAssembly):

Without SIMD, scalar code requires 32 multiply-adds + loop overhead per block.

SIMD reduces this to ~4–6 vector instructions per block, yielding 4–8× speedups for the SIMD-optimized loop.

Real-World Impact:

For large tensors (common in LLMs), the SIMD-optimized loop will dominate runtime, leading to near-linear scaling with SIMD width.

Example: A 512-element dot product would see ~16× fewer loop iterations in SIMD vs. scalar.

Trade-Offs
Loop Unrolling:

The NEON version unrolls loops (2 blocks/iteration), but the WASM code processes 1 block/iteration. This slightly increases loop overhead but simplifies code maintenance.

8→16-Bit Extension:

WASM lacks direct 8-bit dot product instructions, requiring extension to 16-bit. This adds instructions but is still far faster than scalar.

Code-Specific Optimizations
The provided WASM SIMD code:

Uses wasm_f32x4_mul + wasm_f32x4_add to mimic NEON’s vmlaq_n_f32 (fused multiply-add).

Efficiently reduces partial sums with wasm_f32x4_extract_lane, minimizing horizontal addition overhead.

Conclusion
The WASM SIMD version should deliver 4–8× faster execution compared to the scalar fallback in WebAssembly, depending on runtime optimizations (e.g., V8 TurboFan). While slightly less efficient than ARM NEON due to 8→16-bit extension and loop unrolling differences, it brings WebAssembly performance much closer to native code for quantized LLM inference. For best results, ensure the runtime enables SIMD support (e.g., --experimental-wasm-simd in Node.js).

hjortholm commented Jan 29, 2025

DeepSeek evaluation of the converted code.

hjortholm commented Jan 29, 2025 •

edited

Loading

A second pass optimization prompt:
In your thinking, you state "Possible performance trade-offs: processing one block at a time may use fewer registers but may not utilize SIMD as efficiently as processing two blocks. However, given the example's approach, this is acceptable."
Do I read this correctly as there are further performance gains possible? If yes explore potential further performance gains and implement these

#elif defined(wasm_simd128)
v128_t sumv = wasm_f32x4_splat(0.0f);

// Process two blocks per iteration to match NEON's unrolling
for (; ib + 1 < nb; ib += 2) {
    const block_q8_0 * restrict x0 = &x[ib];
    const block_q8_0 * restrict x1 = &x[ib + 1];
    const block_q8_0 * restrict y0 = &y[ib];
    const block_q8_0 * restrict y1 = &y[ib + 1];

    // Load all 32 elements (two blocks) into SIMD registers
    const v128_t x0_0 = wasm_v128_load(x0->qs);
    const v128_t x0_1 = wasm_v128_load(x0->qs + 16);
    const v128_t x1_0 = wasm_v128_load(x1->qs);
    const v128_t x1_1 = wasm_v128_load(x1->qs + 16);

    const v128_t y0_0 = wasm_v128_load(y0->qs);
    const v128_t y0_1 = wasm_v128_load(y0->qs + 16);
    const v128_t y1_0 = wasm_v128_load(y1->qs);
    const v128_t y1_1 = wasm_v128_load(y1->qs + 16);

    // Helper function to compute dot product for a single 16-element pair
    #define DOT_PRODUCT(a, b) \
        wasm_i32x4_add( \
            wasm_i32x4_dot_i16x8( \
                wasm_i16x8_extend_low_i8x16(a), \
                wasm_i16x8_extend_low_i8x16(b) \
            ), \
            wasm_i32x4_dot_i16x8( \
                wasm_i16x8_extend_high_i8x16(a), \
                wasm_i16x8_extend_high_i8x16(b) \
            ) \
        )

    // Compute dot products for both blocks in parallel
    const v128_t dot0 = DOT_PRODUCT(x0_0, y0_0); // Block 0, first 16 elements
    const v128_t dot1 = DOT_PRODUCT(x0_1, y0_1); // Block 0, second 16 elements
    const v128_t dot2 = DOT_PRODUCT(x1_0, y1_0); // Block 1, first 16 elements
    const v128_t dot3 = DOT_PRODUCT(x1_1, y1_1); // Block 1, second 16 elements

    // Combine results and convert to float
    const v128_t sum01 = wasm_i32x4_add(wasm_i32x4_add(dot0, dot1), wasm_i32x4_add(dot2, dot3));
    const v128_t scales = wasm_f32x4_make(
        GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d),
        GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d),
        GGML_FP16_TO_FP32(x1->d) * GGML_FP16_TO_FP32(y1->d),
        GGML_FP16_TO_FP32(x1->d) * GGML_FP16_TO_FP32(y1->d)
    );

    // Accumulate scaled sums
    sumv = wasm_f32x4_add(sumv, 
        wasm_f32x4_mul(
            wasm_f32x4_convert_i32x4(sum01),
            scales
        )
    );
}

// Handle remaining block (if nb is odd)
for (; ib < nb; ++ib) {
    const block_q8_0 * restrict x0 = &x[ib];
    const block_q8_0 * restrict y0 = &y[ib];

    const v128_t x0_0 = wasm_v128_load(x0->qs);
    const v128_t x0_1 = wasm_v128_load(x0->qs + 16);
    const v128_t y0_0 = wasm_v128_load(y0->qs);
    const v128_t y0_1 = wasm_v128_load(y0->qs + 16);

    const v128_t dot0 = DOT_PRODUCT(x0_0, y0_0);
    const v128_t dot1 = DOT_PRODUCT(x0_1, y0_1);

    const float scale = GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d);
    sumv = wasm_f32x4_add(sumv,
        wasm_f32x4_mul(
            wasm_f32x4_convert_i32x4(wasm_i32x4_add(dot0, dot1)),
            wasm_f32x4_splat(scale)
        )
    );
}

// Final reduction
sumf = wasm_f32x4_extract_lane(sumv, 0)
     + wasm_f32x4_extract_lane(sumv, 1)
     + wasm_f32x4_extract_lane(sumv, 2)
     + wasm_f32x4_extract_lane(sumv, 3);

#endif
``
Performance Analysis
Loop Unrolling:

Reduces loop overhead by processing 2 blocks/iteration (vs. 1), cutting branch checks by ~50%.

Matches NEON’s strategy for better instruction scheduling.

SIMD Throughput:

Processes 64 int8 elements (2 blocks × 32 elements) per iteration.

Uses i32x4.doti16x8 efficiently across all registers without register spilling.

Reduced Redundancy:

The DOT_PRODUCT macro avoids code duplication.

Combines scaled sums for both blocks into a single f32x4 multiplication, reducing conversions.

Expected Gains
~10–20% Speedup: By unrolling loops and reducing overhead, this optimized version closes the gap with NEON.

Better ILP: Modern CPUs can pipeline instructions more effectively when working on multiple independent blocks.

Trade-Offs
Code Complexity: Slightly harder to read but maintains the same algorithmic structure.

This implementation balances performance and readability, matching ARM NEON’s efficiency while adhering to WASM’s constraints.

hjortholm commented Jan 29, 2025

A second pass optimization prompt

hjortholm commented Jan 29, 2025

Prompt for measuring performance and further enhancement:

objective: Determine the total performance gains from newly introduced optimizations and explore potential improvements.

Performance Gains:
How much speedup or efficiency is expected compared to the previous implementation?
Which metrics (e.g., execution time, memory usage, throughput) should be measured, and how?
What tools or methods (e.g., profilers, benchmark suites) will validate these performance improvements? 
Optimization Angles:
Are there further optimizations in algorithmic design, concurrency, or hardware-specific tuning?
Could different compiler options, instruction sets, or data layouts enhance results?
What trade-offs (e.g., code complexity vs. performance) might arise?
Expected Outcomes:
Provide guidance on short-term improvements and long-term enhancements to maintain high performance.
Outline any potential bottlenecks or corner cases that may limit gains

Performance Gains Analysis
1. Expected Speedup vs. Previous Implementation
Previous (1 block/iteration): Processed 32 elements (1 block) per iteration, with loop overhead for each block.

Optimized (2 blocks/iteration):

Loop Unrolling: Reduces loop overhead by ~50% (fewer branch checks).

SIMD Utilization: Processes 64 elements (2 blocks) in parallel, leveraging instruction-level parallelism.

Expected Gains:

10–20% speedup over the initial WASM SIMD version (1 block/iteration).

4–6× faster than scalar WebAssembly code (non-SIMD fallback).

2. Key Metrics to Measure
Metric How to Measure Relevance
Execution Time Time per ggml_vec_dot_q8_0_q8_0 call using high-resolution timers (e.g., performance.now()). Direct measure of optimization impact.
Throughput Elements processed per second (e.g., (n / execution_time) * 1e6). Quantifies SIMD efficiency.
SIMD Utilization Assembly inspection (e.g., wasm-dis tool) to confirm SIMD instructions dominate. Ensures vectorization isn’t hindered by data dependencies or misalignment.
Memory Bandwidth Profiler metrics (e.g., cache misses, load/store throughput via perf stat in native runtimes). Identifies bottlenecks in data access.
3. Validation Tools & Methods
WebAssembly Tools:

Chrome DevTools: Profile WebAssembly execution with the Performance panel.

WABT (WebAssembly Binary Toolkit): Inspect generated SIMD instructions.

V8 Flags: Use --experimental-wasm-simd to enforce SIMD optimizations.

Benchmark Suites:

Microbenchmarks with varying n (e.g., 512, 1024, 2048 elements) to test scaling.

Cross-validate with ARM NEON timings to compare against native performance.

Profilers:

Linux Perf: If running in a WASM-to-native environment (e.g., Wasmer/Wasmtime).

Emscripten’s --profiling: Generate function-level timing reports.

Optimization Angles

Algorithmic Improvements
Reduced 8→16-Bit Extension Overhead:

Explore fused 8-bit dot product approximations (if WebAssembly adds i32x4.dot_i8x16 in future).

Block Size Tuning:

Experiment with larger blocks (e.g., QK8_0=64) to amortize loop overhead, but balance with cache locality.

2. Concurrency
Web Workers:

Split n into chunks processed in parallel across threads (limited by WebAssembly’s threading support).

SIMD + Multithreading:

Combine thread-level parallelism (e.g., SharedArrayBuffer) with SIMD for large tensors.

Hardware-Specific Tuning
Memory Alignment:

Ensure block_q8_0 structs are 16-byte aligned for faster v128.load operations.

CPU-Specific Scheduling:

Use compiler hints (e.g., #pragma unroll in Emscripten) to optimize for pipelining.

Compiler & Data Layout
Compiler Flags:

-msimd128 -O3 (Emscripten) to maximize SIMD optimizations.

-flto for link-time optimizations.

Data Layout:

Structure-of-Arrays (SoA) for qs/d fields to improve prefetching.

Trade-Offs
Optimization Benefit Trade-Off
Loop Unrolling Reduces branch overhead. Increases code size; harder to maintain.
SIMD Register Pressure Maximizes parallelism. Risk of spilling to memory on constrained hardware.
Threading Utilizes multiple cores. Adds complexity; WebAssembly threading is still experimental.
Expected Outcomes
Short-Term Improvements
Validate Current Optimizations:

Confirm 10–20% gains over the 1-block/iteration WASM SIMD code with microbenchmarks.

Memory Alignment:

Enforce 16-byte alignment for block_q8_0 to avoid unaligned loads.

Compiler Tuning:

Test -O3 vs. -Os to balance speed and code size.

Long-Term Enhancements
WebAssembly Future Features:

Adopt wider SIMD (e.g., 256-bit) if standardized.

Leverage i8x16.dot instructions if added to the spec.

Algorithmic Hybridization:

Mix SIMD and scalar code for small n (e.g., n < 256).

WebGPU Integration:

Offload large dot products to GPU compute shaders.

Bottlenecks & Corner Cases
Memory Bandwidth:

If tensors exceed L2 cache, SIMD gains may plateau. Use smaller blocks or tiling.

Odd Block Counts:

The fallback loop for residual blocks (ib < nb) adds minor overhead. Ensure it’s minimal.

Half-Precision Scaling:

GGML_FP16_TO_FP32 conversions are scalar; batch-convert d values upfront if possible.

Final Guidance
Immediate Next Steps:

Benchmark with real-world LLM inference workloads (e.g., a transformer layer).

Profile memory alignment impact using wasm-opt --align-features.

Future Roadmap:

Monitor WebAssembly SIMD evolution (e.g., relaxed SIMD proposals).

Explore WebGPU for heterogeneous compute in parallel with SIMD.

By balancing SIMD efficiency, loop unrolling, and memory optimizations, the current implementation achieves near-native performance for WebAssembly while leaving room for future gains as the ecosystem matures.

ngxson/FAQ.md

Naman-ntc commented Jan 27, 2025

Uh oh!

ngxson commented Jan 27, 2025

Uh oh!

henryclw commented Jan 27, 2025

Uh oh!

timwu-ipevo commented Jan 27, 2025

Uh oh!

ngxson commented Jan 27, 2025 •

edited

Loading

Uh oh!

Manamama commented Jan 28, 2025

Uh oh!

0xCA commented Jan 28, 2025 •

edited

Loading

Uh oh!

ngxson commented Jan 28, 2025

Uh oh!

ngxson commented Jan 28, 2025

Uh oh!

0xCA commented Jan 28, 2025

Uh oh!

ngxson commented Jan 28, 2025

Uh oh!

hjortholm commented Jan 29, 2025

Uh oh!

hjortholm commented Jan 29, 2025

Uh oh!

hjortholm commented Jan 29, 2025

Uh oh!

hjortholm commented Jan 29, 2025

Uh oh!

hjortholm commented Jan 29, 2025 •

edited

Loading

Uh oh!

hjortholm commented Jan 29, 2025

Uh oh!

hjortholm commented Jan 29, 2025

Uh oh!

ngxson/FAQ.md

Naman-ntc commented Jan 27, 2025

Uh oh!

ngxson commented Jan 27, 2025

Uh oh!

henryclw commented Jan 27, 2025

Uh oh!

timwu-ipevo commented Jan 27, 2025

Uh oh!

ngxson commented Jan 27, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Manamama commented Jan 28, 2025

Uh oh!

0xCA commented Jan 28, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

ngxson commented Jan 28, 2025

Uh oh!

ngxson commented Jan 28, 2025

Uh oh!

0xCA commented Jan 28, 2025

Uh oh!

ngxson commented Jan 28, 2025

Uh oh!

hjortholm commented Jan 29, 2025

Uh oh!

hjortholm commented Jan 29, 2025

Uh oh!

hjortholm commented Jan 29, 2025

Uh oh!

hjortholm commented Jan 29, 2025

Uh oh!

hjortholm commented Jan 29, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

hjortholm commented Jan 29, 2025

Uh oh!

hjortholm commented Jan 29, 2025

Uh oh!

ngxson commented Jan 27, 2025 •

edited

Loading

0xCA commented Jan 28, 2025 •

edited

Loading

hjortholm commented Jan 29, 2025 •

edited

Loading