test_ds_v31.md

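Skip SDPA: `fused_scaled_dot_product_attention` is on the blocklist (scale method `ACT_MAXABS_HW_WEIGHTS_PCS_MAXABS_POW2`):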

```json
{
    "mode": "QUANTIZE",
    "observer": "maxabs",
    "scale_method": "ACT_MAXABS_HW_WEIGHTS_PCS_MAXABS_POW2",
    "scale_format": "const",
    "allowlist": {
        "types": [],
        "names": []
    },
    "blocklist": {
        "types": [],
        "names": [
            "lm_head",
            "mlp\\.gate\\b",
            "latent_cache_k_nodeq",
            "latent_cache_v_nodeq",
            "latent_cache_v",
            "latent_cache_k",
            "matmul_qk",
            "matmul_av",
            "batch2block_matmul",
            "block2batch_matmul",
            "fused_scaled_dot_product_attention"
        ]
    },
    "dump_stats_path": "./scripts/nc_workspace_measure_kvcache/inc_measure_output"
}
```

Power-of-2 activation scales (scale method `ACT_MAXABS_POW2_WEIGHTS_PCS_MAXABS_POW2`) with `INC_FORCE_NAIVE_SCALING=0`:

```bash
export INC_FORCE_NAIVE_SCALING=0
```

```json
{
    "mode": "QUANTIZE",
    "observer": "maxabs",
    "scale_method": "ACT_MAXABS_POW2_WEIGHTS_PCS_MAXABS_POW2",
    "scale_format": "const",
    "allowlist": {
        "types": [],
        "names": []
    },
    "blocklist": {
        "types": [],
        "names": [
            "lm_head",
            "mlp\\.gate\\b",
            "latent_cache_k_nodeq",
            "latent_cache_v_nodeq",
            "latent_cache_v",
            "latent_cache_k",
            "matmul_qk",
            "matmul_av",
            "batch2block_matmul",
            "block2batch_matmul",
            "fused_scaled_dot_product_attention"
        ]
    },
    "dump_stats_path": "./scripts/nc_workspace_measure_kvcache/inc_measure_output"
}
```

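Calibration on all Pile prompts, combined with the first or second quantization config above (TODO: confirm what "1 or 2" refers to):

```bash
bash scripts/run_inc_calib.sh --model /path/to/converted/model/ --nprompts 100000
```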

yiliu30/test_ds_v31.md