- Skip SDPA (`fused_scaled_dot_product_attention` is in the blocklist)
{
"mode": "QUANTIZE",
"observer": "maxabs",
"scale_method": "ACT_MAXABS_HW_WEIGHTS_PCS_MAXABS_POW2",
"scale_format": "const",
"allowlist": {
"types": [],
"names": []
},
"blocklist": {
"types": [],
"names": [
"lm_head",
"mlp\\.gate\\b",
"latent_cache_k_nodeq",
"latent_cache_v_nodeq",
"latent_cache_v",
"latent_cache_k",
"matmul_qk",
"matmul_av",
"batch2block_matmul",
"block2batch_matmul",
"fused_scaled_dot_product_attention"
]
},
"dump_stats_path": "./scripts/nc_workspace_measure_kvcache/inc_measure_output"
}
- Power-of-2 scales for activations and weights + INC_FORCE_NAIVE_SCALING=0
export INC_FORCE_NAIVE_SCALING=0
{
"mode": "QUANTIZE",
"observer": "maxabs",
"scale_method": "ACT_MAXABS_POW2_WEIGHTS_PCS_MAXABS_POW2",
"scale_format": "const",
"allowlist": {
"types": [],
"names": []
},
"blocklist": {
"types": [],
"names": [
"lm_head",
"mlp\\.gate\\b",
"latent_cache_k_nodeq",
"latent_cache_v_nodeq",
"latent_cache_v",
"latent_cache_k",
"matmul_qk",
"matmul_av",
"batch2block_matmul",
"block2batch_matmul",
"fused_scaled_dot_product_attention"
]
},
"dump_stats_path": "./scripts/nc_workspace_measure_kvcache/inc_measure_output"
}
- Calibration on all of pile, used with config 1 or 2 above
bash scripts/run_inc_calib.sh --model /path/to/converted/model/ --nprompts 100000