Patch for ktransformers to support NVIDIA V100 and T4, tested with v0.2.1.
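V100 (compute capability 7.0, sm_70) and T4 (7.5, sm_75) predate Ampere, so they have no native bf16, cannot run FlashAttention-2, and cannot use the Marlin quantized-GEMM kernels; every hunk below works around one of those three gaps. A quick way to tell whether a machine needs the patch (plain PyTorch, not part of the diff):

    import torch

    # GPUs below compute capability 8.0 (V100 is sm_70, T4 is sm_75) lack
    # bf16, FlashAttention-2, and Marlin support, so they need this patch.
    for i in range(torch.cuda.device_count()):
        major, minor = torch.cuda.get_device_capability(i)
        verdict = "needs patch" if (major, minor) < (8, 0) else "stock ktransformers is fine"
        print(f"cuda:{i} {torch.cuda.get_device_name(i)} (sm_{major}{minor}): {verdict}")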
From 1d3f2ede5adebbd3a6fca0afa083545a68112574 Mon Sep 17 00:00:00 2001
From: Your Name <[email protected]>
Date: Thu, 27 Feb 2025 23:35:12 +0800
Subject: [PATCH] support v100

---
 Dockerfile                                    | 24 +++++++--------
 ktransformers/local_chat.py                   | 10 +++----
 ktransformers/operators/attention.py          | 29 ++++++++++++++-----
 .../DeepSeek-V2-Chat-multi-gpu-4.yaml         | 10 +++----
 .../DeepSeek-V2-Chat-multi-gpu.yaml           |  6 ++--
 .../optimize_rules/DeepSeek-V2-Chat.yaml      |  4 +--
 .../DeepSeek-V2-Lite-Chat-multi-gpu.yaml      |  6 ++--
 .../optimize_rules/DeepSeek-V2-Lite-Chat.yaml |  6 ++--
 .../DeepSeek-V3-Chat-multi-gpu-4.yaml         | 10 +++----
 .../DeepSeek-V3-Chat-multi-gpu-8.yaml         | 18 ++++++------
 .../DeepSeek-V3-Chat-multi-gpu-marlin.yaml    |  6 ++--
 .../DeepSeek-V3-Chat-multi-gpu.yaml           |  6 ++--
 .../optimize_rules/DeepSeek-V3-Chat.yaml      |  4 +--
 .../optimize/optimize_rules/Mixtral.yaml      |  4 +--
 .../optimize_rules/Moonlight-16B-A3B.yaml     |  4 +--
 .../Qwen2-57B-A14B-Instruct-multi-gpu.yaml    |  6 ++--
 .../Qwen2-57B-A14B-Instruct.yaml              |  4 +--
 17 files changed, 85 insertions(+), 72 deletions(-)

diff --git a/ktransformers/local_chat.py b/ktransformers/local_chat.py
index 7cbac7c..e4f5660 100644
--- a/ktransformers/local_chat.py
+++ b/ktransformers/local_chat.py
@@ -81,17 +81,17 @@ def local_chat(
         print("using custom modeling_xxx.py.")
         if (
             "Qwen2Moe" in config.architectures[0]
-        ):  # Qwen2Moe must use flash_attention_2 to avoid overflow.
-            config._attn_implementation = "flash_attention_2"
+        ):  # Qwen2Moe must use eager to avoid overflow.
+            config._attn_implementation = "eager"
         if "Llama" in config.architectures[0]:
             config._attn_implementation = "eager"
         if "Mixtral" in config.architectures[0]:
-            config._attn_implementation = "flash_attention_2"
+            config._attn_implementation = "eager"
         model = custom_models[config.architectures[0]](config)
     else:
         model = AutoModelForCausalLM.from_config(
-            config, trust_remote_code=True, attn_implementation="flash_attention_2"
+            config, trust_remote_code=True, attn_implementation="eager"
         )
     if optimize_config_path is None:
@@ -180,4 +180,4 @@ def local_chat(
 if __name__ == "__main__":
-    fire.Fire(local_chat)
\ No newline at end of file
+    fire.Fire(local_chat)
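The local_chat.py hunks above replace every `flash_attention_2` selection with `eager`, the plain PyTorch attention path in transformers, because the flash-attn kernels refuse to run on pre-Ampere cards. If one build has to serve both old and new GPUs, the choice could instead be made conditional; a minimal sketch (the helper name is hypothetical, not part of ktransformers):

    import torch

    def pick_attn_implementation() -> str:
        # Hypothetical helper: FlashAttention-2 needs compute capability >= 8.0,
        # so fall back to the eager path on V100 (sm_70) and T4 (sm_75).
        if torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 0):
            return "flash_attention_2"
        return "eager"

    # usage in local_chat.py would then be:
    # config._attn_implementation = pick_attn_implementation()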
diff --git a/ktransformers/operators/attention.py b/ktransformers/operators/attention.py
index 35c8093..0b84350 100644
--- a/ktransformers/operators/attention.py
+++ b/ktransformers/operators/attention.py
@@ -272,6 +272,13 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
             print("position_ids", torch.isnan(position_ids).any())
             """
+            original_dtype = query_states.dtype
+            target_dtype = torch.half
+            query_states = query_states.to(target_dtype)
+            compressed_kv_with_k_pe = compressed_kv_with_k_pe.to(target_dtype)
+            compressed_kv = compressed_kv.to(target_dtype)
+            attn_output = attn_output.to(target_dtype)
+
             # flash attn doesn't support head_dim bigger than 256
             # use triton attention kernel adapted from vLLM and SGLang for MQA
             decode_attention_fwd_grouped(query_states, compressed_kv_with_k_pe, compressed_kv, attn_output,
@@ -280,6 +287,7 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
                                          4, #num_kv_splits # follow vLLM, fix it TODO
                                          self.softmax_scale,
                                          past_key_value.page_size)
+            attn_output = attn_output.to(original_dtype)
             # attn_output [bsz, q_len, self.num_heads, self.kv_lora_rank]
             # out_absorb [self.num_heads, self.v_head_dim, self.kv_lora_rank]
@@ -321,13 +329,20 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
             value_states = value_states.view(bsz, kv_seq_len, self.num_heads, self.v_head_dim)
             value_states_padded = torch.nn.functional.pad(value_states, [0, query_states.shape[-1] - value_states.shape[-1]], value=0)
-            attn_output = flash_attn_func(
-                query_states,
-                key_states,
-                value_states_padded,
-                softmax_scale=self.softmax_scale,
-                causal=True,
-            )
+            # attn_output = flash_attn_func(
+            #     query_states,
+            #     key_states,
+            #     value_states_padded,
+            #     softmax_scale=self.softmax_scale,
+            #     causal=True,
+            # )
+            attn_output = F.scaled_dot_product_attention(
+                query_states.transpose(1, 2),
+                key_states.transpose(1, 2),
+                value_states_padded.transpose(1, 2),
+                scale=self.softmax_scale,
+                is_causal=True
+            ).transpose(1, 2)
             if self.q_head_dim != self.v_head_dim:
                 attn_output = attn_output[:, :, :, : self.v_head_dim]
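The attention.py changes do two independent things. First, the decode path casts `query_states`, the compressed KV tensors, and the output buffer to `torch.half` before calling the grouped-decode Triton kernel, then casts the result back: pre-Ampere GPUs have no native bf16, so fp16 is the only half-precision dtype the kernel can use there. Second, the prefill path swaps `flash_attn_func` for PyTorch's built-in `F.scaled_dot_product_attention`. The one subtlety in that swap is layout: flash-attn takes tensors as `(batch, seq, heads, head_dim)` while SDPA expects `(batch, heads, seq, head_dim)`, hence the `transpose(1, 2)` on the way in and back out. A self-contained sketch of the substitution (shapes are illustrative, not from the patch):

    import torch
    import torch.nn.functional as F

    bsz, q_len, num_heads, head_dim = 1, 16, 8, 64
    softmax_scale = head_dim ** -0.5

    # flash_attn_func layout: (batch, seq, heads, head_dim)
    q = torch.randn(bsz, q_len, num_heads, head_dim)
    k = torch.randn_like(q)
    v = torch.randn_like(q)

    # SDPA layout: (batch, heads, seq, head_dim), so transpose in and back out.
    out = F.scaled_dot_product_attention(
        q.transpose(1, 2),
        k.transpose(1, 2),
        v.transpose(1, 2),
        scale=softmax_scale,  # plays the role of flash-attn's softmax_scale
        is_causal=True,       # plays the role of flash-attn's causal=True
    ).transpose(1, 2)

    assert out.shape == (bsz, q_len, num_heads, head_dim)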
diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml
index 66a420a..173a6e0 100644
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml
@@ -47,7 +47,7 @@
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
     name: "^model\\.layers\\.([2][0-9]|[1][5-9])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression
@@ -57,7 +57,7 @@
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
     name: "^model\\.layers\\.([3][0-9]|[4][0-4])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression
@@ -67,7 +67,7 @@
     kwargs:
       generate_device: "cuda:2"
       prefill_device: "cuda:2"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
     name: "^model\\.layers\\.([5][0-9]|[4][5-9])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression
@@ -77,7 +77,7 @@
     kwargs:
       generate_device: "cuda:3"
       prefill_device: "cuda:3"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
@@ -228,7 +228,7 @@
     kwargs:
       generate_device: "cuda:3"
       prefill_device: "cuda:3"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
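From here on, every hunk is the same mechanical substitution repeated across the optimize-rules files: `KLinearMarlin`, the 4-bit Marlin GEMM wrapper that requires compute capability 8.0 or newer, becomes plain `KLinearTorch` for the generate path (`prefill_op` was already `KLinearTorch`). If you maintain your own rule files, a throwaway script can make the identical edit instead of hand-applying the remaining hunks; a sketch, assuming it is run from the repository root:

    from pathlib import Path

    # Equivalent of the remaining YAML hunks: point every generate path at the
    # plain torch linear, since Marlin kernels will not load on sm_70/sm_75.
    rules = Path("ktransformers/optimize/optimize_rules")
    for path in sorted(rules.glob("*.yaml")):
        text = path.read_text()
        patched = text.replace('generate_op: "KLinearMarlin"',
                               'generate_op: "KLinearTorch"')
        if patched != text:
            path.write_text(patched)
            print(f"patched {path.name}")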
diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml
index f409376..63b3ffa 100644
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml
@@ -31,7 +31,7 @@
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
@@ -42,7 +42,7 @@
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
@@ -125,7 +125,7 @@
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml
index 7f3e44e..85a3aeb 100644
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml
@@ -13,7 +13,7 @@
     kwargs:
       generate_device: "cuda"
       prefill_device: "cuda"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
@@ -24,7 +24,7 @@
     kwargs:
       generate_device: "cuda"
       prefill_device: "cuda"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-multi-gpu.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-multi-gpu.yaml
index 158892d..bb7891f 100644
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-multi-gpu.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-multi-gpu.yaml
@@ -31,7 +31,7 @@
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
@@ -42,7 +42,7 @@
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
@@ -125,7 +125,7 @@
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat.yaml
index 7f3e44e..d2c92d0 100644
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat.yaml
@@ -13,7 +13,7 @@
     kwargs:
       generate_device: "cuda"
       prefill_device: "cuda"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
@@ -24,7 +24,7 @@
     kwargs:
       generate_device: "cuda"
       prefill_device: "cuda"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
@@ -65,4 +65,4 @@
     class: "default"
     kwargs:
       generate_device: "cpu"
-      prefill_device: "cpu"
\ No newline at end of file
+      prefill_device: "cpu"
diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml
index ea75b30..25e6d05 100644
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml
@@ -59,7 +59,7 @@
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 # GPU 1: layers 15–29
@@ -71,7 +71,7 @@
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 # GPU 2: layers 30–44
@@ -83,7 +83,7 @@
     kwargs:
       generate_device: "cuda:2"
       prefill_device: "cuda:2"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 # GPU 3: layers 45–60
@@ -95,7 +95,7 @@
     kwargs:
       generate_device: "cuda:3"
       prefill_device: "cuda:3"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 # === MLP (MoE) Replacement ===
@@ -375,7 +375,7 @@
     kwargs:
       generate_device: "cuda:3"
       prefill_device: "cuda:3"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 # For final modules (model.norm), ensure they are on GPU 3 (as in your original config)
diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-8.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-8.yaml
index b00d2b4..e746680 100644
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-8.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-8.yaml
@@ -100,7 +100,7 @@
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 # GPU 1: layers 8–15
@@ -112,7 +112,7 @@
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 # GPU 2: layers 16–23
@@ -124,7 +124,7 @@
     kwargs:
       generate_device: "cuda:2"
       prefill_device: "cuda:2"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 # GPU 3: layers 24–31
@@ -136,7 +136,7 @@
     kwargs:
       generate_device: "cuda:3"
       prefill_device: "cuda:3"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 # GPU 4: layers 32–39
@@ -148,7 +148,7 @@
     kwargs:
       generate_device: "cuda:4"
       prefill_device: "cuda:4"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 # GPU 5: layers 40–47
@@ -160,7 +160,7 @@
     kwargs:
       generate_device: "cuda:5"
       prefill_device: "cuda:5"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 # GPU 6: layers 48–55
@@ -172,7 +172,7 @@
     kwargs:
       generate_device: "cuda:6"
       prefill_device: "cuda:6"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 # GPU 7: layers 56–63
@@ -184,7 +184,7 @@
     kwargs:
       generate_device: "cuda:7"
       prefill_device: "cuda:7"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
@@ -721,7 +721,7 @@
     kwargs:
       generate_device: "cuda:7"
       prefill_device: "cuda:7"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 # For final modules (model.norm), ensure they are on GPU 7 (as in your original config)
diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml
index e04c6ce..0fca38c 100644
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml
@@ -31,7 +31,7 @@
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
@@ -42,7 +42,7 @@
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
@@ -160,7 +160,7 @@
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml
index 50e282d..88174ea 100644
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml
@@ -31,7 +31,7 @@
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
@@ -42,7 +42,7 @@
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
@@ -142,7 +142,7 @@
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml
index d28e016..f0f8718 100644
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml
@@ -14,7 +14,7 @@
     kwargs:
       generate_device: "cuda"
       prefill_device: "cuda"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
@@ -25,7 +25,7 @@
     kwargs:
       generate_device: "cuda"
       prefill_device: "cuda"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
     name: "^model\\.layers\\..*\\.mlp$"
diff --git a/ktransformers/optimize/optimize_rules/Mixtral.yaml b/ktransformers/optimize/optimize_rules/Mixtral.yaml
index 80a346a..a8705ac 100644
--- a/ktransformers/optimize/optimize_rules/Mixtral.yaml
+++ b/ktransformers/optimize/optimize_rules/Mixtral.yaml
@@ -13,7 +13,7 @@
     kwargs:
       generate_device: "cuda"
       prefill_device: "cuda"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
     name: "^lm_head"
@@ -23,7 +23,7 @@
     kwargs:
       generate_device: "cuda"
       prefill_device: "cuda"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
     name: "^model\\.layers\\..*\\.block_sparse_moe$"
diff --git a/ktransformers/optimize/optimize_rules/Moonlight-16B-A3B.yaml b/ktransformers/optimize/optimize_rules/Moonlight-16B-A3B.yaml
index 6cea246..dc0fd6a 100644
--- a/ktransformers/optimize/optimize_rules/Moonlight-16B-A3B.yaml
+++ b/ktransformers/optimize/optimize_rules/Moonlight-16B-A3B.yaml
@@ -14,7 +14,7 @@
     kwargs:
       generate_device: "cuda"
       prefill_device: "cuda"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
@@ -25,7 +25,7 @@
     kwargs:
       generate_device: "cuda"
       prefill_device: "cuda"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
     name: "^model\\.layers\\..*\\.mlp$"
diff --git a/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml b/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml
index da01c82..caba1e1 100644
--- a/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml
+++ b/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml
@@ -14,7 +14,7 @@
     kwargs:
       generate_device: "cuda:0"
       prefill_device: "cuda:0"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
     name: "^model\\.layers\\.([012])\\.mlp$"
@@ -50,7 +50,7 @@
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
     name: "^model\\.layers\\.([12][0-9]|[3-9])\\.mlp$"
@@ -85,7 +85,7 @@
     kwargs:
       generate_device: "cuda:1"
       prefill_device: "cuda:1"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
diff --git a/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml b/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml
index 38e9e73..b12f022 100644
--- a/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml
+++ b/ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml
@@ -13,7 +13,7 @@
     kwargs:
       generate_device: "cuda"
       prefill_device: "cuda"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
     name: "^lm_head"
@@ -23,7 +23,7 @@
     kwargs:
       generate_device: "cuda"
       prefill_device: "cuda"
-      generate_op: "KLinearMarlin"
+      generate_op: "KLinearTorch"
       prefill_op: "KLinearTorch"
 - match:
     name: "^model\\.layers\\..*\\.mlp$"
-- 
2.34.1
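Saved to a file (say `v100-t4.patch`), the diff should apply to a v0.2.1 checkout with `git apply v100-t4.patch` from the repository root; running `git apply --check v100-t4.patch` first confirms the tree matches before anything is modified.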