Last active
September 14, 2025 20:56
-
-
Save avatsaev/dc302228e6628b3099cbafab80ec8998 to your computer and use it in GitHub Desktop.
My llamaswap config
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| logLevel: debug | |
| macros: | |
| #################### LARGE MODELS ############################################################## | |
| "Qwen3-30B-A3B-Instruct-2507": > | |
| llama-server \ | |
| --api-key qwen \ | |
| --port ${PORT} \ | |
| -ngl 80 \ | |
| --ctx-size 14000 \ | |
| --temp 0.7 \ | |
| --top-p 0.8 \ | |
| --top-k 20 \ | |
| --min-p 0 \ | |
| --repeat-penalty 1.05 \ | |
| --no-webui \ | |
| --timeout 300 \ | |
| --flash-attn on \ | |
| --jinja \ | |
| --alias Qwen3-instruct \ | |
| -m /home/avatsaev/models/qwen/Qwen3-30B-A3B-Instruct-2507-GGUF/Qwen3-30B-A3B-Instruct-2507-Q5_K_S.gguf | |
| "Qwen3-30B-A3B-Instruct-2507-100k": > | |
| llama-server \ | |
| --api-key qwen \ | |
| --port ${PORT} \ | |
| -ngl 80 \ | |
| --ctx-size 110000 \ | |
| --temp 0.7 \ | |
| --top-p 0.8 \ | |
| --top-k 20 \ | |
| --min-p 0 \ | |
| --cache-type-k q8_0 \ | |
| --cache-type-v q8_0 \ | |
| --repeat-penalty 1.05 \ | |
| --no-webui \ | |
| --timeout 300 \ | |
| --flash-attn on \ | |
| --jinja \ | |
| --alias Qwen3-instruct-100k \ | |
| -m /home/avatsaev/models/qwen/Qwen3-30B-A3B-Instruct-2507-GGUF/Qwen3-30B-A3B-Instruct-2507-UD-Q4_K_XL.gguf | |
| "Qwen3-30B-A3B-Instruct-parallel": > | |
| llama-server \ | |
| --api-key qwen \ | |
| --port ${PORT} \ | |
| -ngl 80 \ | |
| --ctx-size 110000 \ | |
| --temp 0.7 \ | |
| --top-p 0.8 \ | |
| --top-k 20 \ | |
| --min-p 0 \ | |
| --repeat-penalty 1.05 \ | |
| --cache-type-k q8_0 \ | |
| --cache-type-v q8_0 \ | |
| --no-webui \ | |
| --timeout 300 \ | |
| --flash-attn on \ | |
| --jinja \ | |
| --parallel 10 \ | |
| --cont-batching \ | |
| --alias Qwen3-instruct-parallel \ | |
| -m /home/avatsaev/models/qwen/Qwen3-30B-A3B-Instruct-2507-GGUF/Qwen3-30B-A3B-Instruct-2507-UD-Q4_K_XL.gguf | |
| "Qwen3-Coder-30B-A3B-Instruct": > | |
| llama-server \ | |
| --api-key qwen \ | |
| --port ${PORT} \ | |
| -ngl 80 \ | |
| --ctx-size 110000 \ | |
| --temp 0.7 \ | |
| --top-p 0.8 \ | |
| --top-k 20 \ | |
| --min-p 0 \ | |
| --repeat-penalty 1.05 \ | |
| --cache-type-k q8_0 \ | |
| --cache-type-v q8_0 \ | |
| --no-webui \ | |
| --timeout 300 \ | |
| --flash-attn on \ | |
| --alias Qwen3-coder-instruct \ | |
| --jinja \ | |
| -m /home/avatsaev/models/qwen/Qwen3-Coder-30B-A3B-Instruct-GGUF/Qwen3-Coder-30B-A3B-Instruct-UD-Q4_K_XL.gguf | |
| ################ VISION MODELS ############################################### | |
| "Gemma-3-mini-Omni": > | |
| llama-server \ | |
| --api-key qwen \ | |
| --port ${PORT} \ | |
| -ngl 80 \ | |
| --top-p 0.95 \ | |
| --top-k 64 \ | |
| --no-webui \ | |
| --timeout 300 \ | |
| --flash-attn on \ | |
| --alias Gemma-3-mini-Omni \ | |
| --mmproj /home/avatsaev/models/google/gemma-3-27b-it-qat-q4_0-gguf/mmproj-model-f16-27B.gguf \ | |
| -m /home/avatsaev/models/google/gemma-3-27b-it-qat-q4_0-gguf/gemma-3-27b-it-q4_0.gguf | |
| ################ FAST MODELS #################################################################### | |
| "Qwen3-4B-Instruct-2507": > | |
| llama-server \ | |
| --api-key qwen \ | |
| --port ${PORT} \ | |
| -ngl 80 \ | |
| --ctx-size 120000 \ | |
| --temp 0.7 \ | |
| --top-p 0.8 \ | |
| --top-k 20 \ | |
| --min-p 0 \ | |
| --repeat-penalty 1.05 \ | |
| --no-webui \ | |
| --timeout 300 \ | |
| --flash-attn on \ | |
| --alias Qwen3-mini \ | |
| -m /home/avatsaev/models/qwen/Qwen3-4B-Instruct-2507-GGUF/Qwen3-4B-Instruct-2507-UD-Q8_K_XL.gguf | |
| "Qwen3-4B-Instruct-2507-230k": > | |
| llama-server \ | |
| --api-key qwen \ | |
| --port ${PORT} \ | |
| -ngl 80 \ | |
| --ctx-size 230000 \ | |
| --temp 0.7 \ | |
| --top-p 0.8 \ | |
| --top-k 20 \ | |
| --min-p 0 \ | |
| --repeat-penalty 1.05 \ | |
| --no-webui \ | |
| --cache-type-k q8_0 \ | |
| --cache-type-v q8_0 \ | |
| --timeout 300 \ | |
| --flash-attn on \ | |
| --alias Qwen3-small-230k \ | |
| -m /home/avatsaev/models/qwen/Qwen3-4B-Instruct-2507-GGUF/Qwen3-4B-Instruct-2507-UD-Q8_K_XL.gguf | |
| "Qwen3-4B-Instruct-2507-parallel": > | |
| llama-server \ | |
| --api-key qwen \ | |
| --port ${PORT} \ | |
| -ngl 80 \ | |
| --ctx-size 230000 \ | |
| --temp 0.7 \ | |
| --top-p 0.8 \ | |
| --top-k 20 \ | |
| --min-p 0 \ | |
| --repeat-penalty 1.05 \ | |
| --no-webui \ | |
| --cache-type-k q8_0 \ | |
| --cache-type-v q8_0 \ | |
| --timeout 300 \ | |
| --flash-attn on \ | |
| --parallel 10 \ | |
| --cont-batching \ | |
| --alias Qwen3-small-parallel \ | |
| -m /home/avatsaev/models/qwen/Qwen3-4B-Instruct-2507-GGUF/Qwen3-4B-Instruct-2507-UD-Q8_K_XL.gguf | |
| ################ VERY FAST MODELS #################################################################### | |
| "Qwen3-4B-Instruct-2507-q4": > | |
| llama-server \ | |
| --api-key qwen \ | |
| --port ${PORT} \ | |
| -ngl 80 \ | |
| --ctx-size 140000 \ | |
| --temp 0.7 \ | |
| --top-p 0.8 \ | |
| --top-k 20 \ | |
| --min-p 0 \ | |
| --repeat-penalty 1.05 \ | |
| --no-webui \ | |
| --timeout 300 \ | |
| --flash-attn on \ | |
| --alias Qwen3-nano \ | |
| -m /home/avatsaev/models/qwen/Qwen3-4B-Instruct-2507-GGUF/Qwen3-4B-Instruct-2507-UD-Q4_K_XL.gguf | |
| "Qwen3-4B-Instruct-2507-q4-260k": > | |
| llama-server \ | |
| --api-key qwen \ | |
| --port ${PORT} \ | |
| -ngl 80 \ | |
| --ctx-size 260000 \ | |
| --temp 0.7 \ | |
| --top-p 0.8 \ | |
| --top-k 20 \ | |
| --cache-type-k q8_0 \ | |
| --cache-type-v q8_0 \ | |
| --min-p 0 \ | |
| --repeat-penalty 1.05 \ | |
| --no-webui \ | |
| --timeout 300 \ | |
| --flash-attn on \ | |
| --alias Qwen3-nano-260k \ | |
| -m /home/avatsaev/models/qwen/Qwen3-4B-Instruct-2507-GGUF/Qwen3-4B-Instruct-2507-UD-Q4_K_XL.gguf | |
| "Qwen3-4B-Instruct-2507-q4-parallel": > | |
| llama-server \ | |
| --api-key qwen \ | |
| --port ${PORT} \ | |
| -ngl 80 \ | |
| --ctx-size 260000 \ | |
| --temp 0.7 \ | |
| --top-p 0.8 \ | |
| --top-k 20 \ | |
| --cache-type-k q8_0 \ | |
| --cache-type-v q8_0 \ | |
| --min-p 0 \ | |
| --repeat-penalty 1.05 \ | |
| --no-webui \ | |
| --timeout 300 \ | |
| --parallel 10 \ | |
| --flash-attn on \ | |
| --cont-batching \ | |
| --alias Qwen3-nano-parallel \ | |
| -m /home/avatsaev/models/qwen/Qwen3-4B-Instruct-2507-GGUF/Qwen3-4B-Instruct-2507-UD-Q4_K_XL.gguf | |
| ################ EMBEDDING MODELS ################################################################## | |
| "qwen3-embedding-small": > | |
| llama-server \ | |
| --model /home/avatsaev/models/qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-f16.gguf \ | |
| --embedding \ | |
| -ngl 80 \ | |
| --cont-batching \ | |
| --alias qwen3-embedding-small \ | |
| --port ${PORT} \ | |
| --mlock \ | |
| --no-webui \ | |
| --timeout 300 | |
| "qwen3-embedding-large": > | |
| llama-server \ | |
| --model /home/avatsaev/models/qwen/Qwen3-Embedding-8B-GGUF/Qwen3-Embedding-8B-Q8_0.gguf \ | |
| --embedding \ | |
| -ngl 80 \ | |
| --cont-batching \ | |
| --alias qwen3-embedding-large \ | |
| --port ${PORT} \ | |
| --mlock \ | |
| --no-webui \ | |
| --timeout 300 | |
| ########## RERANKER MODELS ################################################################### | |
| "qwen3-reranker-small": > | |
| llama-server \ | |
| --model /home/avatsaev/models/qwen/Qwen3-Reranker-0.6B-GGUF/Qwen3-Reranker-0.6B-Q8_0.gguf \ | |
| --reranking \ | |
| --port ${PORT} \ | |
| -ngl 80 \ | |
| --cont-batching \ | |
| --no-webui \ | |
| --timeout 300 \ | |
| --flash-attn on | |
| "qwen3-reranker-large": > | |
| llama-server \ | |
| --model /home/avatsaev/models/qwen/Qwen3-Reranker-4B-GGUF/Qwen3-Reranker-4B.Q8_0.gguf \ | |
| --reranking \ | |
| --port ${PORT} \ | |
| -ngl 80 \ | |
| --cont-batching \ | |
| --no-webui \ | |
| --timeout 300 \ | |
| --flash-attn on | |
| "bge-reranker-v2-m3": > | |
| llama-server \ | |
| --model /home/avatsaev/models/BAAI/bge-reranker-v2-m3-Q8_0-GGUF/bge-reranker-v2-m3-q8_0.gguf \ | |
| --reranking \ | |
| --port ${PORT} \ | |
| --threads 8 \ | |
| --threads-http 16 \ | |
| -ngl 80 \ | |
| --batch-size 64 \ | |
| --ubatch-size 32 \ | |
| --mlock \ | |
| -c 65536 \ | |
| --cont-batching \ | |
| --no-webui \ | |
| --timeout 300 \ | |
| --flash-attn on | |
| ############################################################################################ | |
| # | |
| # | |
| # | |
| # | |
| # | |
| # | |
| # | |
| # | |
| # | |
| # | |
| # | |
| # | |
| # | |
| # | |
| # | |
| # | |
| ############################# ACTIVE MODELS ################################################## | |
| models: | |
| ################################## LARGE Models ############################## | |
| "Qwen3": | |
| cmd: | | |
| ${Qwen3-30B-A3B-Instruct-2507} | |
| ttl: 3600 | |
| "Qwen3-100k": | |
| cmd: | | |
| ${Qwen3-30B-A3B-Instruct-2507-100k} | |
| ttl: 3600 | |
| "Qwen3-parallel": | |
| cmd: | | |
| ${Qwen3-30B-A3B-Instruct-parallel} | |
| ttl: 3600 | |
| "Qwen3-coder": | |
| cmd: | | |
| ${Qwen3-Coder-30B-A3B-Instruct} | |
| ttl: 3600 | |
| ########### FAST models ############################### | |
| "Qwen3-mini": | |
| cmd: | | |
| ${Qwen3-4B-Instruct-2507} | |
| ttl: 3600 | |
| "Qwen3-mini-230k": | |
| cmd: | | |
| ${Qwen3-4B-Instruct-2507-230k} | |
| ttl: 3600 | |
| "Qwen3-mini-parallel": | |
| cmd: | | |
| ${Qwen3-4B-Instruct-2507-parallel} | |
| ttl: 3600 | |
| ############ VERY FAST MODELS ########################### | |
| "Qwen3-nano": | |
| cmd: | | |
| ${Qwen3-4B-Instruct-2507-q4} | |
| ttl: 3600 | |
| "Qwen3-nano-260k": | |
| cmd: | | |
| ${Qwen3-4B-Instruct-2507-q4-260k} | |
| ttl: 3600 | |
| "Qwen3-nano-parallel": | |
| cmd: | | |
| ${Qwen3-4B-Instruct-2507-q4-parallel} | |
| ttl: 3600 | |
| ########################### VISION Models ##################### | |
| "Gemma-3-mini-Omni": | |
| cmd: | | |
| ${Gemma-3-mini-Omni} | |
| ttl: 3600 | |
| ######################## Embedding and reranking ################################# | |
| "qwen3-embedding-small": | |
| cmd: | | |
| ${qwen3-embedding-small} | |
| ttl: 3600 | |
| "qwen3-embedding-large": | |
| cmd: | | |
| ${qwen3-embedding-large} | |
| ttl: 3600 | |
| "qwen3-reranker-small": | |
| cmd: | | |
| ${qwen3-reranker-small} | |
| ttl: 3600 | |
| "qwen3-reranker-large": | |
| cmd: | | |
| ${qwen3-reranker-large} | |
| ttl: 3600 | |
| "bge-reranker-v2-m3": | |
| cmd: | | |
| ${bge-reranker-v2-m3} | |
| ttl: 3600 | |
| ################################################################################### | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment