airstation:llama.cpp ic$ git rev-parse HEAD
952d03dbead16e4dbdd1d3458486340673cc2465
airstation:llama.cpp ic$ echo ; awk '(NR>=4341 && NR<=4382 ){print NR " " $0}' llama.cpp

4341         // for now, only BPE models have pre-tokenizers
4342         if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
4343             if (tokenizer_pre.empty()) {
4344                 LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
4345                 LLAMA_LOG_WARN("%s: \n", __func__);
4346                 LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
4347                 LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
4348                 LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
4349                 LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
4350                 LLAMA_LOG_WARN("%s: \n", __func__);
4351                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4352             } else if (
4353                     tokenizer_pre == "default") {
4354                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4355             } else if (
4356                     tokenizer_pre == "llama3" ||
4357                     tokenizer_pre == "llama-v3" ||
4358                     tokenizer_pre == "llama-bpe") {
4359                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
4360             } else if (
4361                     tokenizer_pre == "deepseek-llm") {
4362                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
4363             } else if (
4364                     tokenizer_pre == "deepseek-coder") {
4365                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
4366             } else if (
4367                     tokenizer_pre == "falcon") {
4368                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
4369             } else if (
4370                     tokenizer_pre == "mpt") {
4371                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
4372             } else if (
4373                     tokenizer_pre == "starcoder") {
4374                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
4375             } else if (
4376                     tokenizer_pre == "gpt-2") {
4377                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
4378             } else {
4379                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
4380             }
4381         } else {
4382             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
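
For context: tokenizer_pre is the string llama.cpp reads from the GGUF metadata key tokenizer.ggml.pre, and the chain above is a plain string-to-enum dispatch over it. A minimal table-driven sketch of the same mapping follows; the enum mirror and the helper name pre_type_from_name are stand-ins for illustration, not the code llama.cpp actually ships.

    #include <map>
    #include <stdexcept>
    #include <string>

    // Stand-in mirror of the pre-tokenizer enum values seen in the dump above.
    enum llama_vocab_pre_type {
        LLAMA_VOCAB_PRE_TYPE_DEFAULT,
        LLAMA_VOCAB_PRE_TYPE_LLAMA3,
        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM,
        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER,
        LLAMA_VOCAB_PRE_TYPE_FALCON,
        LLAMA_VOCAB_PRE_TYPE_MPT,
        LLAMA_VOCAB_PRE_TYPE_STARCODER,
        LLAMA_VOCAB_PRE_TYPE_GPT2,
    };

    // Sketch: the same name -> enum dispatch as the if/else chain, as a
    // lookup table. Aliases (llama3 / llama-v3 / llama-bpe) share one value.
    static llama_vocab_pre_type pre_type_from_name(const std::string & name) {
        static const std::map<std::string, llama_vocab_pre_type> table = {
            { "default",        LLAMA_VOCAB_PRE_TYPE_DEFAULT        },
            { "llama3",         LLAMA_VOCAB_PRE_TYPE_LLAMA3         },
            { "llama-v3",       LLAMA_VOCAB_PRE_TYPE_LLAMA3         },
            { "llama-bpe",      LLAMA_VOCAB_PRE_TYPE_LLAMA3         },
            { "deepseek-llm",   LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   },
            { "deepseek-coder", LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER },
            { "falcon",         LLAMA_VOCAB_PRE_TYPE_FALCON         },
            { "mpt",            LLAMA_VOCAB_PRE_TYPE_MPT            },
            { "starcoder",      LLAMA_VOCAB_PRE_TYPE_STARCODER      },
            { "gpt-2",          LLAMA_VOCAB_PRE_TYPE_GPT2           },
        };
        auto it = table.find(name);
        if (it == table.end()) {
            // Same hard failure as line 4379: an unrecognized name aborts
            // loading instead of silently falling back to a wrong splitter.
            throw std::runtime_error("unknown pre-tokenizer type: '" + name + "'");
        }
        return it->second;
    }

Note the asymmetry in the real code: an unknown name throws at line 4379, but an empty name (the missing tokenizer.ggml.pre case) only warns and falls back to the default pre-tokenizer, which is what the GENERATION QUALITY WILL BE DEGRADED banner is about.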