Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save ink-splatters/d0899a4749261b7dfd4818f40ff062b4 to your computer and use it in GitHub Desktop.

Revisions

  1. ink-splatters renamed this gist May 5, 2024. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  2. ink-splatters created this gist May 5, 2024.
    46 changes: 46 additions & 0 deletions gistfile1.txt
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,46 @@
    airstation:llama.cpp ic$ git rev-parse HEAD
    952d03dbead16e4dbdd1d3458486340673cc2465
    airstation:llama.cpp ic$ echo ; awk '(NR>=4341 && NR<=4382 ){print NR " " $0}' llama.cpp

    4341 // for now, only BPE models have pre-tokenizers
    4342 if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
    4343 if (tokenizer_pre.empty()) {
    4344 LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
    4345 LLAMA_LOG_WARN("%s: \n", __func__);
    4346 LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
    4347 LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
    4348 LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
    4349 LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
    4350 LLAMA_LOG_WARN("%s: \n", __func__);
    4351 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
    4352 } else if (
    4353 tokenizer_pre == "default") {
    4354 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
    4355 } else if (
    4356 tokenizer_pre == "llama3" ||
    4357 tokenizer_pre == "llama-v3" ||
    4358 tokenizer_pre == "llama-bpe") {
    4359 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
    4360 } else if (
    4361 tokenizer_pre == "deepseek-llm") {
    4362 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
    4363 } else if (
    4364 tokenizer_pre == "deepseek-coder") {
    4365 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
    4366 } else if (
    4367 tokenizer_pre == "falcon") {
    4368 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
    4369 } else if (
    4370 tokenizer_pre == "mpt") {
    4371 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
    4372 } else if (
    4373 tokenizer_pre == "starcoder") {
    4374 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
    4375 } else if (
    4376 tokenizer_pre == "gpt-2") {
    4377 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
    4378 } else {
    4379 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
    4380 }
    4381 } else {
    4382 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;