Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save ink-splatters/d0899a4749261b7dfd4818f40ff062b4 to your computer and use it in GitHub Desktop.

Revisions

  1. ink-splatters renamed this gist May 5, 2024. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  2. ink-splatters created this gist May 5, 2024.
    46 changes: 46 additions & 0 deletions gistfile1.txt
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,46 @@
    airstation:llama.cpp ic$ git rev-parse HEAD
    952d03dbead16e4dbdd1d3458486340673cc2465
    airstation:llama.cpp ic$ echo ; awk '(NR>=4341 && NR<=4382 ){print NR " " $0}' llama.cpp

    4341 // for now, only BPE models have pre-tokenizers
    4342 if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
    4343 if (tokenizer_pre.empty()) {
    4344 LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
    4345 LLAMA_LOG_WARN("%s: \n", __func__);
    4346 LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
    4347 LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
    4348 LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
    4349 LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
    4350 LLAMA_LOG_WARN("%s: \n", __func__);
    4351 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
    4352 } else if (
    4353 tokenizer_pre == "default") {
    4354 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
    4355 } else if (
    4356 tokenizer_pre == "llama3" ||
    4357 tokenizer_pre == "llama-v3" ||
    4358 tokenizer_pre == "llama-bpe") {
    4359 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
    4360 } else if (
    4361 tokenizer_pre == "deepseek-llm") {
    4362 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
    4363 } else if (
    4364 tokenizer_pre == "deepseek-coder") {
    4365 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
    4366 } else if (
    4367 tokenizer_pre == "falcon") {
    4368 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
    4369 } else if (
    4370 tokenizer_pre == "mpt") {
    4371 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
    4372 } else if (
    4373 tokenizer_pre == "starcoder") {
    4374 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
    4375 } else if (
    4376 tokenizer_pre == "gpt-2") {
    4377 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
    4378 } else {
    4379 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
    4380 }
    4381 } else {
    4382 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;