Skip to content

Instantly share code, notes, and snippets.

@japajoe
Created October 8, 2025 18:29
Show Gist options
  • Save japajoe/5915b098b6871048ed7baed68b25c44f to your computer and use it in GitHub Desktop.
Save japajoe/5915b098b6871048ed7baed68b25c44f to your computer and use it in GitHub Desktop.

Revisions

  1. japajoe created this gist Oct 8, 2025.
    55 changes: 55 additions & 0 deletions utf8.cpp
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,55 @@
    // Checks whether a byte buffer is well-formed UTF-8 and counts its characters.
    //
    // Well-formedness follows RFC 3629 / Unicode Table 3-7: besides the basic
    // lead/continuation bit patterns, overlong encodings, UTF-16 surrogates
    // (U+D800..U+DFFF) and code points above U+10FFFF are rejected.
    //
    // pText      Buffer to validate. A null pointer is only accepted when size is 0.
    // size       Number of bytes to inspect. When 0, the text is treated as
    //            NUL-terminated and its length is measured with strlen.
    // charCount  Incremented once for every character whose first byte is accepted.
    //            It is never reset here, so the caller must initialize it. If the
    //            function returns false, it reflects the characters started before
    //            the error was detected.
    //
    // Returns true if every inspected byte belongs to a complete, well-formed
    // sequence; false otherwise.
    bool isValidUTF8(const char *pText, size_t size, size_t &charCount) {
        // A null buffer can only be valid when it is also empty
        if (pText == nullptr)
            return size == 0;

        if (size == 0)
            size = strlen(pText);

        const uint8_t *pPayload = reinterpret_cast<const uint8_t*>(pText);

        int numBytes = 0;       // Continuation bytes still expected for the current character
        uint8_t minNext = 0x80; // Allowed range for the next continuation byte; only the
        uint8_t maxNext = 0xBF; // first one after certain lead bytes is narrower than 80..BF

        for (size_t i = 0; i < size; ++i) {
            const uint8_t byte = pPayload[i];

            if (numBytes > 0)
            {
                // Check continuation bytes
                if (byte < minNext || byte > maxNext)
                {
                    return false; // Invalid continuation byte
                }
                // Only the first continuation byte is restricted; restore the default range
                minNext = 0x80;
                maxNext = 0xBF;
                numBytes--;
                continue;
            }

            // Determine the number of bytes in the UTF-8 character
            if (byte <= 0x7F)
            {
                // 1-byte character (ASCII)
                charCount++;
                continue;
            }
            else if (byte >= 0xC2 && byte <= 0xDF)
            {
                // 2-byte character (0xC0 and 0xC1 could only encode overlong forms)
                numBytes = 1;
            }
            else if (byte >= 0xE0 && byte <= 0xEF)
            {
                // 3-byte character
                numBytes = 2;
                if (byte == 0xE0)
                    minNext = 0xA0; // Below this the encoding would be overlong
                else if (byte == 0xED)
                    maxNext = 0x9F; // Above this lies the surrogate range U+D800..U+DFFF
            }
            else if (byte >= 0xF0 && byte <= 0xF4)
            {
                // 4-byte character
                numBytes = 3;
                if (byte == 0xF0)
                    minNext = 0x90; // Below this the encoding would be overlong
                else if (byte == 0xF4)
                    maxNext = 0x8F; // Above this the code point exceeds U+10FFFF
            }
            else
            {
                // Invalid first byte (stray continuation byte, 0xC0, 0xC1 or 0xF5..0xFF)
                return false;
            }
            charCount++;
        }

        return numBytes == 0; // Ensure all characters were complete
    }