Skip to content

Instantly share code, notes, and snippets.

@PAMinerva
Last active January 17, 2025 18:49
Show Gist options
  • Save PAMinerva/8b506f360389b80b878b84f99eefb8a7 to your computer and use it in GitHub Desktop.
Save PAMinerva/8b506f360389b80b878b84f99eefb8a7 to your computer and use it in GitHub Desktop.

Revisions

  1. PAMinerva revised this gist Jan 17, 2025. 1 changed file with 24 additions and 3 deletions.
    27 changes: 24 additions & 3 deletions strconv.cpp
    Original file line number Diff line number Diff line change
    @@ -1,3 +1,6 @@
    #include "strconv.h"


    // Per comprendere meglio il funzionamento di queste funzioni, si consiglia di leggere:
    // https://en.wikipedia.org/wiki/UTF-8
    // https://en.wikipedia.org/wiki/UTF-16
    @@ -8,10 +11,17 @@
    // In the following table, the characters u to z are replaced by the bits of the code point, from the positions U+uvwxyz:
    // Code point ↔ UTF-8 conversion
    // First code point Last code point Byte 1 Byte 2 Byte 3 Byte 4
    // U+0000 U+007F 0yyyzzzz
    // U+0080 U+07FF 110xxxyy 10yyzzzz
    // U+0800 U+FFFF 1110wwww 10xxxxyy 10yyzzzz
    // U+000000 U+00007F 0yyyzzzz
    // U+000080 U+0007FF 110xxxyy 10yyzzzz
    // U+000800 U+00FFFF 1110wwww 10xxxxyy 10yyzzzz
    // U+010000 U+10FFFF 11110uvv 10vvwwww 10xxxxyy 10yyzzzz
    //
    // Esempio:
    // Il carattere alef (א), che corrisponde al code point Unicode U+05D0, viene rappresentato in UTF-8 con questo procedimento:
    // - ricade nell'intervallo da 0x0080 a 0x07FF. Secondo la tabella verrà rappresentato con due byte (110xxxyy 10yyzzzz);
    // - l'esadecimale 0x05D0 equivale al binario 101-1101-0000 (xxx=101=5, yyyy=1101=D, zzzz=0000=0);
    // - gli undici bit vengono copiati in ordine nelle posizioni marcate con x ed y: 110-10111 10-010000;
    // - il risultato finale è la coppia di byte 11010111 10010000, o in esadecimale 0xD7 0x90.


    // UTF-16 encodes code points in the range U+0000 to U+D7FF and U+E000 to U+FFFF (U+D800 to U+DFFF have a special purpose, see below)
    @@ -32,6 +42,17 @@
    // U' = yyyyyyyyyyxxxxxxxxxx // U - 0x10000
    // W1 = 110110yyyyyyyyyy // 0xD800 + yyyyyyyyyy
    // W2 = 110111xxxxxxxxxx // 0xDC00 + xxxxxxxxxx
    //
    // Examples:
    // To encode U+10437 (𐐷) to UTF-16:
    // - Subtract 0x10000 from the code point, leaving 0x0437.
    // - For the high surrogate, shift right by 10 (divide by 0x400), then add 0xD800, resulting in 0x0001 + 0xD800 = 0xD801.
    // - For the low surrogate, take the low 10 bits (remainder of dividing by 0x400), then add 0xDC00, resulting in 0x0037 + 0xDC00 = 0xDC37.
    //
    // To decode U+10437 (𐐷) from UTF-16:
    // - Take the high surrogate (0xD801) and subtract 0xD800, then shift left by 10 (multiply by 0x400), resulting in 0x0001 × 0x400 = 0x0400.
    // - Take the low surrogate (0xDC37) and subtract 0xDC00, resulting in 0x37.
    // - Add these two results together (0x0437), and finally add 0x10000 to get the final code point, 0x10437.


    void StringConvert(const std::string& from, std::wstring& to) {
  2. PAMinerva revised this gist Jan 17, 2025. No changes.
  3. PAMinerva created this gist Jan 17, 2025.
    135 changes: 135 additions & 0 deletions strconv.cpp
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,135 @@
    // Per comprendere meglio il funzionamento di queste funzioni, si consiglia di leggere:
    // https://en.wikipedia.org/wiki/UTF-8
    // https://en.wikipedia.org/wiki/UTF-16
    // https://en.wikipedia.org/wiki/UTF-32


    // UTF-8 encodes code points in one to four bytes, depending on the value of the code point.
    // In the following table, the characters u to z are replaced by the bits of the code point, from the positions U+uvwxyz:
    // Code point ↔ UTF-8 conversion
    // First code point Last code point Byte 1 Byte 2 Byte 3 Byte 4
    // U+0000 U+007F 0yyyzzzz
    // U+0080 U+07FF 110xxxyy 10yyzzzz
    // U+0800 U+FFFF 1110wwww 10xxxxyy 10yyzzzz
    // U+010000 U+10FFFF 11110uvv 10vvwwww 10xxxxyy 10yyzzzz


    // UTF-16 encodes code points in the range U+0000 to U+D7FF and U+E000 to U+FFFF (U+D800 to U+DFFF have a special purpose, see below)
    // as single 16-bit code units that are numerically equal to the corresponding code points
    //
    // UTF-16 encodes code points in the range U+10000 to U+10FFFF as two 16-bit code units called surrogate pair.
    // The first code unit is a high surrogate and the second is a low surrogate.
    // The high surrogate is in the range U+D800 to U+DBFF, and the low surrogate is in the range U+DC00 to U+DFFF.
    //
    // - 0x10000 is subtracted from the code point (U), leaving a 20-bit number (U') in the hex number range 0x00000–0xFFFFF.
    // - The high ten bits (in the range 0x000–0x3FF) are added to 0xD800 to give the first 16-bit code unit or high surrogate (W1),
    // which will be in the range 0xD800–0xDBFF.
    // - The low ten bits (also in the range 0x000–0x3FF) are added to 0xDC00 to give the second 16-bit code unit or low surrogate (W2),
    // which will be in the range 0xDC00–0xDFFF.
    //
    // Illustrated visually, the distribution of U' between W1 and W2 looks like:
    //
    // U' = yyyyyyyyyyxxxxxxxxxx // U - 0x10000
    // W1 = 110110yyyyyyyyyy // 0xD800 + yyyyyyyyyy
    // W2 = 110111xxxxxxxxxx // 0xDC00 + xxxxxxxxxx


    // Decodes a UTF-8 byte string into a wide string.
    //
    // On platforms with a 32-bit wchar_t (e.g. Linux) each decoded code point
    // is appended as a single UTF-32 code unit; on platforms with a 16-bit
    // wchar_t (e.g. Windows) code points above U+FFFF are encoded as a UTF-16
    // surrogate pair (see the tables in the header comments above).
    //
    // Malformed input is handled defensively: an invalid lead byte, a missing
    // `10xxxxxx` continuation byte, or a decoded value above U+10FFFF causes
    // only the offending lead byte to be skipped, so any valid bytes that
    // follow are still decoded. A sequence truncated at the end of the input
    // terminates decoding.
    void StringConvert(const std::string& from, std::wstring& to) {
        to.clear();
        size_t i = 0;
        while (i < from.size()) {
            const unsigned char c = static_cast<unsigned char>(from[i]);

            // Determine the sequence length from the lead byte.
            size_t len;
            if (c < 0x80) {                  // 0xxxxxxx: ASCII, 1 byte
                len = 1;
            } else if ((c & 0xE0) == 0xC0) { // 110xxxxx: 2-byte sequence
                len = 2;
            } else if ((c & 0xF0) == 0xE0) { // 1110xxxx: 3-byte sequence
                len = 3;
            } else if ((c & 0xF8) == 0xF0) { // 11110xxx: 4-byte sequence
                len = 4;
            } else {
                ++i;                         // invalid lead byte: skip it
                continue;
            }

            if (i + len > from.size()) break; // truncated sequence at end of input

            // Payload bits carried by the lead byte for each sequence length.
            static const unsigned char kLeadMask[5] = { 0, 0x7F, 0x1F, 0x0F, 0x07 };

            // Accumulate the code point: the lead byte contributes its masked
            // low bits, each continuation byte contributes 6 bits. Every
            // continuation byte must match 10xxxxxx.
            uint32_t codepoint = c & kLeadMask[len];
            bool valid = true;
            for (size_t k = 1; k < len; ++k) {
                const unsigned char cont = static_cast<unsigned char>(from[i + k]);
                if ((cont & 0xC0) != 0x80) { valid = false; break; }
                codepoint = (codepoint << 6) | (cont & 0x3F);
            }
            if (!valid || codepoint > 0x10FFFF) {
                ++i;                         // malformed sequence: skip the lead byte only
                continue;
            }
            i += len;

            if constexpr (sizeof(wchar_t) >= 4) {
                // 32-bit wchar_t (e.g. Linux): one code unit per code point.
                to += static_cast<wchar_t>(codepoint);
            } else {
                // 16-bit wchar_t (e.g. Windows): UTF-16, possibly a surrogate pair.
                if (codepoint <= 0xFFFF) {
                    to += static_cast<wchar_t>(codepoint);
                } else {
                    // Subtract 0x10000, then split the remaining 20 bits:
                    // high 10 bits + 0xD800, low 10 bits + 0xDC00.
                    codepoint -= 0x10000;
                    to += static_cast<wchar_t>((codepoint >> 10) + 0xD800);   // high surrogate
                    to += static_cast<wchar_t>((codepoint & 0x3FF) + 0xDC00); // low surrogate
                }
            }
        }
    }

    // Encodes a wide string into UTF-8.
    //
    // On platforms with a 32-bit wchar_t (e.g. Linux) each element is treated
    // as one UTF-32 code point; on platforms with a 16-bit wchar_t (e.g.
    // Windows) a high/low surrogate pair is first recombined into a single
    // code point (W1 - 0xD800 shifted left by 10, OR'd with W2 - 0xDC00,
    // plus 0x10000 — see the header comments above).
    //
    // Invalid input never produces ill-formed UTF-8: an unpaired surrogate,
    // a surrogate value appearing in UTF-32 input, or a value above U+10FFFF
    // is substituted with U+FFFD (REPLACEMENT CHARACTER), the conventional
    // Unicode error-handling behavior.
    void StringConvert(const std::wstring& from, std::string& to) {
        to.clear();
        for (size_t i = 0; i < from.size(); ++i) {
            const wchar_t wc = from[i];
            uint32_t codepoint;

            if constexpr (sizeof(wchar_t) >= 4) {
                // 32-bit wchar_t: the element is the code point itself.
                codepoint = static_cast<uint32_t>(wc);
            } else {
                // 16-bit wchar_t: recombine a well-formed surrogate pair.
                if (wc >= 0xD800 && wc <= 0xDBFF &&       // high surrogate...
                    i + 1 < from.size() &&
                    from[i + 1] >= 0xDC00 && from[i + 1] <= 0xDFFF) { // ...followed by a low one
                    const uint32_t high = static_cast<uint32_t>(wc) - 0xD800;
                    const uint32_t low  = static_cast<uint32_t>(from[i + 1]) - 0xDC00;
                    codepoint = ((high << 10) | low) + 0x10000;
                    ++i; // consume the low surrogate as well
                } else {
                    // BMP character, or an unpaired surrogate (sanitized below).
                    codepoint = static_cast<uint32_t>(wc);
                }
            }

            // Surrogate values and out-of-range values are not valid Unicode
            // scalar values: substitute U+FFFD instead of emitting bad UTF-8.
            if ((codepoint >= 0xD800 && codepoint <= 0xDFFF) || codepoint > 0x10FFFF)
                codepoint = 0xFFFD;

            if (codepoint <= 0x7F) {
                // 1 byte: 0xxxxxxx
                to += static_cast<char>(codepoint);
            } else if (codepoint <= 0x7FF) {
                // 2 bytes: 110xxxxx 10xxxxxx
                to += static_cast<char>(0xC0 | ((codepoint >> 6) & 0x1F));
                to += static_cast<char>(0x80 | (codepoint & 0x3F));
            } else if (codepoint <= 0xFFFF) {
                // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
                to += static_cast<char>(0xE0 | ((codepoint >> 12) & 0x0F));
                to += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
                to += static_cast<char>(0x80 | (codepoint & 0x3F));
            } else {
                // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                to += static_cast<char>(0xF0 | ((codepoint >> 18) & 0x07));
                to += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
                to += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
                to += static_cast<char>(0x80 | (codepoint & 0x3F));
            }
        }
    }