Skip to content

Instantly share code, notes, and snippets.

@PAMinerva
Last active January 17, 2025 18:49
Show Gist options
  • Save PAMinerva/8b506f360389b80b878b84f99eefb8a7 to your computer and use it in GitHub Desktop.
Save PAMinerva/8b506f360389b80b878b84f99eefb8a7 to your computer and use it in GitHub Desktop.

Revisions

  1. PAMinerva revised this gist Jan 17, 2025. 1 changed file with 24 additions and 3 deletions.
    27 changes: 24 additions & 3 deletions strconv.cpp
    Original file line number Diff line number Diff line change
    @@ -1,3 +1,6 @@
    #include "strconv.h"


    // Per comprendere meglio il funzionamento di queste funzioni, si consiglia di leggere:
    // https://en.wikipedia.org/wiki/UTF-8
    // https://en.wikipedia.org/wiki/UTF-16
    @@ -8,10 +11,17 @@
    // In the following table, the characters u to z are replaced by the bits of the code point, from the positions U+uvwxyz:
    // Code point ↔ UTF-8 conversion
    // First code point Last code point Byte 1 Byte 2 Byte 3 Byte 4
    // U+0000 U+007F 0yyyzzzz
    // U+0080 U+07FF 110xxxyy 10yyzzzz
    // U+0800 U+FFFF 1110wwww 10xxxxyy 10yyzzzz
    // U+000000 U+00007F 0yyyzzzz
    // U+000080 U+0007FF 110xxxyy 10yyzzzz
    // U+000800 U+00FFFF 1110wwww 10xxxxyy 10yyzzzz
    // U+010000 U+10FFFF 11110uvv 10vvwwww 10xxxxyy 10yyzzzz
    //
    // Esempio:
    // Il carattere alef (א), che corrisponde al code point Unicode U+05D0, viene rappresentato in UTF-8 con questo procedimento:
    // - ricade nell'intervallo da 0x0080 a 0x07FF. Secondo la tabella verrà rappresentato con due byte (110xxxyy 10yyzzzz);
    // - l'esadecimale 0x05D0 equivale al binario 101-1101-0000 (xxx=101=5, yyyy=1101=D, zzzz=0000=0);
    // - gli undici bit vengono copiati in ordine nelle posizioni marcate con x ed y: 110-10111 10-010000;
    // - il risultato finale è la coppia di byte 11010111 10010000, o in esadecimale 0xD7 0x90.


    // UTF-16 encodes code points in the range U+0000 to U+D7FF and U+E000 to U+FFFF (U+D800 to U+DFFF have a special purpose, see below)
    @@ -32,6 +42,17 @@
    // U' = yyyyyyyyyyxxxxxxxxxx // U - 0x10000
    // W1 = 110110yyyyyyyyyy // 0xD800 + yyyyyyyyyy
    // W2 = 110111xxxxxxxxxx // 0xDC00 + xxxxxxxxxx
    //
    // Examples:
    // To encode U+10437 (𐐷) to UTF-16:
    // - Subtract 0x10000 from the code point, leaving 0x0437.
    // - For the high surrogate, shift right by 10 (divide by 0x400), then add 0xD800, resulting in 0x0001 + 0xD800 = 0xD801.
    // - For the low surrogate, take the low 10 bits (remainder of dividing by 0x400), then add 0xDC00, resulting in 0x0037 + 0xDC00 = 0xDC37.
    //
    // To decode U+10437 (𐐷) from UTF-16:
    // - Take the high surrogate (0xD801) and subtract 0xD800, then shift left by 10 (multiply by 0x400), resulting in 0x0001 × 0x400 = 0x0400.
    // - Take the low surrogate (0xDC37) and subtract 0xDC00, resulting in 0x37.
    // - Add these two results together (0x0437), and finally add 0x10000 to get the final code point, 0x10437.


    void StringConvert(const std::string& from, std::wstring& to) {
  2. PAMinerva revised this gist Jan 17, 2025. No changes.
  3. PAMinerva created this gist Jan 17, 2025.
    135 changes: 135 additions & 0 deletions strconv.cpp
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,135 @@
    // Per comprendere meglio il funzionamento di queste funzioni, si consiglia di leggere:
    // https://en.wikipedia.org/wiki/UTF-8
    // https://en.wikipedia.org/wiki/UTF-16
    // https://en.wikipedia.org/wiki/UTF-32


    // UTF-8 encodes code points in one to four bytes, depending on the value of the code point.
    // In the following table, the characters u to z are replaced by the bits of the code point, from the positions U+uvwxyz:
    // Code point ↔ UTF-8 conversion
    // First code point Last code point Byte 1 Byte 2 Byte 3 Byte 4
    // U+0000 U+007F 0yyyzzzz
    // U+0080 U+07FF 110xxxyy 10yyzzzz
    // U+0800 U+FFFF 1110wwww 10xxxxyy 10yyzzzz
    // U+010000 U+10FFFF 11110uvv 10vvwwww 10xxxxyy 10yyzzzz


    // UTF-16 encodes code points in the range U+0000 to U+D7FF and U+E000 to U+FFFF (U+D800 to U+DFFF have a special purpose, see below)
    // as single 16-bit code units that are numerically equal to the corresponding code points
    //
    // UTF-16 encodes code points in the range U+10000 to U+10FFFF as two 16-bit code units called surrogate pair.
    // The first code unit is a high surrogate and the second is a low surrogate.
    // The high surrogate is in the range U+D800 to U+DBFF, and the low surrogate is in the range U+DC00 to U+DFFF.
    //
    // - 0x10000 is subtracted from the code point (U), leaving a 20-bit number (U') in the hex number range 0x00000–0xFFFFF.
    // - The high ten bits (in the range 0x000–0x3FF) are added to 0xD800 to give the first 16-bit code unit or high surrogate (W1),
    // which will be in the range 0xD800–0xDBFF.
    // - The low ten bits (also in the range 0x000–0x3FF) are added to 0xDC00 to give the second 16-bit code unit or low surrogate (W2),
    // which will be in the range 0xDC00–0xDFFF.
    //
    // Illustrated visually, the distribution of U' between W1 and W2 looks like:
    //
    // U' = yyyyyyyyyyxxxxxxxxxx // U - 0x10000
    // W1 = 110110yyyyyyyyyy // 0xD800 + yyyyyyyyyy
    // W2 = 110111xxxxxxxxxx // 0xDC00 + xxxxxxxxxx


    // Decodes a UTF-8 byte string into a wide string.
    //
    // On platforms with a 32-bit wchar_t (e.g. Linux) each decoded code point
    // is appended as a single UTF-32 code unit; on platforms with a 16-bit
    // wchar_t (e.g. Windows) code points above U+FFFF are encoded as a UTF-16
    // surrogate pair (see the tables in the header comments above).
    //
    // Malformed input is handled defensively: an invalid lead byte, a missing
    // `10xxxxxx` continuation byte, or a decoded value above U+10FFFF causes
    // only the offending lead byte to be skipped, so any valid bytes that
    // follow are still decoded. A sequence truncated at the end of the input
    // terminates decoding.
    void StringConvert(const std::string& from, std::wstring& to) {
        to.clear();
        size_t i = 0;
        while (i < from.size()) {
            const unsigned char c = static_cast<unsigned char>(from[i]);

            // Determine the sequence length from the lead byte.
            size_t len;
            if (c < 0x80) {                  // 0xxxxxxx: ASCII, 1 byte
                len = 1;
            } else if ((c & 0xE0) == 0xC0) { // 110xxxxx: 2-byte sequence
                len = 2;
            } else if ((c & 0xF0) == 0xE0) { // 1110xxxx: 3-byte sequence
                len = 3;
            } else if ((c & 0xF8) == 0xF0) { // 11110xxx: 4-byte sequence
                len = 4;
            } else {
                ++i;                         // invalid lead byte: skip it
                continue;
            }

            if (i + len > from.size()) break; // truncated sequence at end of input

            // Payload bits carried by the lead byte for each sequence length.
            static const unsigned char kLeadMask[5] = { 0, 0x7F, 0x1F, 0x0F, 0x07 };

            // Accumulate the code point: the lead byte contributes its masked
            // low bits, each continuation byte contributes 6 bits. Every
            // continuation byte must match 10xxxxxx.
            uint32_t codepoint = c & kLeadMask[len];
            bool valid = true;
            for (size_t k = 1; k < len; ++k) {
                const unsigned char cont = static_cast<unsigned char>(from[i + k]);
                if ((cont & 0xC0) != 0x80) { valid = false; break; }
                codepoint = (codepoint << 6) | (cont & 0x3F);
            }
            if (!valid || codepoint > 0x10FFFF) {
                ++i;                         // malformed sequence: skip the lead byte only
                continue;
            }
            i += len;

            if constexpr (sizeof(wchar_t) >= 4) {
                // 32-bit wchar_t (e.g. Linux): one code unit per code point.
                to += static_cast<wchar_t>(codepoint);
            } else {
                // 16-bit wchar_t (e.g. Windows): UTF-16, possibly a surrogate pair.
                if (codepoint <= 0xFFFF) {
                    to += static_cast<wchar_t>(codepoint);
                } else {
                    // Subtract 0x10000, then split the remaining 20 bits:
                    // high 10 bits + 0xD800, low 10 bits + 0xDC00.
                    codepoint -= 0x10000;
                    to += static_cast<wchar_t>((codepoint >> 10) + 0xD800);   // high surrogate
                    to += static_cast<wchar_t>((codepoint & 0x3FF) + 0xDC00); // low surrogate
                }
            }
        }
    }

    // Encodes a wide string into UTF-8.
    //
    // On platforms with a 32-bit wchar_t (e.g. Linux) each element is treated
    // as one UTF-32 code point; on platforms with a 16-bit wchar_t (e.g.
    // Windows) a high/low surrogate pair is first recombined into a single
    // code point (W1 - 0xD800 shifted left by 10, OR'd with W2 - 0xDC00,
    // plus 0x10000 — see the header comments above).
    //
    // Invalid input never produces ill-formed UTF-8: an unpaired surrogate,
    // a surrogate value appearing in UTF-32 input, or a value above U+10FFFF
    // is substituted with U+FFFD (REPLACEMENT CHARACTER), the conventional
    // Unicode error-handling behavior.
    void StringConvert(const std::wstring& from, std::string& to) {
        to.clear();
        for (size_t i = 0; i < from.size(); ++i) {
            const wchar_t wc = from[i];
            uint32_t codepoint;

            if constexpr (sizeof(wchar_t) >= 4) {
                // 32-bit wchar_t: the element is the code point itself.
                codepoint = static_cast<uint32_t>(wc);
            } else {
                // 16-bit wchar_t: recombine a well-formed surrogate pair.
                if (wc >= 0xD800 && wc <= 0xDBFF &&       // high surrogate...
                    i + 1 < from.size() &&
                    from[i + 1] >= 0xDC00 && from[i + 1] <= 0xDFFF) { // ...followed by a low one
                    const uint32_t high = static_cast<uint32_t>(wc) - 0xD800;
                    const uint32_t low  = static_cast<uint32_t>(from[i + 1]) - 0xDC00;
                    codepoint = ((high << 10) | low) + 0x10000;
                    ++i; // consume the low surrogate as well
                } else {
                    // BMP character, or an unpaired surrogate (sanitized below).
                    codepoint = static_cast<uint32_t>(wc);
                }
            }

            // Surrogate values and out-of-range values are not valid Unicode
            // scalar values: substitute U+FFFD instead of emitting bad UTF-8.
            if ((codepoint >= 0xD800 && codepoint <= 0xDFFF) || codepoint > 0x10FFFF)
                codepoint = 0xFFFD;

            if (codepoint <= 0x7F) {
                // 1 byte: 0xxxxxxx
                to += static_cast<char>(codepoint);
            } else if (codepoint <= 0x7FF) {
                // 2 bytes: 110xxxxx 10xxxxxx
                to += static_cast<char>(0xC0 | ((codepoint >> 6) & 0x1F));
                to += static_cast<char>(0x80 | (codepoint & 0x3F));
            } else if (codepoint <= 0xFFFF) {
                // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
                to += static_cast<char>(0xE0 | ((codepoint >> 12) & 0x0F));
                to += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
                to += static_cast<char>(0x80 | (codepoint & 0x3F));
            } else {
                // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                to += static_cast<char>(0xF0 | ((codepoint >> 18) & 0x07));
                to += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
                to += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
                to += static_cast<char>(0x80 | (codepoint & 0x3F));
            }
        }
    }