nullhook · June 9, 2021 03:17 · Jun 9, 2021 · Jun 9, 2021
diff --git a/chars.cc b/chars.cc
@@ -0,0 +1,58 @@
+#include <iostream>
+#include <locale>
+#include <string>
+#include <fstream>
+#include <codecvt>
+
+// utf8/utf16/utf32 can be directly written to file without conversions
+// sizeof(T) gives you bytes of the type
+// .size() .length() give the count of code units (char / char16_t elements), not of user-visible characters
+// if char16_t is stored, then open the file with utf16 encoding
+// utf8 is slowly becoming the standard; MacRoman was apple's default but now it's utf8
+// 'locale' can give you the system's default language, currency, and date settings
+// you need to know the encoding prior to decoding
+// you can convert a char16_t to machine's default charset by using std::locale
+// utf16 can take fewer bytes than utf8 for some text (e.g. most CJK: 2 bytes vs 3), but more for ascii (2 bytes vs 1)
+// endianness is byte ordering; little-endian means the least significant byte is stored first
+// utf8 is compatible with basic ascii: ascii characters are a single byte whose most significant bit is always 0
+// basic ascii is only 0-127, but utf8 can encode every Unicode code point (up to U+10FFFF, about 1.1 million)
+// compiler: to calculate the byte length, or copy a utf8 string, it doesn't need to know about utf8
+// compiler: to calculate the number of code points, or to split a string correctly, it does need to know about utf8
+// splitting a string is the better example here. If you're interpreting it as ascii, but it actually has multi-byte utf8 characters in it, you can split in the middle of a code point by accident and produce two invalid or incorrect utf8 strings
+
+// Demo: stores "ßx" as UTF-16 and UTF-8, prints sizes and code units, then saves the UTF-8 bytes.
+// Returns 0 on success, 1 if from_utf16.txt could not be written.
+int main() {
+  const std::u16string u16str = u"ßx";
+  // UTF-16 -> UTF-8 conversion (wstring_convert/codecvt are deprecated since C++17)
+  const std::string u8conv = std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t>{}.to_bytes(u16str);
+  const std::string u8str = u8"ßx";
+
+  std::cout << "\n";
+  std::cout << "u16 Type size: " << sizeof(char16_t) << "\n";
+  std::cout << "u16 String size: " << u16str.size() << "\n";
+  std::cout << "u16->u8 String size: " << u8conv.size() << "\n";
+  // explicit casts: streaming char16_t* / char16_t relied on implicit conversions (overloads deleted in C++20)
+  std::cout << "u16 pointer address: " << static_cast<const void*>(u16str.c_str()) << "\n";
+  std::cout << "UTF-16 produced: " << std::hex << std::showbase;
+  for (const char16_t unit : u16str)
+    std::cout << static_cast<unsigned>(unit) << ' ';
+  // hex/showbase are sticky: restore decimal so the sizes below are not printed as 0x..
+  std::cout << std::dec << std::noshowbase << "\n\n";
+
+  std::cout << "u8 Type size: " << sizeof(char) << "\n"; // size of one code unit, mirroring the u16 line
+  std::cout << "u8 String size: " << u8str.size() << "\n";
+  std::cout << "u8 pointer: " << u8str.c_str() << "\n"; // const char* streams as text, not as an address
+  std::cout << "UTF-16 to UTF-8 conversion produced: " << std::hex << std::showbase;
+  for (const unsigned char byte : u8conv) // unsigned: plain char may be signed, so bytes >127 would be negative
+    std::cout << +byte << ' '; // unary + promotes to int so the byte prints as a number
+  std::cout << std::dec << std::noshowbase << "\n";
+  // binary mode keeps the encoded bytes untouched; flush so a buffered write error is visible to the check
+  std::ofstream file("from_utf16.txt", std::ios::binary);
+  file.write(u8conv.data(), static_cast<std::streamsize>(u8conv.size())).flush();
+  if (!file) {
+    std::cerr << "failed to write from_utf16.txt\n";
+    return 1;
+  }
+  return 0;
+}