Skip to content

Instantly share code, notes, and snippets.

@czlc
Last active January 6, 2022 12:44
Show Gist options
  • Select an option

  • Save czlc/d55f80508749e745feba8c9a3c796fdc to your computer and use it in GitHub Desktop.

Select an option

Save czlc/d55f80508749e745feba8c9a3c796fdc to your computer and use it in GitHub Desktop.

Revisions

  1. czlc revised this gist Jan 6, 2022. 1 changed file with 2 additions and 1 deletion.
    3 changes: 2 additions & 1 deletion unicode.c
    Original file line number Diff line number Diff line change
    @@ -22,7 +22,7 @@ static const unsigned long offsetsFromUTF8[6] = {
    #define UNI_SUR_HIGH_START 0xD800
    #define UNI_SUR_LOW_START 0xDC00

    void utf8to16(const uint8_t utf8[4], uint16_t utf16[2]) {
    void utf8to16(const uint8_t utf8[4], uint8_t utf16[2]) {
    uint32_t ch = 0;
    int extra = trailingBytesForUTF8[*utf8];

    @@ -35,6 +35,7 @@ void utf8to16(const uint8_t utf8[4], uint16_t utf16[2]) {
    ch -= offsetsFromUTF8[extra];
    if (ch <= 0xffff) {
    utf16[0] = ch;
    utf16[1] = 0;
    } else {
    ch -= UNI_BASE;
    utf16[0] = (ch >> UNI_SHIFT) + UNI_SUR_HIGH_START;
  2. czlc revised this gist Jan 23, 2018. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions unicode.c
    Original file line number Diff line number Diff line change
    @@ -58,7 +58,7 @@ void utf8to32(const uint8_t utf8[4], uint32_t utf32[1]) {

    void utf16to8(const uint16_t utf16[2], uint8_t utf8[4]) {
    uint32_t ch = utf16[0];
    if (utf16[0] & UNI_SUR_HIGH_START == UNI_SUR_HIGH_START) {
    if ((utf16[0] & UNI_SUR_HIGH_START) == UNI_SUR_HIGH_START) {
    ch = (utf16[0] & UNI_MASK) << UNI_SHIFT;
    ch += utf16[1] & UNI_MASK;
    ch += UNI_BASE;
    @@ -83,7 +83,7 @@ void utf16to8(const uint16_t utf16[2], uint8_t utf8[4]) {

    void utf16to32(const uint16_t utf16[2], uint32_t utf32[1]) {
    uint32_t ch = utf16[0];
    if (utf16[0] & UNI_SUR_HIGH_START == UNI_SUR_HIGH_START) {
    if ((utf16[0] & UNI_SUR_HIGH_START) == UNI_SUR_HIGH_START) {
    ch = (utf16[0] & UNI_MASK) << UNI_SHIFT;
    ch += utf16[1] & UNI_MASK;
    ch += UNI_BASE;
  3. czlc created this gist Jan 23, 2018.
    122 changes: 122 additions & 0 deletions unicode.c
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,122 @@
    #include <stdint.h>

    /*
     * Lookup table indexed by the first byte of a UTF-8 sequence; the value is
     * the number of bytes that follow that first byte in the same sequence.
     *
     * Bytes 0x00-0xBF map to 0 (this includes 0x80-0xBF, which are not told
     * apart from ASCII here). Bytes 0xF8-0xFF map to 4 and 5, i.e. sequences
     * longer than the four-byte buffers taken by the functions in this file.
     */
    static const char trailingBytesForUTF8[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x00-0x1F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x20-0x3F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x40-0x5F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x60-0x7F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x9F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xBF */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xC0-0xDF */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5  /* 0xE0-0xFF */
    };

    /*
     * Correction constants indexed by the trailing-byte count (0-5) taken from
     * trailingBytesForUTF8. The decoders subtract the matching entry from the
     * accumulated byte value.
     *
     * Each entry equals the marker bits of the lead byte and of every
     * continuation byte (0x80) at the positions they occupy after being shifted
     * left 6 bits per following byte, e.g. (0xC0 << 6) + 0x80 == 0x3080 and
     * (0xE0 << 12) + (0x80 << 6) + 0x80 == 0xE2080.
     */
    static const unsigned long offsetsFromUTF8[6] = {
    0x00000000UL, 0x00003080UL, 0x000E2080UL,
    0x03C82080UL, 0xFA082080UL, 0x82082080UL
    };

    /* Constants used for the UTF-16 surrogate-pair arithmetic in this file. */
    /* Shift that separates the upper bits (first unit) from the low 10 bits. */
    #define UNI_SHIFT 10
    /* Subtracted from a code point above 0xFFFF before it is split in two. */
    #define UNI_BASE 0x0010000
    /* Mask selecting the low 10 bits of a value. */
    #define UNI_MASK 0x3FF
    /* Added to the upper bits to form the first (high) code unit of a pair. */
    #define UNI_SUR_HIGH_START 0xD800
    /* Added to the low 10 bits to form the second (low) code unit of a pair. */
    #define UNI_SUR_LOW_START 0xDC00

    /*
     * Convert a single UTF-8 encoded character to UTF-16.
     *
     * utf8:  input sequence; utf8[0] is the lead byte and selects how many of
     *        the following bytes (at most three) are read.
     * utf16: output; receives one code unit followed by 0 for characters up to
     *        U+FFFF, or a high/low surrogate pair for characters above that.
     *
     * U+FFFD (REPLACEMENT CHARACTER) is produced when the lead byte announces
     * a sequence longer than four bytes (reading it would run past utf8[3]),
     * or when the decoded value is a surrogate or lies above U+10FFFF.
     * Continuation bytes are not individually validated.
     */
    void utf8to16(const uint8_t utf8[4], uint16_t utf16[2]) {
        uint32_t ch = 0xFFFD; /* kept unless decoding below succeeds */
        int extra = trailingBytesForUTF8[utf8[0]];

        if (extra <= 3) {
            uint32_t acc = utf8[0];
            /* Make room for six more bits BEFORE adding each following byte. */
            for (int i = 1; i <= extra; i++) {
                acc = (acc << 6) + utf8[i];
            }
            /* Strip the lead/continuation marker bits in one subtraction;
             * malformed input may wrap around and is rejected just below. */
            acc -= (uint32_t)offsetsFromUTF8[extra];
            if (acc <= 0x10FFFF && !(acc >= UNI_SUR_HIGH_START && acc <= 0xDFFF)) {
                ch = acc;
            }
        }

        if (ch <= 0xFFFF) {
            utf16[0] = (uint16_t)ch;
            utf16[1] = 0; /* never leave the second unit uninitialized */
        } else {
            ch -= UNI_BASE;
            utf16[0] = (uint16_t)((ch >> UNI_SHIFT) + UNI_SUR_HIGH_START);
            utf16[1] = (uint16_t)((ch & UNI_MASK) + UNI_SUR_LOW_START);
        }
    }

    void utf8to32(const uint8_t utf8[4], uint32_t utf32[1]) {
    uint32_t ch = 0;
    int extra = trailingBytesForUTF8[*utf8];

    ch = utf8[0];
    int i = 0;
    while (i++ < extra) {
    ch += utf8[i];
    ch <= 6;
    }
    ch -= offsetsFromUTF8[extra];
    utf32[0] = ch;
    }

    void utf16to8(const uint16_t utf16[2], uint8_t utf8[4]) {
    uint32_t ch = utf16[0];
    if (utf16[0] & UNI_SUR_HIGH_START == UNI_SUR_HIGH_START) {
    ch = (utf16[0] & UNI_MASK) << UNI_SHIFT;
    ch += utf16[1] & UNI_MASK;
    ch += UNI_BASE;
    }

    if (ch <= 0x7F) {
    utf8[0] = ch;
    } else if (ch <= 0x7FF) {
    utf8[0] = (ch >> 6) | 0xC0;
    utf8[1] = (ch & 0x3F) | 0x80;
    } else if (ch <= 0xFFFF) {
    utf8[0] = (ch >> 12) | 0xE0;
    utf8[1] = ((ch >> 6) & 0x3F) | 0x80;
    utf8[2] = (ch & 0x3F) | 0x80;
    } else if (ch <= 0x1FFFFF) {
    utf8[0] = (ch >> 18) | 0xF0;
    utf8[1] = ((ch >> 12) & 0x3F) | 0x80;
    utf8[2] = ((ch >> 6) & 0x3F) | 0x80;
    utf8[3] = (ch & 0x3F) | 0x80;
    }
    }

    void utf16to32(const uint16_t utf16[2], uint32_t utf32[1]) {
    uint32_t ch = utf16[0];
    if (utf16[0] & UNI_SUR_HIGH_START == UNI_SUR_HIGH_START) {
    ch = (utf16[0] & UNI_MASK) << UNI_SHIFT;
    ch += utf16[1] & UNI_MASK;
    ch += UNI_BASE;
    }
    utf32[0] = ch;
    }

    /*
     * Encode one UTF-32 code point as UTF-8.
     *
     * utf32: the code point to encode.
     * utf8:  output; receives 1 to 4 bytes. Bytes beyond the encoded length
     *        are left untouched.
     *
     * Values that are not Unicode scalar values (surrogates 0xD800-0xDFFF and
     * anything above 0x10FFFF) are encoded as U+FFFD, so the output is always
     * written and always well-formed.
     */
    void utf32to8(uint32_t utf32, uint8_t utf8[4]) {
        uint32_t ch = utf32;

        if (ch > 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) {
            ch = 0xFFFD; /* REPLACEMENT CHARACTER */
        }

        if (ch <= 0x7F) {
            utf8[0] = (uint8_t)ch;
        } else if (ch <= 0x7FF) {
            utf8[0] = (uint8_t)((ch >> 6) | 0xC0);
            utf8[1] = (uint8_t)((ch & 0x3F) | 0x80);
        } else if (ch <= 0xFFFF) {
            utf8[0] = (uint8_t)((ch >> 12) | 0xE0);
            utf8[1] = (uint8_t)(((ch >> 6) & 0x3F) | 0x80);
            utf8[2] = (uint8_t)((ch & 0x3F) | 0x80);
        } else {
            utf8[0] = (uint8_t)((ch >> 18) | 0xF0);
            utf8[1] = (uint8_t)(((ch >> 12) & 0x3F) | 0x80);
            utf8[2] = (uint8_t)(((ch >> 6) & 0x3F) | 0x80);
            utf8[3] = (uint8_t)((ch & 0x3F) | 0x80);
        }
    }

    /*
     * Encode one UTF-32 code point as UTF-16.
     *
     * utf32: the code point to encode.
     * utf16: output; receives one code unit followed by 0 for code points up
     *        to U+FFFF, or a high/low surrogate pair for code points above.
     *
     * Values that are not Unicode scalar values (surrogates 0xD800-0xDFFF and
     * anything above 0x10FFFF) are encoded as U+FFFD; without this check a
     * value above 0x10FFFF would be truncated to a meaningless 16-bit unit.
     */
    void utf32to16(uint32_t utf32, uint16_t utf16[2]) {
        uint32_t ch = utf32;

        if (ch > 0x10FFFF || (ch >= UNI_SUR_HIGH_START && ch <= 0xDFFF)) {
            ch = 0xFFFD; /* REPLACEMENT CHARACTER */
        }

        if (ch <= 0xFFFF) {
            utf16[0] = (uint16_t)ch;
            utf16[1] = 0; /* never leave the second unit uninitialized */
        } else {
            ch -= UNI_BASE;
            utf16[0] = (uint16_t)((ch >> UNI_SHIFT) + UNI_SUR_HIGH_START);
            utf16[1] = (uint16_t)((ch & UNI_MASK) + UNI_SUR_LOW_START);
        }
    }