Skip to content

Instantly share code, notes, and snippets.

@m1el
Created December 18, 2023 20:15
Show Gist options
  • Select an option

  • Save m1el/48339a83a49f46aa0f6cfd13d3f80e09 to your computer and use it in GitHub Desktop.

Select an option

Save m1el/48339a83a49f46aa0f6cfd13d3f80e09 to your computer and use it in GitHub Desktop.

Revisions

  1. m1el created this gist Dec 18, 2023.
    47 changes: 47 additions & 0 deletions utf8.rs
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,47 @@
    /// Returns the total length in bytes of the UTF-8 sequence announced by the
    /// lead byte `start`, or `0` when `start` cannot begin a sequence.
    ///
    /// The length is encoded in the number of leading one bits of the lead byte:
    /// none for a single-byte (ASCII) sequence, and two, three or four for
    /// multi-byte sequences. Exactly one leading one bit marks a continuation
    /// byte, and five or more are not used by UTF-8; both yield `0`.
    fn utf8_len(start: u8) -> usize {
        match start.leading_ones() {
            // 0xxx_xxxx: the byte is a complete sequence on its own.
            0 => 1,
            // 110x_xxxx, 1110_xxxx, 1111_0xxx: the count of ones is the length.
            ones @ 2..=4 => ones as usize,
            // 10xx_xxxx (continuation byte) or 1111_1xxx (never valid).
            _ => 0,
        }
    }
    /// Reasons why decoding a single UTF-8 sequence can fail.
    ///
    /// The derives make the error printable with `{:?}`, cheaply copyable and
    /// comparable, so callers can log it, `unwrap()` results carrying it, or
    /// assert on a specific variant.
    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    enum Utf8Error {
        /// The first byte cannot begin a UTF-8 sequence.
        InvalidStart,
        /// A byte following the lead byte does not have the `10xx_xxxx` form.
        InvalidContinuation,
        /// The decoded value is not an acceptable Unicode scalar value
        /// (for example a surrogate or a value above U+10FFFF).
        InvalidCodepoint,
        /// The input ends before the sequence announced by the lead byte is complete.
        TruncatedCodepoint,
    }
    /// Decodes the first UTF-8 encoded character at the front of `bytes`.
    ///
    /// Returns:
    /// * `None` when `bytes` is empty;
    /// * `Some(Ok((chr, rest)))` with the decoded character and the bytes that
    ///   follow it;
    /// * `Some(Err(_))` when the front of `bytes` is not well-formed UTF-8:
    ///   * `InvalidStart` - the first byte cannot begin a sequence,
    ///   * `TruncatedCodepoint` - fewer bytes are available than the lead byte
    ///     announces,
    ///   * `InvalidContinuation` - a trailing byte is not of the form
    ///     `10xx_xxxx`,
    ///   * `InvalidCodepoint` - the sequence is an overlong encoding, or the
    ///     decoded value is rejected by `char::from_u32`.
    fn utf8_split(bytes: &[u8]) -> Option<Result<(char, &[u8]), Utf8Error>> {
        let (&head, _) = bytes.split_first()?;
        let len = utf8_len(head);
        if len == 0 {
            return Some(Err(Utf8Error::InvalidStart));
        }
        if bytes.len() < len {
            return Some(Err(Utf8Error::TruncatedCodepoint));
        }
        let (encoded, rest) = bytes.split_at(len);

        // Shifting left then right by `len` clears the length prefix of the lead
        // byte and leaves only its payload bits (`len` is at most 4, so the
        // shift amount is always in range for a `u8`).
        let mut code = u32::from(head << len >> len);
        for &byte in &encoded[1..] {
            if byte & 0b1100_0000 != 0b1000_0000 {
                return Some(Err(Utf8Error::InvalidContinuation));
            }
            // Each continuation byte contributes its low six bits.
            code = (code << 6) | u32::from(byte & 0b0011_1111);
        }

        // Well-formed UTF-8 always uses the shortest possible encoding. Accepting
        // longer forms (e.g. `C0 80` for U+0000) would let several byte sequences
        // map to the same character, so they are rejected here.
        let shortest_len = match code {
            0..=0x7F => 1,
            0x80..=0x7FF => 2,
            0x800..=0xFFFF => 3,
            _ => 4,
        };
        if shortest_len != len {
            return Some(Err(Utf8Error::InvalidCodepoint));
        }

        // `char::from_u32` rejects surrogates and values above U+10FFFF.
        Some(
            char::from_u32(code)
                .map(|chr| (chr, rest))
                .ok_or(Utf8Error::InvalidCodepoint),
        )
    }

    /// Demo driver: decodes a sample string one character at a time with
    /// `utf8_split` and prints each character in single quotes.
    ///
    /// Printing stops as soon as `utf8_split` reports the end of input or an
    /// error.
    fn main() {
        let sample = "привіт, світ! ❤️🧡💛💚💙💜";
        let mut remaining = sample.as_bytes();
        loop {
            let Some(Ok((chr, tail))) = utf8_split(remaining) else {
                break;
            };
            remaining = tail;
            print!("'{}', ", chr);
        }
    }