Skip to content

Instantly share code, notes, and snippets.

@theraot
Created November 28, 2018 23:16
Show Gist options
  • Save theraot/0d92d4f6c6e29e5cfe5572dbb5cbe9f2 to your computer and use it in GitHub Desktop.
Save theraot/0d92d4f6c6e29e5cfe5572dbb5cbe9f2 to your computer and use it in GitHub Desktop.

Revisions

  1. theraot created this gist Nov 28, 2018.
    280 changes: 280 additions & 0 deletions UTF8.lib.php
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,280 @@
    <?php
    /**
    * CC-BY 3.0 Alfonso J. Ramos (theraot)
    * UTF8
    */
    final class UTF8
    {
        //------------------------------------------------------------
        // Private (Class)
        //------------------------------------------------------------

        /**
         * Returns the byte length of the UTF-8 sequence announced by a lead byte.
         *
         * Only the high bits of the lead byte are inspected; continuation bytes
         * are not looked at here.
         *
         * @param int $ord Value of the first byte of a sequence (0-255).
         * @return int|false 1, 2, 3 or 4; false when the byte cannot start a
         *                   sequence (a continuation byte or 0xF8 and above).
         */
        private static function CodePointLength($ord)
        {
            if (($ord >> 7) === 0)
            {
                // 0xxxxxxx
                return 1;
            }
            if (($ord >> 5) === 6)
            {
                // 110xxxxx
                return 2;
            }
            if (($ord >> 4) === 14)
            {
                // 1110xxxx
                return 3;
            }
            if (($ord >> 3) === 30)
            {
                // 11110xxx
                return 4;
            }
            return false;
        }

        /**
         * Translates a character position into a byte index.
         *
         * For a non-negative $position the string is walked forward starting at
         * byte $after, jumping from lead byte to lead byte. For a negative
         * $position the string is walked backward from its last byte and $after
         * is not used.
         *
         * @param string $string   Bytes to scan.
         * @param int    $position Number of characters to skip; negative counts
         *                         characters from the end of the string.
         * @param int    $after    Byte index where the forward scan starts.
         * @return int|false|null  The byte index; false when the forward scan hits
         *                         a byte that cannot start a sequence or runs out
         *                         of characters; null when the backward scan runs
         *                         out of characters.
         */
        private static function CharacterIndex($string, $position, $after)
        {
            $strlen = strlen($string);
            if ($position < 0)
            {
                for ($index = $strlen - 1; $index >= 0; $index--)
                {
                    // Every byte that is not a continuation byte (10xxxxxx)
                    // starts a character.
                    if ((ord($string[$index]) >> 6) !== 2)
                    {
                        $position++;
                    }
                    if ($position === 0)
                    {
                        return $index;
                    }
                }
                return null;
            }
            $count = 0;
            for ($index = $after; $index < $strlen; $count++)
            {
                if ($count === $position)
                {
                    return $index;
                }
                $add = self::CodePointLength(ord($string[$index]));
                if ($add === false)
                {
                    return false;
                }
                $index += $add;
            }
            if ($count < $position)
            {
                return false;
            }
            // Exactly $position characters were consumed: the position is the
            // end of the string.
            return $strlen;
        }

        //------------------------------------------------------------
        // Public (Class)
        //------------------------------------------------------------

        /**
         * UTF-8 aware replacement of chr.
         *
         * @param int $codepoint Code point to encode (converted with intval).
         * @return string|null The 1 to 4 byte UTF-8 encoding, or null when the
         *                     code point is negative or above 0x10FFFF.
         *                     Surrogate code points are not rejected.
         */
        public static function Character(/*int*/ $codepoint)
        {
            $codepoint = intval($codepoint);
            if ($codepoint < 0)
            {
                return null;
            }
            // The upper bounds are inclusive: 0x7F, 0x7FF and 0xFFFF are the last
            // code points that fit in 1, 2 and 3 bytes respectively.
            if ($codepoint <= 0x7F)
            {
                return chr($codepoint);
            }
            if ($codepoint <= 0x7FF)
            {
                return chr(0xC0 | ($codepoint >> 6))
                    .chr(0x80 | ($codepoint & 0x3F));
            }
            if ($codepoint <= 0xFFFF)
            {
                return chr(0xE0 | ($codepoint >> 12))
                    .chr(0x80 | (($codepoint >> 6) & 0x3F))
                    .chr(0x80 | ($codepoint & 0x3F));
            }
            if ($codepoint <= 0x10FFFF)
            {
                return chr(0xF0 | ($codepoint >> 18))
                    .chr(0x80 | (($codepoint >> 12) & 0x3F))
                    .chr(0x80 | (($codepoint >> 6) & 0x3F))
                    .chr(0x80 | ($codepoint & 0x3F));
            }
            return null;
        }

        /**
         * Returns the character that starts at a byte index.
         *
         * @param string $string Bytes to read from.
         * @param int    $index  Byte index of the first byte of the character.
         * @return string The bytes of the character, or '' when $index is at or
         *                past the end of the string or the byte at $index cannot
         *                start a sequence.
         */
        public static function CharacterAt($string, $index)
        {
            $nextIndex = self::CharacterIndex($string, 1, $index);
            if ($nextIndex === false)
            {
                return '';
            }
            return substr($string, $index, $nextIndex - $index);
        }

        /**
         * Returns up to $length characters starting at a byte index.
         *
         * @param string $string Bytes to read from.
         * @param int    $index  Byte index of the first character.
         * @param int    $length Number of characters wanted.
         * @return string The requested characters; when fewer than $length
         *                characters can be walked (end of string or a byte that
         *                cannot start a sequence) everything from $index onward.
         */
        public static function CharactersAt($string, $index, $length)
        {
            $nextIndex = self::CharacterIndex($string, $length, $index);
            if ($nextIndex === false)
            {
                return substr($string, $index);
            }
            return substr($string, $index, $nextIndex - $index);
        }

        /**
         * UTF-8 aware replacement of ord.
         *
         * Decodes the first character of $character; any bytes after it are
         * ignored.
         *
         * @param string $character Bytes starting with the character to decode.
         * @return int|false The code point, or false when the input is empty,
         *                   starts with a byte that cannot start a sequence, is
         *                   truncated, or has a malformed continuation byte.
         */
        public static function CodePoint(/*string*/ $character)
        {
            $strlen = strlen($character);
            if ($strlen === 0)
            {
                return false;
            }
            $ord0 = ord($character[0]);
            $length = self::CodePointLength($ord0);
            if ($length === false || $strlen < $length)
            {
                return false;
            }
            if ($length === 1)
            {
                return $ord0;
            }
            // Drop the length marker from the lead byte; the payload mask is
            // 0x1F, 0x0F or 0x07 for 2, 3 or 4 byte sequences.
            $codepoint = $ord0 & (0xFF >> ($length + 1));
            for ($offset = 1; $offset < $length; $offset++)
            {
                $ord = ord($character[$offset]);
                if (($ord >> 6) !== 2)
                {
                    return false;
                }
                // Each continuation byte contributes its low six bits.
                $codepoint = ($codepoint << 6) | ($ord & 0x3F);
            }
            return $codepoint;
        }

        /**
         * Generator that yields the characters of a string one by one.
         *
         * Iteration stops silently at the first byte that cannot start a
         * sequence.
         *
         * @param string $string Bytes to iterate.
         * @return Generator Yields each character as a string.
         */
        public static function Enumerate($string)
        {
            $strlen = strlen($string);
            for ($index = 0; $index < $strlen; )
            {
                $chr = self::CharacterAt($string, $index);
                if ($chr === '')
                {
                    return;
                }
                $index += strlen($chr);
                yield $chr;
            }
        }

        /**
         * Tells whether a string consists only of 7-bit ASCII bytes.
         *
         * @param string $string Bytes to check.
         * @return bool true when every byte is below 0x80 (the empty string
         *              included), false otherwise.
         */
        public static function IsASCII($string)
        {
            $strlen = strlen($string);
            for ($index = 0; $index < $strlen; $index++)
            {
                if (ord($string[$index]) >= 0x80)
                {
                    return false;
                }
            }
            return true;
        }

        /**
         * Tells whether Length accepts the string.
         *
         * @param string $string Bytes to check.
         * @return bool true when Length does not return false.
         */
        public static function IsUTF8($string)
        {
            return self::Length($string) !== false;
        }

        /**
         * Counts the characters of a string.
         *
         * Checks that every lead byte is followed by the announced number of
         * continuation bytes (0x80-0xBF). Overlong forms and surrogates are not
         * rejected.
         *
         * @param string $string Bytes to measure.
         * @return int|false Number of characters, or false when a byte cannot
         *                   start a sequence, a continuation byte is malformed, or
         *                   the last sequence is truncated.
         */
        public static function Length($string)
        {
            $strlen = strlen($string);
            $count = 0;
            for ($index = 0; $index < $strlen; $count++)
            {
                $add = self::CodePointLength(ord($string[$index]));
                // The bounds test keeps a truncated trailing sequence from being
                // read past the end of the string.
                if ($add === false || $index + $add > $strlen)
                {
                    return false;
                }
                for ($check = $index + 1; $check < $index + $add; $check++)
                {
                    if ((ord($string[$check]) >> 6) !== 2)
                    {
                        return false;
                    }
                }
                $index += $add;
            }
            return $count;
        }

        /**
         * UTF-8 aware replacement of str_split.
         *
         * @param string $string Bytes to split.
         * @param int    $length Characters per segment; must be an integer
         *                       greater than zero.
         * @return array|false Segments in order (the last may be shorter), or
         *                     false after raising E_USER_WARNING when $length is
         *                     not acceptable.
         */
        public static function Split(/*string*/ $string, /*int*/ $length = 1)
        {
            if (intval($length) !== $length || $length < 1)
            {
                trigger_error('The length of each segment must be greater than zero', E_USER_WARNING);
                return false;
            }
            $strlen = strlen($string);
            $result = [];
            $index = 0;
            while ($index < $strlen)
            {
                $block = self::CharactersAt($string, $index, $length);
                $result[] = $block;
                $index += strlen($block);
            }
            return $result;
        }

        /**
         * UTF-8 aware replacement of substr.
         *
         * @param string   $string Bytes to cut from.
         * @param int      $start  Character offset; negative counts from the end
         *                         and is clamped to the start of the string.
         * @param int|null $length Number of characters; negative leaves that many
         *                         characters off the end; null means up to the
         *                         end of the string.
         * @return string|false The selected characters. false when $start cannot
         *                      be reached, when a negative $length exceeds the
         *                      number of characters in the string, or when a
         *                      negative $length ends before a non-negative $start.
         */
        public static function Substr($string, $start, $length = null)
        {
            $startIndex = self::CharacterIndex($string, $start, 0);
            if ($startIndex === null)
            {
                // Negative $start reaching before the first character.
                $startIndex = 0;
            }
            if ($startIndex === false)
            {
                return false;
            }
            if ($length === null)
            {
                return substr($string, $startIndex);
            }
            $endIndex = self::CharacterIndex($string, $length, $startIndex);
            if ($endIndex === null)
            {
                return false;
            }
            if ($endIndex === false)
            {
                // Fewer than $length characters could be walked: take the rest.
                return substr($string, $startIndex);
            }
            if ($endIndex < $startIndex)
            {
                return $start < 0 ? '' : false;
            }
            return substr($string, $startIndex, $endIndex - $startIndex);
        }

        //------------------------------------------------------------
        // Public (Constructor)
        //------------------------------------------------------------

        /**
         * Creating instances of this class is not allowed.
         *
         * Instantiation only reports an error through trigger_error; it does not
         * throw.
         */
        public function __construct()
        {
            trigger_error('Creating instances of '.__CLASS__.' is forbidden');
        }
    }