|
|
@@ -0,0 +1,280 @@ |
|
|
<?php |
|
|
/** |
|
|
* CC-BY 3.0 Alfonso J. Ramos (theraot) |
|
|
* UTF8 |
|
|
*/ |
|
|
final class UTF8
{
    //------------------------------------------------------------
    // Private (Class)
    //------------------------------------------------------------

    /**
     * Number of bytes in the UTF-8 sequence introduced by a lead byte.
     *
     * @param int $ord Byte value (0..255) of the first byte of a sequence.
     * @return int|false 1 to 4, or false when the byte cannot start a
     *                   sequence (a continuation byte 10xxxxxx or 11111xxx).
     */
    private static function CodePointLength($ord)
    {
        if (($ord >> 7) === 0)
        {
            return 1; // 0xxxxxxx
        }
        if (($ord >> 5) === 6)
        {
            return 2; // 110xxxxx
        }
        if (($ord >> 4) === 14)
        {
            return 3; // 1110xxxx
        }
        if (($ord >> 3) === 30)
        {
            return 4; // 11110xxx
        }
        return false;
    }

    /**
     * Translates a character position into a byte index.
     *
     * For a non-negative $position the scan starts at byte $after and walks
     * forward $position characters. For a negative $position the scan starts
     * at the end of the string and walks backwards; $after is not used.
     *
     * Only lead bytes are inspected while walking forward; continuation
     * bytes are skipped without being validated.
     *
     * @param string $string   Byte string to scan.
     * @param int    $position Character offset; negative counts from the end.
     * @param int    $after    Byte index the forward scan starts from.
     * @return int|false|null  Byte index of the requested character (strlen
     *                         when it lies exactly at the end); false when an
     *                         invalid lead byte is met or the string holds
     *                         fewer than $position characters; null when a
     *                         negative $position reaches before the start.
     */
    private static function CharacterIndex($string, $position, $after)
    {
        $strlen = strlen($string);

        if ($position < 0)
        {
            for ($index = $strlen - 1; $index >= 0; $index--)
            {
                // Every byte that is not a continuation byte (10xxxxxx)
                // starts a character.
                if ((ord($string[$index]) >> 6) !== 2)
                {
                    $position++;
                }
                if ($position === 0)
                {
                    return $index;
                }
            }
            return null;
        }

        $count = 0;
        for ($index = $after; $index < $strlen; $count++)
        {
            if ($count === $position)
            {
                return $index;
            }
            $add = self::CodePointLength(ord($string[$index]));
            if ($add === false)
            {
                return false;
            }
            $index += $add;
        }

        if ($count < $position)
        {
            return false;
        }
        return $strlen;
    }

    //------------------------------------------------------------
    // Public (Class)
    //------------------------------------------------------------

    /**
     * UTF-8 aware replacement of chr.
     *
     * @param int $codepoint Code point to encode; converted with intval().
     * @return string|null The 1 to 4 byte UTF-8 encoding, or null when the
     *                     value is negative or greater than 0x10FFFF.
     */
    public static function Character(/*int*/ $codepoint)
    {
        $codepoint = intval($codepoint);

        if ($codepoint < 0 || $codepoint > 0x10FFFF)
        {
            return null;
        }
        // Each upper bound is inclusive: 0x7F, 0x7FF and 0xFFFF are the
        // largest values that fit in one, two and three bytes respectively.
        if ($codepoint <= 0x7F)
        {
            return chr($codepoint);
        }
        if ($codepoint <= 0x7FF)
        {
            return chr(0xC0 | ($codepoint >> 6))
                .chr(0x80 | ($codepoint & 0x3F));
        }
        if ($codepoint <= 0xFFFF)
        {
            return chr(0xE0 | ($codepoint >> 12))
                .chr(0x80 | (($codepoint >> 6) & 0x3F))
                .chr(0x80 | ($codepoint & 0x3F));
        }
        return chr(0xF0 | ($codepoint >> 18))
            .chr(0x80 | (($codepoint >> 12) & 0x3F))
            .chr(0x80 | (($codepoint >> 6) & 0x3F))
            .chr(0x80 | ($codepoint & 0x3F));
    }

    /**
     * Returns the single character that starts at a byte index.
     *
     * @param string $string Byte string to read from.
     * @param int    $index  Byte index of the character's lead byte.
     * @return string The bytes of that character, or '' when no character
     *                can be read there (past the end or invalid lead byte).
     */
    public static function CharacterAt($string, $index)
    {
        $nextIndex = self::CharacterIndex($string, 1, $index);
        if ($nextIndex === false)
        {
            return '';
        }
        return substr($string, $index, $nextIndex - $index);
    }

    /**
     * Returns up to $length characters starting at a byte index.
     *
     * @param string $string Byte string to read from.
     * @param int    $index  Byte index to start at.
     * @param int    $length Number of characters wanted.
     * @return string The requested characters; when the end index cannot be
     *                resolved, everything from $index to the end.
     */
    public static function CharactersAt($string, $index, $length)
    {
        $nextIndex = self::CharacterIndex($string, $length, $index);
        if ($nextIndex === false)
        {
            return substr($string, $index);
        }
        return substr($string, $index, $nextIndex - $index);
    }

    /**
     * UTF-8 aware replacement of ord.
     *
     * Decodes the first character of $character; any bytes after it are
     * ignored.
     *
     * @param string $character String beginning with the character to decode.
     * @return int|false The code point, or false when the string is empty,
     *                   starts with an invalid lead byte, is shorter than the
     *                   lead byte announces, or holds a byte that is not a
     *                   continuation byte where one is required.
     */
    public static function CodePoint(/*string*/ $character)
    {
        $character = (string) $character;
        $available = strlen($character);
        if ($available === 0)
        {
            return false;
        }

        $lead = ord($character[0]);
        $size = self::CodePointLength($lead);
        if ($size === false || $size > $available)
        {
            return false;
        }
        if ($size === 1)
        {
            return $lead;
        }

        // Drop the length marker: 5, 4 or 3 payload bits remain in the lead
        // byte of a 2, 3 or 4 byte sequence.
        $codepoint = $lead & (0xFF >> ($size + 1));
        for ($offset = 1; $offset < $size; $offset++)
        {
            $byte = ord($character[$offset]);
            if (($byte & 0xC0) !== 0x80)
            {
                return false;
            }
            $codepoint = ($codepoint << 6) | ($byte & 0x3F);
        }
        return $codepoint;
    }

    /**
     * Iterates over the characters of a string.
     *
     * @param string $string Byte string to walk.
     * @return Generator Yields each character in order; iteration ends
     *                   silently at the first position where no character
     *                   can be read.
     */
    public static function Enumerate($string)
    {
        $strlen = strlen($string);
        $index = 0;
        while ($index < $strlen)
        {
            $chr = self::CharacterAt($string, $index);
            if ($chr === '')
            {
                return;
            }
            $index += strlen($chr);
            yield $chr;
        }
    }

    /**
     * Checks that every byte of a string is 7-bit ASCII.
     *
     * Note that success is reported as a count, not as true: an empty string
     * yields 0, so compare the result against false with ===.
     *
     * @param string $string Byte string to check.
     * @return int|false The number of characters when all of them are ASCII,
     *                   false as soon as a byte above 0x7F is found.
     */
    public static function IsASCII($string)
    {
        $strlen = strlen($string);
        for ($index = 0; $index < $strlen; $index++)
        {
            if (ord($string[$index]) > 0x7F)
            {
                return false;
            }
        }
        // All characters are one byte long, so the character count equals
        // the byte count.
        return $strlen;
    }

    /**
     * Tells whether a string is structurally valid UTF-8.
     *
     * @param string $string Byte string to check.
     * @return bool True when Length() can measure the string.
     */
    public static function IsUTF8($string)
    {
        return self::Length($string) !== false;
    }

    /**
     * Counts the characters of a UTF-8 string.
     *
     * The check is structural: each lead byte must be followed by the
     * announced number of continuation bytes (0x80..0xBF). Overlong forms,
     * surrogates and lead bytes 0xF5..0xF7 are not rejected.
     *
     * @param string $string Byte string to measure.
     * @return int|false Number of characters, or false when the string
     *                   contains an invalid lead byte, a missing or invalid
     *                   continuation byte, or a truncated final sequence.
     */
    public static function Length($string)
    {
        $strlen = strlen($string);
        $count = 0;
        for ($index = 0; $index < $strlen; $count++)
        {
            $add = self::CodePointLength(ord($string[$index]));
            // Reject a sequence that would run past the end before reading
            // its continuation bytes.
            if ($add === false || $index + $add > $strlen)
            {
                return false;
            }
            for ($check = $index + 1; $check < $index + $add; $check++)
            {
                $ord = ord($string[$check]);
                if ($ord < 0x80 || $ord > 0xbf)
                {
                    return false;
                }
            }
            $index += $add;
        }
        return $count;
    }

    /**
     * UTF-8 aware replacement of str_split.
     *
     * @param string $string Byte string to cut.
     * @param int    $length Characters per segment; must be an integer
     *                       greater than zero.
     * @return array|false List of segments (the last one may be shorter), or
     *                     false after raising E_USER_WARNING when $length is
     *                     not a positive integer.
     */
    public static function Split(/*string*/ $string, /*int*/ $length = 1)
    {
        if (intval($length) !== $length || $length < 1)
        {
            trigger_error('The length of each segment must be greater than zero', E_USER_WARNING);
            return false;
        }

        $strlen = strlen($string);
        $result = [];
        for ($index = 0; $index < $strlen; $index += strlen($block))
        {
            $block = self::CharactersAt($string, $index, $length);
            $result[] = $block;
        }
        return $result;
    }

    /**
     * UTF-8 aware replacement of substr.
     *
     * @param string   $string Byte string to cut.
     * @param int      $start  Character offset; negative counts from the end.
     * @param int|null $length Character count; negative means "stop that many
     *                         characters before the end"; null means "to the
     *                         end of the string".
     * @return string|false The selected characters, or false when the start
     *                      or the end cannot be resolved.
     */
    public static function Substr($string, $start, $length = null)
    {
        $startIndex = self::CharacterIndex($string, $start, 0);
        if ($startIndex === null)
        {
            // A negative start reaching before the beginning is clamped.
            $startIndex = 0;
        }
        if ($startIndex === false)
        {
            return false;
        }
        if ($length === null)
        {
            return substr($string, $startIndex);
        }

        $endIndex = self::CharacterIndex($string, $length, $startIndex);
        if ($endIndex === null)
        {
            return false;
        }
        if ($endIndex === false)
        {
            // Fewer characters remain than requested: take what is left.
            return substr($string, $startIndex);
        }
        if ($endIndex < $startIndex)
        {
            return $start < 0 ? '' : false;
        }
        return substr($string, $startIndex, $endIndex - $startIndex);
    }

    //------------------------------------------------------------
    // Public (Constructor)
    //------------------------------------------------------------

    /**
     * Creating instances of this class is not allowed.
     *
     * The constructor only raises a notice through trigger_error(); it does
     * not prevent the object from being created.
     */
    public function __construct()
    {
        trigger_error('Creating instances of '.__CLASS__.' is forbidden');
    }
}