11 const WSC_CHARSET =
'UTF-8';
13 static public $aliases = array(
14 'USASCII' =>
'WINDOWS-1252',
15 'ANSIX31101983' =>
'WINDOWS-1252',
16 'ANSIX341968' =>
'WINDOWS-1252',
17 'UNKNOWN8BIT' =>
'ISO-8859-15',
18 'UNKNOWN' =>
'ISO-8859-15',
19 'USERDEFINED' =>
'ISO-8859-15',
20 'KSC56011987' =>
'EUC-KR',
24 'UTF7IMAP' =>
'UTF7-IMAP',
25 'TIS620' =>
'WINDOWS-874',
26 'ISO88599' =>
'WINDOWS-1254',
27 'ISO885911' =>
'WINDOWS-874',
28 'MACROMAN' =>
'MACINTOSH',
35 '161' =>
'WINDOWS-1253',
36 '162' =>
'WINDOWS-1254',
37 '163' =>
'WINDOWS-1258',
38 '177' =>
'WINDOWS-1255',
39 '178' =>
'WINDOWS-1256',
40 '186' =>
'WINDOWS-1257',
41 '204' =>
'WINDOWS-1251',
42 '222' =>
'WINDOWS-874',
43 '238' =>
'WINDOWS-1250',
45 'WINDOWS949' =>
'UHC',
55 throw new \Exception($errstr, 0, $errno);
68 static $charsets = array();
69 $m = array(null, null, null);
70 $charset = strtoupper($input);
71 if (isset($charsets[$input]))
73 return $charsets[$input];
75 $charset = preg_replace(array(
81 if ($charset ==
'BINARY')
83 return $charsets[$input] = null;
86 $str = preg_replace(
'/[^A-Z0-9]/',
'', $charset);
87 if (isset(self::$aliases[$str]))
89 $result = self::$aliases[$str];
92 else if (preg_match(
'/U[A-Z][A-Z](7|8|16|32)(BE|LE)*/', $str, $m))
94 $result =
'UTF-' . (isset($m[1])?$m[1]:
'') . (isset($m[2])?$m[2]:
'');
97 else if (preg_match(
'/ISO8859([0-9]{0,2})/', $str, $m))
99 $iso =
'ISO-8859-' . isset($m[1])?$m[1]:
'1';
102 $result = $iso ==
'ISO-8859-1' ?
'WINDOWS-1252' : $iso;
105 else if (preg_match(
'/(WIN|WINDOWS)([0-9]+)/', $str, $m))
107 $result =
'WINDOWS-' . isset($m[2])?$m[2]:
'';
110 else if (preg_match(
'/LATIN(.*)/', $str, $m))
112 $aliases = array(
'2' => 2,
'3' => 3,
'4' => 4,
'5' => 9,
'6' => 10,
113 '7' => 13,
'8' => 14,
'9' => 15,
'10' => 16,
114 'ARABIC' => 6,
'CYRILLIC' => 5,
'GREEK' => 7,
'GREEK1' => 7,
'HEBREW' => 8
120 $result =
'WINDOWS-1252';
123 else if (!empty($aliases[$m[1]]))
125 $result =
'ISO-8859-'.$aliases[$m[1]];
137 $charsets[$input] = $result;
152 public static function convert($str, $from, $to = null)
154 static $iconv_options = null;
155 static $mbstring_list = null;
156 static $mbstring_sch = null;
157 $to = empty($to) ? self::WSC_CHARSET : strtoupper($to);
158 $from = self::parse_charset($from);
163 if ($from ==
'UTF-16' && !preg_match(
'/[^\x00-\x7F]/', $str)) {
166 if ($from == $to || empty($str) || empty($from)) {
169 if ($iconv_options === null) {
170 if (function_exists(
'iconv')) {
172 $iconv_options =
'//IGNORE';
173 if (iconv(
'', $iconv_options,
'') ===
false) {
179 $iconv_options =
false;
183 if ($iconv_options !==
false && $from !=
'UTF7-IMAP' && $to !=
'UTF7-IMAP') {
186 $out = @iconv($from, $to . $iconv_options, $str);
191 restore_error_handler();
192 if ($out !==
false) {
196 if ($mbstring_list === null) {
197 if (extension_loaded(
'mbstring')) {
198 $mbstring_sch = mb_substitute_character();
199 $mbstring_list = mb_list_encodings();
200 $mbstring_list = array_map(
'strtoupper', $mbstring_list);
203 $mbstring_list =
false;
208 if ($mbstring_list !==
false) {
209 $aliases[
'WINDOWS-1257'] =
'ISO-8859-13';
211 if (($from ==
'US-ASCII' || $to ==
'US-ASCII') && !in_array(
'US-ASCII', $mbstring_list)) {
212 $aliases[
'US-ASCII'] =
'ASCII';
214 $mb_from = $aliases[$from] ?: $from;
215 $mb_to = $aliases[$to] ?: $to;
217 if (in_array($mb_from, $mbstring_list) && in_array($mb_to, $mbstring_list)) {
219 mb_substitute_character(
'none');
221 $out = mb_convert_encoding($str, $mb_to, $mb_from);
226 restore_error_handler();
227 mb_substitute_character($mbstring_sch);
228 if ($out !==
false) {
234 if ($to ==
'UTF-8') {
235 if ($from ==
'UTF7-IMAP') {
236 if ($out = self::utf7imap_to_utf8($str)) {
240 else if ($from ==
'UTF-7') {
241 if ($out = self::utf7_to_utf8($str)) {
245 else if ($from ==
'ISO-8859-1' && function_exists(
'utf8_encode')) {
246 return utf8_encode($str);
250 if ($from ==
'UTF-8') {
252 if ($to ==
'UTF7-IMAP' || $to ==
'UTF-7') {
253 if ($out = self::utf8_to_utf7imap($str)) {
257 else if ($to ==
'ISO-8859-1' && function_exists(
'utf8_decode')) {
258 return utf8_decode($str);
262 trigger_error(
"No suitable function found for '$from' to '$to' conversion");
277 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
278 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
279 0,0,0,0, 0,0,0,0, 0,0,0,1, 0,0,0,0,
280 1,1,1,1, 1,1,1,1, 1,1,0,0, 0,0,0,0,
281 0,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
282 1,1,1,1, 1,1,1,1, 1,1,1,0, 0,0,0,0,
283 0,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
284 1,1,1,1, 1,1,1,1, 1,1,1,0, 0,0,0,0,
286 $u7len = strlen($str);
289 for ($i=0; $u7len > 0; $i++, $u7len--) {
295 for (; $u7len > 0; $i++, $u7len--) {
297 if (!$Index_64[ord($u7)]) {
308 $res .= self::utf16_to_utf8(base64_decode($ch));
327 for ($i = 0; $i < $len; $i += 2) {
328 $c = ord($str[$i]) << 8 | ord($str[$i + 1]);
329 if ($c >= 0x0001 && $c <= 0x007F) {
332 else if ($c > 0x07FF) {
333 $dec .= chr(0xE0 | (($c >> 12) & 0x0F));
334 $dec .= chr(0x80 | (($c >> 6) & 0x3F));
335 $dec .= chr(0x80 | (($c >> 0) & 0x3F));
338 $dec .= chr(0xC0 | (($c >> 6) & 0x1F));
339 $dec .= chr(0x80 | (($c >> 0) & 0x3F));
361 -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
362 -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
363 -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, 63,-1,-1,-1,
364 52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
365 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14,
366 15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
367 -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
368 41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1
370 $u7len = strlen($str);
374 for ($i=0; $u7len > 0; $i++, $u7len--) {
380 if ($u7len && $u7 ==
'-') {
386 for (; $u7len > 0; $i++, $u7len--) {
388 if ((ord($u7) & 0x80) || ($b = $Index_64[ord($u7)]) == -1) {
399 if (0x20 <= $ch && $ch < 0x7f) {
404 else if ($ch < 0x800) {
405 $p .= chr(0xc0 | ($ch >> 6));
406 $p .= chr(0x80 | ($ch & 0x3f));
409 $p .= chr(0xe0 | ($ch >> 12));
410 $p .= chr(0x80 | (($ch >> 6) & 0x3f));
411 $p .= chr(0x80 | ($ch & 0x3f));
413 $ch = ($b << (16 + $k)) & 0xffff;
422 if (!$u7len || $u7 !=
'-') {
426 if ($u7len > 2 && $str[$i+1] ==
'&' && $str[$i+2] !=
'-') {
431 else if (ord($u7) < 0x20 || ord($u7) >= 0x7f) {
454 'A',
'B',
'C',
'D',
'E',
'F',
'G',
'H',
'I',
'J',
'K',
'L',
'M',
'N',
'O',
455 'P',
'Q',
'R',
'S',
'T',
'U',
'V',
'W',
'X',
'Y',
'Z',
'a',
'b',
'c',
'd',
456 'e',
'f',
'g',
'h',
'i',
'j',
'k',
'l',
'm',
'n',
'o',
'p',
'q',
'r',
's',
457 't',
'u',
'v',
'w',
'x',
'y',
'z',
'0',
'1',
'2',
'3',
'4',
'5',
'6',
'7',
460 $u8len = strlen($str);
472 else if ($c < 0xc2) {
475 else if ($c < 0xe0) {
479 else if ($c < 0xf0) {
483 else if ($c < 0xf8) {
487 else if ($c < 0xfc) {
491 else if ($c < 0xfe) {
503 for ($j=0; $j < $n; $j++) {
504 $o = ord($str[$i+$j]);
505 if (($o & 0xc0) != 0x80) {
508 $ch = ($ch << 6) | ($o & 0x3f);
510 if ($n > 1 && !($ch >> ($n * 5 + 1))) {
515 if ($ch < 0x20 || $ch >= 0x7f) {
525 $p .= $B64Chars[($b | $ch >> $k)];
527 for (; $k >= 0; $k -= 6) {
528 $p .= $B64Chars[(($ch >> $k) & 0x3f)];
530 $b = ($ch << (-$k)) & 0x3f;
542 if (chr($ch) ==
'&') {
564 public static function detect($string, $failover = null, $language =
'es_ES')
566 if (substr($string, 0, 4) ==
"\0\0\xFE\xFF")
return 'UTF-32BE';
567 if (substr($string, 0, 4) ==
"\xFF\xFE\0\0")
return 'UTF-32LE';
568 if (substr($string, 0, 2) ==
"\xFE\xFF")
return 'UTF-16BE';
569 if (substr($string, 0, 2) ==
"\xFF\xFE")
return 'UTF-16LE';
570 if (substr($string, 0, 3) ==
"\xEF\xBB\xBF")
return 'UTF-8';
572 if ($string[0] ==
"\0" && $string[1] ==
"\0" && $string[2] ==
"\0" && $string[3] !=
"\0")
return 'UTF-32BE';
573 if ($string[0] !=
"\0" && $string[1] ==
"\0" && $string[2] ==
"\0" && $string[3] ==
"\0")
return 'UTF-32LE';
574 if ($string[0] ==
"\0" && $string[1] !=
"\0" && $string[2] ==
"\0" && $string[3] !=
"\0")
return 'UTF-16BE';
575 if ($string[0] !=
"\0" && $string[1] ==
"\0" && $string[2] !=
"\0" && $string[3] ==
"\0")
return 'UTF-16LE';
580 $prio = array(
'ISO-2022-JP',
'JIS',
'UTF-8',
'EUC-JP',
'eucJP-win',
'SJIS',
'SJIS-win');
584 $prio = array(
'UTF-8',
'BIG-5',
'GB2312',
'EUC-TW');
587 $prio = array(
'UTF-8',
'EUC-KR',
'ISO-2022-KR');
590 $prio = array(
'UTF-8',
'WINDOWS-1251',
'KOI8-R');
593 $prio = array(
'UTF-8',
'ISO-8859-9',
'WINDOWS-1254');
598 if ($prio && function_exists(
'mb_check_encoding')) {
599 foreach ($prio as $encoding) {
600 if (mb_check_encoding($string, $encoding)) {
605 if (function_exists(
'mb_detect_encoding')) {
607 $prio = array(
'UTF-8',
'SJIS',
'GB2312',
608 'ISO-8859-1',
'ISO-8859-2',
'ISO-8859-3',
'ISO-8859-4',
609 'ISO-8859-5',
'ISO-8859-6',
'ISO-8859-7',
'ISO-8859-8',
'ISO-8859-9',
610 'ISO-8859-10',
'ISO-8859-13',
'ISO-8859-14',
'ISO-8859-15',
'ISO-8859-16',
611 'WINDOWS-1252',
'WINDOWS-1251',
'EUC-JP',
'EUC-TW',
'KOI8-R',
'BIG-5',
612 'ISO-2022-KR',
'ISO-2022-JP',
615 $encodings = array_unique(array_merge($prio, mb_list_encodings()));
616 if ($encoding = mb_detect_encoding($string, $encodings)) {
623 [\x09\x0A\x0D\x20-\x7E] 624 | [\xC2-\xDF][\x80-\xBF] 625 | \xE0[\xA0-\xBF][\x80-\xBF] 626 | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} 627 | \xED[\x80-\x9F][\x80-\xBF] 628 | \xF0[\x90-\xBF][\x80-\xBF]{2} 629 | [\xF1-\xF3][\x80-\xBF]{3} 630 | \xF4[\x80-\x8F][\x80-\xBF]{2} 631 )*\z/xs', substr($string, 0, 2048))
644 public static function clean($input)
647 if (is_array($input)) {
648 foreach ($input as $idx => $val) {
649 $input[$idx] = self::clean($val);
653 if (!is_string($input) || $input ==
'') {
657 if (function_exists(
'mb_convert_encoding')) {
658 $msch = mb_substitute_character();
659 mb_substitute_character(
'none');
660 $res = mb_convert_encoding($input,
'UTF-8',
'UTF-8');
661 mb_substitute_character($msch);
662 if ($res !==
false) {
666 if (function_exists(
'iconv')) {
667 if (($res = @iconv(
'UTF-8',
'UTF-8//IGNORE', $input)) !==
false) {
675 '|[\xC2-\xDF][\x80-\xBF]'.
676 '|\xE0[\xA0-\xBF][\x80-\xBF]'.
677 '|[\xE1-\xEC][\x80-\xBF][\x80-\xBF]'.
678 '|\xED[\x80-\x9F][\x80-\xBF]'.
679 '|[\xEE-\xEF][\x80-\xBF][\x80-\xBF]'.
680 '|\xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]'.
681 '|[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]'.
682 '|\xF4[\x80-\x8F][\x80-\xBF][\x80-\xBF]'.
684 for ($i = 0, $len = strlen($input); $i < $len; $i++) {
690 $out .= preg_match($regexp, $seq) ? $seq :
'';
696 else if ($ord >= 0xC0) {
698 $out .= preg_match($regexp, $seq) ? $seq :
'';
704 else if ($seq !==
'') {
709 $out .= preg_match($regexp, $seq) ? $seq :
'';
static utf8_to_utf7imap($str)
static utf7_to_utf8($str)
static parse_charset($input)
static convert($str, $from, $to=null)
static utf7imap_to_utf8($str)
static utf16_to_utf8($str)
static error_handler($errno, $errstr)
static detect($string, $failover=null, $language= 'es_ES')