mirror of
https://github.com/php/php-src.git
synced 2025-08-17 14:38:49 +02:00

mbstring had an 'identify filter' for almost every supported text encoding which was used when auto-detecting the most likely encoding for a string. It would run over the string and set a 'flag' if it saw anything which did not appear likely to be the encoding in question. One problem with this scheme was that encodings which merely appeared less likely to be the correct one were completely rejected, even if there was no better candidate. Another problem was that the 'identify filters' had a huge amount of code duplication with the 'conversion filters'. Eliminate the identify filters. Instead, when auto-detecting text encoding, use conversion filters to see whether the input string is valid in candidate encodings or not. At the same time, watch the kinds of codepoints which the string decodes to and mark the candidate encoding as less likely if non-printable characters (ESC, form feed, bell, etc.) or 'private use area' codepoints are seen. Interestingly, one old test case in which JIS text was misidentified as UTF-8 (and this wrong behavior was enshrined in the test) was 'fixed' and the JIS string is now auto-detected as JIS.
110 lines
2.2 KiB
PHP
110 lines
2.2 KiB
PHP
--TEST--
|
||
mb_detect_encoding()
|
||
--SKIPIF--
|
||
<?php
// Skip this test entirely when the mbstring extension is not loaded.
if (!extension_loaded('mbstring')) {
    die('skip mbstring not available');
}
?>
|
||
--INI--
|
||
mbstring.language=Japanese
|
||
--FILE--
|
||
<?php
|
||
// TODO: Add more tests
|
||
|
||
// Fixture strings used by every section below.
// Shift_JIS sample: the raw bytes are kept as Base64 and decoded at runtime.
|
||
$sjis = base64_decode('k/qWe4zqg2WDTINYg2eCxYK3gUIwMTIzNIJUglWCVoJXgliBQg==');
|
||
// JIS sample: the raw bytes are kept as Base64 and decoded at runtime.
// The decoded data starts with the bytes 0x1B 0x24 0x42 (ESC $ B).
|
||
$jis = base64_decode('GyRCRnxLXDhsJUYlLSU5JUgkRyQ5ISMbKEIwMTIzNBskQiM1IzYjNyM4IzkhIxsoQg==');
|
||
// EUC-JP sample, written as a plain literal rather than Base64.
// NOTE(review): this presumably relies on the test file itself being stored
// in EUC-JP; if the file is re-saved in another encoding the EUC-JP
// expectations below would no longer describe this string - confirm.
|
||
$euc_jp = '日本語テキストです。0123456789。';
|
||
|
||
// Basic test: the candidate encodings are passed as a string, either a single
// name ('SJIS', 'JIS') or a comma-separated list.
|
||
// Note: For some reason it complains, results are different. Not researched.
// NOTE(review): the remark above is historical; it is unclear from this file
// what "complains" refers to.
|
||
echo "== BASIC TEST ==\n";
|
||
$s = $sjis;
|
||
$s = mb_detect_encoding($s, 'SJIS');
|
||
print("SJIS: $s\n");
|
||
|
||
$s = $jis;
|
||
$s = mb_detect_encoding($s, 'JIS');
|
||
print("JIS: $s\n");
|
||
|
||
$s = $euc_jp;
|
||
$s = mb_detect_encoding($s, 'UTF-8,EUC-JP,JIS');
|
||
print("EUC-JP: $s\n");
|
||
|
||
$s = $euc_jp;
|
||
$s = mb_detect_encoding($s, 'JIS,EUC-JP');
|
||
print("EUC-JP: $s\n");
|
||
|
||
|
||
|
||
// Using Encoding List Array: the same candidate list is passed as a PHP array
// for each of the three fixture strings.
|
||
echo "== ARRAY ENCODING LIST ==\n";
|
||
|
||
$a = array(0=>'UTF-8',1=>'EUC-JP', 2=>'SJIS', 3=>'JIS');
|
||
|
||
$s = $jis;
|
||
$s = mb_detect_encoding($s, $a);
|
||
print("JIS: $s\n");
|
||
|
||
$s = $euc_jp;
|
||
$s = mb_detect_encoding($s, $a);
|
||
print("EUC-JP: $s\n");
|
||
|
||
$s = $sjis;
|
||
$s = mb_detect_encoding($s, $a);
|
||
print("SJIS: $s\n");
|
||
|
||
|
||
// Using Detect Order: mb_detect_encoding() is called without an encoding list,
// so the candidates come from the detect order configured just below.
|
||
echo "== DETECT ORDER ==\n";
|
||
|
||
// NOTE(review): 'auto' is presumably resolved from mbstring.language, which
// the --INI-- section sets to Japanese - confirm against the mbstring docs.
mb_detect_order('auto');
|
||
|
||
|
||
$s = $jis;
|
||
$s = mb_detect_encoding($s);
|
||
print("JIS: $s\n");
|
||
|
||
$s = $euc_jp;
|
||
$s = mb_detect_encoding($s);
|
||
print("EUC-JP: $s\n");
|
||
|
||
$s = $sjis;
|
||
$s = mb_detect_encoding($s);
|
||
print("SJIS: $s\n");
|
||
|
||
|
||
// Unusual / invalid parameters: an int instead of a string, an empty string,
// and an unknown encoding name.
|
||
echo "== INVALID PARAMETER ==\n";
|
||
|
||
// An int is passed where a string is expected.
$s = mb_detect_encoding(1234, 'EUC-JP');
|
||
print("INT: $s\n"); // expected: EUC-JP
|
||
|
||
// Empty input with a single candidate encoding.
$s = mb_detect_encoding('', 'EUC-JP');
|
||
print("EUC-JP: $s\n"); // expected: EUC-JP
|
||
|
||
$s = $euc_jp;
|
||
// 'BAD' is not a valid encoding name; only the ValueError message is printed.
try {
|
||
var_dump(mb_detect_encoding($s, 'BAD'));
|
||
} catch (\ValueError $e) {
|
||
echo $e->getMessage() . \PHP_EOL;
|
||
}
|
||
|
||
?>
|
||
--EXPECT--
|
||
== BASIC TEST ==
|
||
SJIS: SJIS
|
||
JIS: JIS
|
||
EUC-JP: EUC-JP
|
||
EUC-JP: EUC-JP
|
||
== ARRAY ENCODING LIST ==
|
||
JIS: JIS
|
||
EUC-JP: EUC-JP
|
||
SJIS: SJIS
|
||
== DETECT ORDER ==
|
||
JIS: JIS
|
||
EUC-JP: EUC-JP
|
||
SJIS: SJIS
|
||
== INVALID PARAMETER ==
|
||
INT: EUC-JP
|
||
EUC-JP: EUC-JP
|
||
mb_detect_encoding(): Argument #2 ($encodings) contains invalid encoding "BAD"
|