mirror of
https://github.com/php/php-src.git
synced 2025-08-16 05:58:45 +02:00
mb_detect_encoding is more accurate on strings with a UTF-8 or UTF-16 BOM
Thanks to the GitHub user 'titanz35' for pointing out that the new implementation of mb_detect_encoding had poor detection accuracy on UTF-8 and UTF-16 strings with a byte-order mark.
This commit is contained in:
parent
e8f14da4c5
commit
cb840799b4
3 changed files with 38 additions and 0 deletions
2
NEWS
2
NEWS
|
@ -51,6 +51,8 @@ PHP NEWS
|
|||
casing rules for the Greek letter sigma. For mb_convert_case, conditional
|
||||
casing only applies to MB_CASE_LOWER and MB_CASE_TITLE modes, not to
|
||||
MB_CASE_LOWER_SIMPLE and MB_CASE_TITLE_SIMPLE. (Alex Dowad)
|
||||
. mb_detect_encoding is better able to identify UTF-8 and UTF-16 strings
|
||||
with a byte-order mark. (Alex Dowad)
|
||||
|
||||
- Opcache:
|
||||
. Added start, restart and force restart time to opcache's
|
||||
|
|
|
@ -43,6 +43,7 @@
|
|||
#include "libmbfl/filters/mbfilter_uuencode.h"
|
||||
#include "libmbfl/filters/mbfilter_ucs4.h"
|
||||
#include "libmbfl/filters/mbfilter_utf8.h"
|
||||
#include "libmbfl/filters/mbfilter_utf16.h"
|
||||
#include "libmbfl/filters/mbfilter_singlebyte.h"
|
||||
#include "libmbfl/filters/translit_kana_jisx0201_jisx0208.h"
|
||||
|
||||
|
@ -2994,6 +2995,24 @@ static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len,
|
|||
data[i].in_len = in_len;
|
||||
data[i].state = 0;
|
||||
data[i].demerits = 0;
|
||||
|
||||
/* Skip byte order mark for UTF-8, UTF-16BE, or UTF-16LE */
|
||||
if (elist[i] == &mbfl_encoding_utf8) {
|
||||
if (in_len >= 3 && in[0] == 0xEF && in[1] == 0xBB && in[2] == 0xBF) {
|
||||
data[i].in_len -= 3;
|
||||
data[i].in += 3;
|
||||
}
|
||||
} else if (elist[i] == &mbfl_encoding_utf16be) {
|
||||
if (in_len >= 2 && in[0] == 0xFE && in[1] == 0xFF) {
|
||||
data[i].in_len -= 2;
|
||||
data[i].in += 2;
|
||||
}
|
||||
} else if (elist[i] == &mbfl_encoding_utf16le) {
|
||||
if (in_len >= 2 && in[0] == 0xFF && in[1] == 0xFE) {
|
||||
data[i].in_len -= 2;
|
||||
data[i].in += 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
unsigned int finished = 0; /* For how many candidate encodings have we processed all the input? */
|
||||
|
|
|
@ -100,6 +100,19 @@ try {
|
|||
echo $e->getMessage() . \PHP_EOL;
|
||||
}
|
||||
|
||||
echo "== BOM TEST ==\n";
|
||||
|
||||
$str = chr(239).chr(187).chr(191).chr(195).chr(180); // UTF-8 BOM followed by ô
|
||||
var_dump(mb_detect_encoding($str, ['UTF-8', 'ISO-8859-1'], true));
|
||||
// U+4E4E is the Chinese character 乎; both of its UTF-16 bytes are 0x4E, so normally it would be impossible to distinguish UTF-16LE from UTF-16BE
|
||||
// But the BOM can tell us which one it is
|
||||
var_dump(mb_detect_encoding("\xFE\xFF\x4E\x4E", ['UTF-8', 'ISO-8859-1', 'UTF-16LE', 'UTF-16BE'], true));
|
||||
var_dump(mb_detect_encoding("\xFF\xFE\x4E\x4E", ['UTF-8', 'ISO-8859-1', 'UTF-16LE', 'UTF-16BE'], true));
|
||||
// However, a BOM should only appear at the beginning of the string
|
||||
$detected = mb_detect_encoding("\x4E\x4E\xFE\xFF\x4E\x4E", ['UTF-8', 'ISO-8859-1', 'UTF-16LE', 'UTF-16BE'], true);
|
||||
if ($detected === 'UTF-16BE' || $detected === 'UTF-16LE')
|
||||
die("Don't accept a BOM in the middle of a string");
|
||||
|
||||
echo "== TORTURE TEST ==\n";
|
||||
|
||||
function test($strings, $encodings) {
|
||||
|
@ -373,5 +386,9 @@ SJIS: SJIS
|
|||
INT: EUC-JP
|
||||
EUC-JP: EUC-JP
|
||||
mb_detect_encoding(): Argument #2 ($encodings) contains invalid encoding "BAD"
|
||||
== BOM TEST ==
|
||||
string(5) "UTF-8"
|
||||
string(8) "UTF-16BE"
|
||||
string(8) "UTF-16LE"
|
||||
== TORTURE TEST ==
|
||||
Done!
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue