mirror of
https://github.com/php/php-src.git
synced 2025-08-16 22:18:50 +02:00
Add test suite for UTF-{7,8,16,32}
Also fix a couple of small problems with UTF-32 and UTF-8 support:
- UTF-32 would pass very large codepoints (>= 0x80000000), which are not valid.
- UTF-8 would sometimes emit two error marker characters for a single bad input byte.
This commit is contained in:
parent
cc0aac3f73
commit
7502c86342
4 changed files with 1048 additions and 35 deletions
|
@ -131,7 +131,7 @@ const struct mbfl_convert_vtbl vtbl_wchar_utf32le = {
|
||||||
|
|
||||||
static int emit_char_if_valid(int n, mbfl_convert_filter *filter)
|
static int emit_char_if_valid(int n, mbfl_convert_filter *filter)
|
||||||
{
|
{
|
||||||
if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xD800 || n > 0xDFFF)) {
|
if (n >= 0 && n < MBFL_WCSPLANE_UTF32MAX && (n < 0xD800 || n > 0xDFFF)) {
|
||||||
CK((*filter->output_function)(n, filter->data));
|
CK((*filter->output_function)(n, filter->data));
|
||||||
} else {
|
} else {
|
||||||
n = (n & MBFL_WCSGROUP_MASK) | MBFL_WCSGROUP_THROUGH;
|
n = (n & MBFL_WCSGROUP_MASK) | MBFL_WCSGROUP_THROUGH;
|
||||||
|
|
|
@ -86,19 +86,11 @@ const struct mbfl_convert_vtbl vtbl_wchar_utf8 = {
|
||||||
|
|
||||||
int mbfl_filt_put_invalid_char(int c, mbfl_convert_filter *filter)
|
int mbfl_filt_put_invalid_char(int c, mbfl_convert_filter *filter)
|
||||||
{
|
{
|
||||||
int w;
|
filter->status = filter->cache = 0;
|
||||||
w = c & MBFL_WCSGROUP_MASK;
|
CK((*filter->output_function)((c & MBFL_WCSGROUP_MASK) | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||||
w |= MBFL_WCSGROUP_THROUGH;
|
|
||||||
filter->status = 0;
|
|
||||||
filter->cache = 0;
|
|
||||||
CK((*filter->output_function)(w, filter->data));
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
* UTF-8 => wchar
|
|
||||||
*/
|
|
||||||
int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
|
int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
|
||||||
{
|
{
|
||||||
int s, c1;
|
int s, c1;
|
||||||
|
@ -131,6 +123,7 @@ retry:
|
||||||
CK((*filter->output_function)(s, filter->data));
|
CK((*filter->output_function)(s, filter->data));
|
||||||
} else {
|
} else {
|
||||||
CK(mbfl_filt_put_invalid_char(filter->cache, filter));
|
CK(mbfl_filt_put_invalid_char(filter->cache, filter));
|
||||||
|
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4))
|
||||||
goto retry;
|
goto retry;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -146,6 +139,7 @@ retry:
|
||||||
filter->status++;
|
filter->status++;
|
||||||
} else {
|
} else {
|
||||||
CK(mbfl_filt_put_invalid_char(filter->cache, filter));
|
CK(mbfl_filt_put_invalid_char(filter->cache, filter));
|
||||||
|
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4))
|
||||||
goto retry;
|
goto retry;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -161,6 +155,7 @@ retry:
|
||||||
filter->status++;
|
filter->status++;
|
||||||
} else {
|
} else {
|
||||||
CK(mbfl_filt_put_invalid_char(filter->cache, filter));
|
CK(mbfl_filt_put_invalid_char(filter->cache, filter));
|
||||||
|
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4))
|
||||||
goto retry;
|
goto retry;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -170,6 +165,7 @@ retry:
|
||||||
filter->status++;
|
filter->status++;
|
||||||
} else {
|
} else {
|
||||||
CK(mbfl_filt_put_invalid_char(filter->cache, filter));
|
CK(mbfl_filt_put_invalid_char(filter->cache, filter));
|
||||||
|
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4))
|
||||||
goto retry;
|
goto retry;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -183,27 +179,21 @@ retry:
|
||||||
|
|
||||||
int mbfl_filt_conv_utf8_wchar_flush(mbfl_convert_filter *filter)
|
int mbfl_filt_conv_utf8_wchar_flush(mbfl_convert_filter *filter)
|
||||||
{
|
{
|
||||||
int status, cache;
|
int status = filter->status, cache = filter->cache;
|
||||||
|
|
||||||
status = filter->status;
|
filter->status = filter->cache = 0;
|
||||||
cache = filter->cache;
|
|
||||||
|
|
||||||
filter->status = 0;
|
if (status) {
|
||||||
filter->cache = 0;
|
|
||||||
|
|
||||||
if (status != 0) {
|
|
||||||
CK(mbfl_filt_put_invalid_char(cache, filter));
|
CK(mbfl_filt_put_invalid_char(cache, filter));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (filter->flush_function != NULL) {
|
if (filter->flush_function) {
|
||||||
(*filter->flush_function)(filter->data);
|
(*filter->flush_function)(filter->data);
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* wchar => UTF-8
|
|
||||||
*/
|
|
||||||
int mbfl_filt_conv_wchar_utf8(int c, mbfl_convert_filter *filter)
|
int mbfl_filt_conv_wchar_utf8(int c, mbfl_convert_filter *filter)
|
||||||
{
|
{
|
||||||
if (c >= 0 && c < 0x110000) {
|
if (c >= 0 && c < 0x110000) {
|
||||||
|
|
|
@ -22,28 +22,28 @@ var_dump(chk_enc("\x31\x32\x33", 0));
|
||||||
var_dump(chk_enc("\x41\x42\x43", 0));
|
var_dump(chk_enc("\x41\x42\x43", 0));
|
||||||
var_dump(chk_enc("\xc0\xb1\xc0\xb2\xc0\xb3", 6));
|
var_dump(chk_enc("\xc0\xb1\xc0\xb2\xc0\xb3", 6));
|
||||||
var_dump(chk_enc("\xc1\x81\xc1\x82\xc1\x83", 6));
|
var_dump(chk_enc("\xc1\x81\xc1\x82\xc1\x83", 6));
|
||||||
var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 9));
|
var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 6));
|
||||||
var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 9));
|
var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 6));
|
||||||
var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 12));
|
var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 9));
|
||||||
var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 11));
|
var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 8));
|
||||||
var_dump(chk_enc("\xf8\x80\x80\x80\xb1\xf8\x80\x80\x80\xb2\xf8\x80\x80\x80\xb3", 15));
|
var_dump(chk_enc("\xf8\x80\x80\x80\xb1\xf8\x80\x80\x80\xb2\xf8\x80\x80\x80\xb3", 15));
|
||||||
var_dump(chk_enc("\xf8\x80\x80\x81\x81\xf8\x80\x80\x81\x82\xf8\x80\x80\x81\x83", 15));
|
var_dump(chk_enc("\xf8\x80\x80\x81\x81\xf8\x80\x80\x81\x82\xf8\x80\x80\x81\x83", 15));
|
||||||
var_dump(chk_enc("\xfc\x80\x80\x80\x80\xb1\xfc\x80\x80\x80\x80\xb2\xfc\x80\x80\x80\x80\xb3", 18));
|
var_dump(chk_enc("\xfc\x80\x80\x80\x80\xb1\xfc\x80\x80\x80\x80\xb2\xfc\x80\x80\x80\x80\xb3", 18));
|
||||||
var_dump(chk_enc("\xfc\x80\x80\x80\x81\x81\xfc\x80\x80\x80\x81\x82\xfc\x80\x80\x80\x81\x83", 18));
|
var_dump(chk_enc("\xfc\x80\x80\x80\x81\x81\xfc\x80\x80\x80\x81\x82\xfc\x80\x80\x80\x81\x83", 18));
|
||||||
|
|
||||||
var_dump(chk_enc("\xc2\xa2\xc2\xa3\xc2\xa5", 0));
|
var_dump(chk_enc("\xc2\xa2\xc2\xa3\xc2\xa5", 0));
|
||||||
var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 9));
|
var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 6));
|
||||||
var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 12));
|
var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 9));
|
||||||
var_dump(chk_enc("\xf8\x80\x80\x82\xa2\xf8\x80\x80\x82\xa3\xf8\x80\x80\x82\xa5", 15));
|
var_dump(chk_enc("\xf8\x80\x80\x82\xa2\xf8\x80\x80\x82\xa3\xf8\x80\x80\x82\xa5", 15));
|
||||||
var_dump(chk_enc("\xfc\x80\x80\x80\x82\xa2\xfc\x80\x80\x80\x82\xa3\xfc\x80\x80\x80\x82\xa5", 18));
|
var_dump(chk_enc("\xfc\x80\x80\x80\x82\xa2\xfc\x80\x80\x80\x82\xa3\xfc\x80\x80\x80\x82\xa5", 18));
|
||||||
|
|
||||||
var_dump(chk_enc("\xc1\xbf", 2));
|
var_dump(chk_enc("\xc1\xbf", 2));
|
||||||
var_dump(chk_enc("\xc2\x80", 0));
|
var_dump(chk_enc("\xc2\x80", 0));
|
||||||
var_dump(chk_enc("\xdf\xbf", 0));
|
var_dump(chk_enc("\xdf\xbf", 0));
|
||||||
var_dump(chk_enc("\xe0\x9f\xff", 3));
|
var_dump(chk_enc("\xe0\x9f\xff", 2));
|
||||||
var_dump(chk_enc("\xe0\xa0\x80", 2));
|
var_dump(chk_enc("\xe0\xa0\x80", 2));
|
||||||
var_dump(chk_enc("\xef\xbf\xbf", 0));
|
var_dump(chk_enc("\xef\xbf\xbf", 0));
|
||||||
var_dump(chk_enc("\xf0\x8f\xbf\xbf", 4));
|
var_dump(chk_enc("\xf0\x8f\xbf\xbf", 3));
|
||||||
var_dump(chk_enc("\xf0\x90\x80\x80", 0));
|
var_dump(chk_enc("\xf0\x90\x80\x80", 0));
|
||||||
var_dump(chk_enc("\xf7\xbf\xbf\xbf", 4));
|
var_dump(chk_enc("\xf7\xbf\xbf\xbf", 4));
|
||||||
var_dump(chk_enc("\xf8\x87\xbf\xbf\xbf", 5));
|
var_dump(chk_enc("\xf8\x87\xbf\xbf\xbf", 5));
|
||||||
|
@ -58,7 +58,7 @@ echo "UTF-8 and surrogates area\n";
|
||||||
$out = '';
|
$out = '';
|
||||||
$cnt = 0;
|
$cnt = 0;
|
||||||
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
|
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
|
||||||
$s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 3);
|
$s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 2);
|
||||||
if ($s === false) {
|
if ($s === false) {
|
||||||
$cnt++;
|
$cnt++;
|
||||||
} else {
|
} else {
|
||||||
|
|
1023
ext/mbstring/tests/utf_encodings.phpt
Normal file
1023
ext/mbstring/tests/utf_encodings.phpt
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue