From 2cc6462edc79b711930368173119d0bf045d8aa9 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 17 Oct 2018 12:10:16 +0200 Subject: [PATCH 1/3] Add vtbls for EUC-TW encoding --- ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c index 9b28556dee6..141a2fee7c5 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c @@ -66,7 +66,9 @@ const mbfl_encoding mbfl_encoding_euc_tw = { "EUC-TW", (const char *(*)[])&mbfl_encoding_euc_tw_aliases, mblen_table_euctw, - MBFL_ENCTYPE_MBCS + MBFL_ENCTYPE_MBCS, + &vtbl_euctw_wchar, + &vtbl_wchar_euctw }; const struct mbfl_identify_vtbl vtbl_identify_euctw = { From 56665a1b17b8877164473872b90b68d5ac311306 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 17 Oct 2018 12:37:52 +0200 Subject: [PATCH 2/3] Fixed bug #77025 Implements 8bit conversions equivalently to iso-8859-1 conversions. This seems quite dubious to me, but seems to match the previous behavior. It might make more sense to map the characters into a private area instead, so that the 8bit encoding is treated as binary data with no case conversions (including no case conversions in the ascii range). --- NEWS | 4 ++ ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c | 45 ++++++++++++++++++++++- ext/mbstring/tests/bug77025.phpt | 12 ++++++ 3 files changed, 59 insertions(+), 2 deletions(-) create mode 100644 ext/mbstring/tests/bug77025.phpt diff --git a/NEWS b/NEWS index e029cdaaf72..64140070e17 100644 --- a/NEWS +++ b/NEWS @@ -11,6 +11,10 @@ PHP NEWS . Fixed bug #77007 (fractions in `diff()` are not correctly normalized). (Derick) +- Mbstring: + . Fixed bug #77025 (mb_strpos throws Unknown encoding or conversion error). + (Nikita) + - Tokenizer: . Fixed bug #76991 (Incorrect tokenization of multiple invalid flexible heredoc strings). 
(Nikita) diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c b/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c index 4faa9b8b1bb..6b5b0095357 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c @@ -38,6 +38,11 @@ #include "mbfilter.h" +const struct mbfl_convert_vtbl vtbl_8bit_wchar; +const struct mbfl_convert_vtbl vtbl_wchar_8bit; +static int mbfl_filt_conv_8bit_wchar(int c, mbfl_convert_filter *filter); +static int mbfl_filt_conv_wchar_8bit(int c, mbfl_convert_filter *filter); + static const char *mbfl_encoding_8bit_aliases[] = {"binary", NULL}; const mbfl_encoding mbfl_encoding_8bit = { @@ -47,6 +52,42 @@ const mbfl_encoding mbfl_encoding_8bit = { (const char *(*)[])&mbfl_encoding_8bit_aliases, NULL, MBFL_ENCTYPE_SBCS, - NULL, - NULL + &vtbl_8bit_wchar, + &vtbl_wchar_8bit }; + +const struct mbfl_convert_vtbl vtbl_8bit_wchar = { + mbfl_no_encoding_8bit, + mbfl_no_encoding_wchar, + mbfl_filt_conv_common_ctor, + mbfl_filt_conv_common_dtor, + mbfl_filt_conv_8bit_wchar, + mbfl_filt_conv_common_flush +}; + +const struct mbfl_convert_vtbl vtbl_wchar_8bit = { + mbfl_no_encoding_wchar, + mbfl_no_encoding_8bit, + mbfl_filt_conv_common_ctor, + mbfl_filt_conv_common_dtor, + mbfl_filt_conv_wchar_8bit, + mbfl_filt_conv_common_flush +}; + +#define CK(statement) do { if ((statement) < 0) return (-1); } while (0) + +static int mbfl_filt_conv_8bit_wchar(int c, mbfl_convert_filter *filter) +{ + return (*filter->output_function)(c, filter->data); +} + +static int mbfl_filt_conv_wchar_8bit(int c, mbfl_convert_filter *filter) +{ + if (c >= 0 && c < 0x100) { + CK((*filter->output_function)(c, filter->data)); + } else { + CK(mbfl_filt_conv_illegal_output(c, filter)); + } + + return c; +} diff --git a/ext/mbstring/tests/bug77025.phpt b/ext/mbstring/tests/bug77025.phpt new file mode 100644 index 00000000000..46416c1df18 --- /dev/null +++ b/ext/mbstring/tests/bug77025.phpt @@ -0,0 +1,12 @@ +--TEST-- +Bug #77025: mb_strpos throws Unknown 
encoding or conversion error +--FILE-- +<?php + +var_dump(mb_strpos('Hello', 'e', 0, '8bit')); +var_dump(mb_stripos('Hello', 'e', 0, '8bit')); + +?> +--EXPECT-- +int(1) +int(1) From 1151554668557ae649d96743c3ccb2a3ea5b278f Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 17 Oct 2018 12:47:45 +0200 Subject: [PATCH 3/3] Remove the "auto" encoding "auto" is only meaningful in functions which accept an encoding *list* and support encoding detection. These functions have explicit checks for "auto". It cannot be used as a standalone encoding in any meaningful capacity, so I'm dropping it entirely. --- ext/mbstring/libmbfl/mbfl/mbfl_encoding.c | 14 -------------- ext/mbstring/libmbfl/mbfl/mbfl_encoding.h | 1 - ext/mbstring/tests/mb_preferred_mime_name.phpt | 4 ++-- 3 files changed, 2 insertions(+), 17 deletions(-) diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c index 77c5daeeffb..094962090d1 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c +++ b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c @@ -119,22 +119,8 @@ #endif -static const char *mbfl_encoding_auto_aliases[] = {"unknown", NULL}; - -static const mbfl_encoding mbfl_encoding_auto = { - mbfl_no_encoding_auto, - "auto", - NULL, - (const char *(*)[])&mbfl_encoding_auto_aliases, - NULL, - 0, - NULL, - NULL -}; - static const mbfl_encoding *mbfl_encoding_ptr_list[] = { &mbfl_encoding_pass, - &mbfl_encoding_auto, &mbfl_encoding_wchar, &mbfl_encoding_byte2be, &mbfl_encoding_byte2le, diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h index 2952977fd88..c3a9c0affc7 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h +++ b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h @@ -36,7 +36,6 @@ enum mbfl_no_encoding { mbfl_no_encoding_invalid = -1, mbfl_no_encoding_pass, - mbfl_no_encoding_auto, mbfl_no_encoding_wchar, mbfl_no_encoding_byte2be, mbfl_no_encoding_byte2le, diff --git a/ext/mbstring/tests/mb_preferred_mime_name.phpt b/ext/mbstring/tests/mb_preferred_mime_name.phpt index 1fa73421f69..df05027c3c6 100644 ---
a/ext/mbstring/tests/mb_preferred_mime_name.phpt +++ b/ext/mbstring/tests/mb_preferred_mime_name.phpt @@ -38,7 +38,7 @@ echo "== INVALID PARAMETER ==\n"; var_dump(mb_preferred_mime_name('BAD_NAME')); // No preferred name -var_dump(mb_preferred_mime_name('auto')); +var_dump(mb_preferred_mime_name('pass')); ?> --EXPECTF-- Shift_JIS @@ -55,5 +55,5 @@ UCS-4 Warning: mb_preferred_mime_name(): Unknown encoding "BAD_NAME" in %s on line %d bool(false) -Warning: mb_preferred_mime_name(): No MIME preferred name corresponding to "auto" in %s on line %d +Warning: mb_preferred_mime_name(): No MIME preferred name corresponding to "pass" in %s on line %d bool(false)