From d9269becca3211465736e32fae19e8879c488adc Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Tue, 9 Aug 2022 10:37:46 +0200 Subject: [PATCH] Fix problems with ISO-2022-KR conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • The legacy conversion code did not emit an error marker if an escape sequence was truncated. • BOTH old and new conversion code would shift from KSC5601 (KS X 1001) mode to ASCII mode on an invalid escape sequence. This doesn't make any sense. --- ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c | 9 ++++++--- ext/mbstring/tests/iso2022kr_encoding.phpt | 3 +++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c b/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c index 94fe35f9423..c4b2bf0b9f1 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c @@ -145,7 +145,7 @@ int mbfl_filt_conv_2022kr_wchar(int c, mbfl_convert_filter *filter) if (c == '$') { filter->status++; } else { - filter->status = 0; + filter->status &= ~0xF; CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); } break; @@ -154,7 +154,7 @@ int mbfl_filt_conv_2022kr_wchar(int c, mbfl_convert_filter *filter) if (c == ')') { filter->status++; } else { - filter->status = 0; + filter->status &= ~0xF; CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); } break; @@ -258,6 +258,10 @@ int mbfl_filt_conv_wchar_2022kr(int c, mbfl_convert_filter *filter) static int mbfl_filt_conv_any_2022kr_flush(mbfl_convert_filter *filter) { + if (filter->status & 0xF) { + /* Escape sequence or 2-byte character was truncated */ + (*filter->output_function)(MBFL_BAD_INPUT, filter->data); + } /* back to ascii */ if (filter->status & 0x10) { CK((*filter->output_function)(0x0f, filter->data)); /* shift in */ @@ -305,7 +309,6 @@ static size_t mb_iso2022kr_to_wchar(unsigned char **in, size_t *in_len, uint32_t p--; } *out++ = MBFL_BAD_INPUT; - *state = ASCII; } } else if (c == 0xF) { *state = ASCII; diff --git a/ext/mbstring/tests/iso2022kr_encoding.phpt b/ext/mbstring/tests/iso2022kr_encoding.phpt index 67adb103fce..d9e6ff933cb 100644 --- a/ext/mbstring/tests/iso2022kr_encoding.phpt +++ b/ext/mbstring/tests/iso2022kr_encoding.phpt @@ -114,6 +114,9 @@ convertInvalidString("\xFF\x86", "\x1B\$)C%", "UTF-16BE", "ISO-2022-KR"); // character at the end of a string, although the string was already ending in ASCII mode convertValidString("\x68\x46\x00a", "\x1B\$)C\x0E\x68\x46\x0Fa", "UTF-16BE", "ISO-2022-KR", false); +// Regression test: Don't shift from KS X 1001 to ASCII mode on invalid escape sequence +convertInvalidString("\x0E\x1BX\x74\x30", "\x00%\x76\x20", "ISO-2022-KR", "UTF-16BE", false); + // Test "long" illegal character markers mb_substitute_character("long"); convertInvalidString("\x1B", "%", "ISO-2022-KR", "UTF-8");