From a1a69c37342ebc13077735f1e048a38c09d30d49 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Tue, 6 Dec 2022 17:33:53 +0200 Subject: [PATCH] Support Microsoft's "Best Fit" mappings for Windows-1252 text encoding In b5ff87ca71, I made a number of adjustments to our conversion code for CP1252. One of the adjustments was to make the mappings match those published by the Unicode Consortium in the file CP1252.TXT. These do not include mappings for the CP1252 bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D. Rostyslav Gulka reported that this caused a problem. His application stores binary JPEG data in an MS-SQL database. When they SELECT the binary data out of the database, it is treated as CP1252 text and automatically converted to UTF-8. To recover the original binary data, they then do a conversion from UTF-8 to CP1252. Obviously, that does not work if certain CP1252 bytes do not map to any Unicode codepoint at all. While this is a very unusual application of text encoding conversion, and we might choose not to support it if there were no other basis for including those mappings, it seems that Microsoft does actually include them in the Win32 API as "best fit" mappings. These are extra mappings from Unicode to other text encodings, which the Win32 API function WideCharToMultiByte uses by default unless the WC_NO_BEST_FIT_CHARS flag is passed. 
A list of these "best fit" mappings for CP1252 can be found here: https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WindowsBestFit/bestfit1252.txt --- ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c | 10 +++++----- ext/mbstring/tests/cp1252_encoding.phpt | 5 ----- ext/mbstring/tests/data/CP1252.txt | 10 +++++----- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c b/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c index 43ab8a2e487..e9b2b040865 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c @@ -379,10 +379,10 @@ DEF_SB_TBL(cp1251, "Windows-1251", "Windows-1251", cp1251_aliases, 0x80, cp1251_ static const char *cp1252_aliases[] = {"cp1252", NULL}; static const unsigned short cp1252_ucs_table[] = { - 0X20AC,0X0000,0X201A,0X0192,0X201E,0X2026,0X2020,0X2021, - 0X02C6,0X2030,0X0160,0X2039,0X0152,0X0000,0X017D,0X0000, - 0X0000,0X2018,0X2019,0X201C,0X201D,0X2022,0X2013,0X2014, - 0X02DC,0X2122,0X0161,0X203A,0X0153,0X0000,0X017E,0X0178 + 0X20AC,0X0081,0X201A,0X0192,0X201E,0X2026,0X2020,0X2021, + 0X02C6,0X2030,0X0160,0X2039,0X0152,0X008D,0X017D,0X008F, + 0X0090,0X2018,0X2019,0X201C,0X201D,0X2022,0X2013,0X2014, + 0X02DC,0X2122,0X0161,0X203A,0X0153,0X009D,0X017E,0X0178 }; DEF_SB(cp1252, "Windows-1252", "Windows-1252", cp1252_aliases); @@ -396,7 +396,7 @@ static int mbfl_filt_conv_wchar_cp1252(int c, mbfl_convert_filter *filter) } } CK(mbfl_filt_conv_illegal_output(c, filter)); - } else if ((c <= 0x7F || c >= 0xA0) && c != MBFL_BAD_INPUT) { + } else if ((c <= 0x7F || c >= 0xA0 || c == 0x81 || c == 0x8D || c == 0x8F || c == 0x90 || c == 0x9D) && c != MBFL_BAD_INPUT) { CK((*filter->output_function)(c, filter->data)); } else { CK(mbfl_filt_conv_illegal_output(c, filter)); diff --git a/ext/mbstring/tests/cp1252_encoding.phpt b/ext/mbstring/tests/cp1252_encoding.phpt index 706e02ceee6..8705ac2c951 100644 --- 
a/ext/mbstring/tests/cp1252_encoding.phpt +++ b/ext/mbstring/tests/cp1252_encoding.phpt @@ -11,11 +11,6 @@ if (getenv("SKIP_SLOW_TESTS")) die("skip slow test"); include('encoding_tests.inc'); testEncodingFromUTF16ConversionTable(__DIR__ . '/data/CP1252.txt', 'CP1252'); -// Test "long" illegal character markers -mb_substitute_character("long"); -convertInvalidString("\x81", "%", "CP1252", "UTF-8"); -convertInvalidString("\x9D", "%", "CP1252", "UTF-8"); - echo "Done!\n"; ?> --EXPECT-- diff --git a/ext/mbstring/tests/data/CP1252.txt b/ext/mbstring/tests/data/CP1252.txt index 8ff4b204b75..0ccd4ff1898 100644 --- a/ext/mbstring/tests/data/CP1252.txt +++ b/ext/mbstring/tests/data/CP1252.txt @@ -145,7 +145,7 @@ 0x7E 0x007E #TILDE 0x7F 0x007F #DELETE 0x80 0x20AC #EURO SIGN -0x81 #UNDEFINED +0x81 0x0081 #*** MODIFIED TO FOLLOW WINDOWS "BEST FIT" MAPPINGS 0x82 0x201A #SINGLE LOW-9 QUOTATION MARK 0x83 0x0192 #LATIN SMALL LETTER F WITH HOOK 0x84 0x201E #DOUBLE LOW-9 QUOTATION MARK @@ -157,10 +157,10 @@ 0x8A 0x0160 #LATIN CAPITAL LETTER S WITH CARON 0x8B 0x2039 #SINGLE LEFT-POINTING ANGLE QUOTATION MARK 0x8C 0x0152 #LATIN CAPITAL LIGATURE OE -0x8D #UNDEFINED +0x8D 0x008D #*** MODIFIED TO FOLLOW WINDOWS "BEST FIT" MAPPINGS 0x8E 0x017D #LATIN CAPITAL LETTER Z WITH CARON -0x8F #UNDEFINED -0x90 #UNDEFINED +0x8F 0x008F #*** MODIFIED TO FOLLOW WINDOWS "BEST FIT" MAPPINGS +0x90 0x0090 #*** MODIFIED TO FOLLOW WINDOWS "BEST FIT" MAPPINGS 0x91 0x2018 #LEFT SINGLE QUOTATION MARK 0x92 0x2019 #RIGHT SINGLE QUOTATION MARK 0x93 0x201C #LEFT DOUBLE QUOTATION MARK @@ -173,7 +173,7 @@ 0x9A 0x0161 #LATIN SMALL LETTER S WITH CARON 0x9B 0x203A #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 0x9C 0x0153 #LATIN SMALL LIGATURE OE -0x9D #UNDEFINED +0x9D 0x009D #*** MODIFIED TO FOLLOW WINDOWS "BEST FIT" MAPPINGS 0x9E 0x017E #LATIN SMALL LETTER Z WITH CARON 0x9F 0x0178 #LATIN CAPITAL LETTER Y WITH DIAERESIS 0xA0 0x00A0 #NO-BREAK SPACE