From a1a69c37342ebc13077735f1e048a38c09d30d49 Mon Sep 17 00:00:00 2001
From: Alex Dowad <alexinbeijing@gmail.com>
Date: Tue, 6 Dec 2022 17:33:53 +0200
Subject: [PATCH] Support Microsoft's "Best Fit" mappings for Windows-1252 text
 encoding

In b5ff87ca71, I made a number of adjustments to our conversion code
for CP1252. One of the adjustments was to make the mappings match those
published by the Unicode Consortium in the file CP1252.TXT. These do
not include mappings for the CP1252 bytes 0x81, 0x8D, 0x8F, 0x90, and
0x9D.

Rostyslav Gulka reported that this caused a problem. His application
stores binary JPEG data in an MS-SQL database. When they SELECT the
binary data out of the database, it is treated as CP1252 text and
automatically converted to UTF-8. To recover the original binary
data, they then do a conversion from UTF-8 to CP1252.

Obviously, that does not work if certain CP1252 bytes do not map to
any Unicode codepoint at all.

While this is a very unusual application of text encoding conversion,
and we might choose not to support it if there were no other basis for
including those mappings, it seems that Microsoft does actually include
them in the Win32 API as "best fit" mappings. These are extra mappings
from Unicode to other text encodings, which the Win32 API function
WideCharToMultiByte uses by default unless the WC_NO_BEST_FIT_CHARS
flag is passed.

A list of these "best fit" mappings for CP1252 can be found here:

https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WindowsBestFit/bestfit1252.txt
---
 ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c | 10 +++++-----
 ext/mbstring/tests/cp1252_encoding.phpt            |  5 -----
 ext/mbstring/tests/data/CP1252.txt                 | 10 +++++-----
 3 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c b/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c
index 43ab8a2e487..e9b2b040865 100644
--- a/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c
@@ -379,10 +379,10 @@ DEF_SB_TBL(cp1251, "Windows-1251", "Windows-1251", cp1251_aliases, 0x80, cp1251_
 
 static const char *cp1252_aliases[] = {"cp1252", NULL};
 static const unsigned short cp1252_ucs_table[] = {
- 0X20AC,0X0000,0X201A,0X0192,0X201E,0X2026,0X2020,0X2021,
- 0X02C6,0X2030,0X0160,0X2039,0X0152,0X0000,0X017D,0X0000,
- 0X0000,0X2018,0X2019,0X201C,0X201D,0X2022,0X2013,0X2014,
- 0X02DC,0X2122,0X0161,0X203A,0X0153,0X0000,0X017E,0X0178
+ 0X20AC,0X0081,0X201A,0X0192,0X201E,0X2026,0X2020,0X2021,
+ 0X02C6,0X2030,0X0160,0X2039,0X0152,0X008D,0X017D,0X008F,
+ 0X0090,0X2018,0X2019,0X201C,0X201D,0X2022,0X2013,0X2014,
+ 0X02DC,0X2122,0X0161,0X203A,0X0153,0X009D,0X017E,0X0178
 };
 DEF_SB(cp1252, "Windows-1252", "Windows-1252", cp1252_aliases);
 
@@ -396,7 +396,7 @@ static int mbfl_filt_conv_wchar_cp1252(int c, mbfl_convert_filter *filter)
 			}
 		}
 		CK(mbfl_filt_conv_illegal_output(c, filter));
-	} else if ((c <= 0x7F || c >= 0xA0) && c != MBFL_BAD_INPUT) {
+	} else if ((c <= 0x7F || c >= 0xA0 || c == 0x81 || c == 0x8D || c == 0x8F || c == 0x90 || c == 0x9D) && c != MBFL_BAD_INPUT) {
 		CK((*filter->output_function)(c, filter->data));
 	} else {
 		CK(mbfl_filt_conv_illegal_output(c, filter));
diff --git a/ext/mbstring/tests/cp1252_encoding.phpt b/ext/mbstring/tests/cp1252_encoding.phpt
index 706e02ceee6..8705ac2c951 100644
--- a/ext/mbstring/tests/cp1252_encoding.phpt
+++ b/ext/mbstring/tests/cp1252_encoding.phpt
@@ -11,11 +11,6 @@ if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
 include('encoding_tests.inc');
 testEncodingFromUTF16ConversionTable(__DIR__ . '/data/CP1252.txt', 'CP1252');
 
-// Test "long" illegal character markers
-mb_substitute_character("long");
-convertInvalidString("\x81", "%", "CP1252", "UTF-8");
-convertInvalidString("\x9D", "%", "CP1252", "UTF-8");
-
 echo "Done!\n";
 ?>
 --EXPECT--
diff --git a/ext/mbstring/tests/data/CP1252.txt b/ext/mbstring/tests/data/CP1252.txt
index 8ff4b204b75..0ccd4ff1898 100644
--- a/ext/mbstring/tests/data/CP1252.txt
+++ b/ext/mbstring/tests/data/CP1252.txt
@@ -145,7 +145,7 @@
 0x7E	0x007E	#TILDE
 0x7F	0x007F	#DELETE
 0x80	0x20AC	#EURO SIGN
-0x81	      	#UNDEFINED
+0x81	0x0081	#*** MODIFIED TO FOLLOW WINDOWS "BEST FIT" MAPPINGS
 0x82	0x201A	#SINGLE LOW-9 QUOTATION MARK
 0x83	0x0192	#LATIN SMALL LETTER F WITH HOOK
 0x84	0x201E	#DOUBLE LOW-9 QUOTATION MARK
@@ -157,10 +157,10 @@
 0x8A	0x0160	#LATIN CAPITAL LETTER S WITH CARON
 0x8B	0x2039	#SINGLE LEFT-POINTING ANGLE QUOTATION MARK
 0x8C	0x0152	#LATIN CAPITAL LIGATURE OE
-0x8D	      	#UNDEFINED
+0x8D	0x008D	#*** MODIFIED TO FOLLOW WINDOWS "BEST FIT" MAPPINGS
 0x8E	0x017D	#LATIN CAPITAL LETTER Z WITH CARON
-0x8F	      	#UNDEFINED
-0x90	      	#UNDEFINED
+0x8F	0x008F	#*** MODIFIED TO FOLLOW WINDOWS "BEST FIT" MAPPINGS
+0x90	0x0090	#*** MODIFIED TO FOLLOW WINDOWS "BEST FIT" MAPPINGS
 0x91	0x2018	#LEFT SINGLE QUOTATION MARK
 0x92	0x2019	#RIGHT SINGLE QUOTATION MARK
 0x93	0x201C	#LEFT DOUBLE QUOTATION MARK
@@ -173,7 +173,7 @@
 0x9A	0x0161	#LATIN SMALL LETTER S WITH CARON
 0x9B	0x203A	#SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
 0x9C	0x0153	#LATIN SMALL LIGATURE OE
-0x9D	      	#UNDEFINED
+0x9D	0x009D	#*** MODIFIED TO FOLLOW WINDOWS "BEST FIT" MAPPINGS
 0x9E	0x017E	#LATIN SMALL LETTER Z WITH CARON
 0x9F	0x0178	#LATIN CAPITAL LETTER Y WITH DIAERESIS
 0xA0	0x00A0	#NO-BREAK SPACE