Optimize conversion of CP936 to Unicode

In the previous commit, the branch in mb_strlen which implemented the
function using the mblen_table (when one is available) was removed.
This made mb_strlen faster for just about every legacy text encoding
which had an mblen_table... except for CP936, which became much slower.
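For context, an mblen_table is a 256-entry array which maps each
possible leading byte of an encoding to the byte length of the
character it begins, so a string can be traversed without actually
decoding it. A minimal sketch of that strategy (illustrative only; the
names and details here are not the actual mbstring code):

    #include <stddef.h>

    /* Illustrative sketch only, not the removed mbstring code.
     * Assumes mblen_table[b] >= 1 for every possible byte b. */
    static size_t strlen_using_mblen_table(const unsigned char *p, size_t len,
                                           const unsigned char *mblen_table)
    {
        size_t count = 0;
        while (len > 0) {
            unsigned char char_len = mblen_table[*p];
            if (char_len > len) {
                char_len = len; /* truncated character at end of buffer */
            }
            p += char_len;    /* jump over the whole character */
            len -= char_len;
            count++;
        }
        return count;
    }

Counting this way never examines trailing bytes; once the branch was
removed, mb_strlen's speed came to depend directly on the speed of the
decoding filter.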

The slowdown indicated that our decoding filter for CP936 was slow. I
checked and found that iterating over the PUA table was a major
bottleneck. After optimizing that bottleneck out, benchmarks for text
encoding conversion speed were as follows (times are before vs after):

    CP936, short - to UTF-8     - faster by 10.44% (0.0003 vs 0.0003)
    CP936, short - to UTF-16BE  - faster by 11.45% (0.0003 vs 0.0003)
    CP936, medium - to UTF-8    - faster by 139.09% (0.0012 vs 0.0005)
    CP936, medium - to UTF-16BE - faster by 140.34% (0.0013 vs 0.0005)
    CP936, long - to UTF-16BE   - faster by 215.88% (0.0538 vs 0.0170)
    CP936, long - to UTF-8      - faster by 232.41% (0.0528 vs 0.0159)
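The essence of the change, condensed (the real code is in the diff
below): the old decoder did a linear scan over a table of PUA mapping
ranges for certain characters, while the new one converts the two bytes
to a table index and does at most a couple of comparisons plus a direct
array load. A condensed sketch, not the literal diff:

    #include <stdint.h>

    /* Old: scan a table whose rows are [pua_start, pua_end, cp936_start];
     * here w is the packed byte pair (c << 8) | c2. */
    static uint32_t pua_lookup_old(unsigned int w, const unsigned int tbl[][3], int n)
    {
        for (int k = 0; k < n; k++) { /* up to n comparisons per character */
            if (w >= tbl[k][2] && w <= tbl[k][2] + tbl[k][1] - tbl[k][0]) {
                return w - tbl[k][2] + tbl[k][0];
            }
        }
        return 0; /* not in any PUA range */
    }

    /* New: one bounds check, one subtraction, one load; here w is the
     * lookup index (c - 0x81)*192 + (c2 - 0x40). Checks for the other
     * two auxiliary tables are omitted from this sketch. */
    static uint32_t pua_lookup_new(unsigned int w, const uint32_t *pua_tbl1)
    {
        return (w >= 0x192B && w <= 0x1EBE) ? pua_tbl1[w - 0x192B] : 0;
    }

The auxiliary tables trade a few kilobytes of static data for
constant-time lookups.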

The benchmark numbers above do not fully express how much faster the
CP936 decoder is now, since the conversion benchmarks measure not only
the speed of decoding CP936, but also that of re-encoding the resulting
codepoints as UTF-8 or UTF-16.

For functions like mb_strlen, which just need to decode but not
re-encode the text, the gain in performance is much larger.
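As a rough illustration, with made-up numbers: if decoding accounted
for half of a conversion benchmark's runtime and the decoder became 5x
faster, the conversion benchmark would only improve to 0.5 + 0.5/5 =
0.6 of its original time (about 1.7x faster), while a decode-only
function like mb_strlen would enjoy the full 5x.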
Author: Alex Dowad
Date:   2022-12-31 07:22:54 +02:00
Commit: 703725e43b
Parent: 31e7d6ef05

2 changed files with 365 additions and 142 deletions

@@ -291,37 +291,45 @@ static size_t mb_cp936_to_wchar(unsigned char **in, size_t *in_len, uint32_t *bu
 			}
 			unsigned char c2 = *p++;
 			if (c2 < 0x40 || c2 == 0x7F || c2 == 0xFF) {
 				*out++ = MBFL_BAD_INPUT;
 				continue;
 			}
-			if (((c >= 0xAA && c <= 0xAF) || (c >= 0xF8 && c <= 0xFE)) && (c2 >= 0xA1 && c2 <= 0xFE)) {
+			if (((c >= 0xAA && c <= 0xAF) || (c >= 0xF8 && c <= 0xFE)) && c2 >= 0xA1) {
 				/* UDA part 1, 2: U+E000-U+E4C5 */
 				*out++ = 94*(c >= 0xF8 ? c - 0xF2 : c - 0xAA) + (c2 - 0xA1) + 0xE000;
-			} else if (c >= 0xA1 && c <= 0xA7 && c2 >= 0x40 && c2 < 0xA1 && c2 != 0x7F) {
+			} else if (c >= 0xA1 && c <= 0xA7 && c2 < 0xA1) {
 				/* UDA part 3: U+E4C6-U+E765*/
 				*out++ = 96*(c - 0xA1) + c2 - (c2 >= 0x80 ? 0x41 : 0x40) + 0xE4C6;
 			} else {
-				unsigned int w = (c << 8) | c2;
-				if ((w >= 0xA2AB && w <= 0xA9FE) || (w >= 0xD7FA && w <= 0xD7FE) || (w >= 0xFE50 && w <= 0xFEA0)) {
-					for (int k = 0; k < mbfl_cp936_pua_tbl_max; k++) {
-						if (w >= mbfl_cp936_pua_tbl[k][2] && w <= mbfl_cp936_pua_tbl[k][2] + mbfl_cp936_pua_tbl[k][1] - mbfl_cp936_pua_tbl[k][0]) {
-							*out++ = w - mbfl_cp936_pua_tbl[k][2] + mbfl_cp936_pua_tbl[k][0];
-							goto next_iteration;
+				unsigned int w = (c - 0x81)*192 + c2 - 0x40; /* Convert c, c2 into GB 2312 table lookup index */
+
+				/* For CP936 and GB18030, certain GB 2312 byte combinations are mapped to PUA codepoints,
+				 * whereas the same combinations aren't mapped to any codepoint for HZ and EUC-CN
+				 * To avoid duplicating the entire GB 2312 -> Unicode lookup table, we have three
+				 * auxiliary tables which are consulted instead for specific ranges of lookup indices */
+				if (w >= 0x192B) {
+					if (w <= 0x1EBE) {
+						*out++ = cp936_pua_tbl1[w - 0x192B];
+						continue;
+					} else if (w >= 0x413A) {
+						if (w <= 0x413E) {
+							*out++ = cp936_pua_tbl2[w - 0x413A];
+							continue;
+						} else if (w >= 0x5DD0 && w <= 0x5E20) {
+							*out++ = cp936_pua_tbl3[w - 0x5DD0];
+							continue;
						}
					}
				}
-				if (c < 0xFF && c > 0x80 && c2 >= 0x40 && c2 < 0xFF && c2 != 0x7F) {
-					w = (c - 0x81)*192 + c2 - 0x40;
-					ZEND_ASSERT(w < cp936_ucs_table_size);
-					*out++ = cp936_ucs_table[w];
-				} else {
-					*out++ = MBFL_BAD_INPUT;
-				}
+
+				ZEND_ASSERT(w < cp936_ucs_table_size);
+				*out++ = cp936_ucs_table[w];
			}
		} else {
			*out++ = 0xF8F5;
		}
-next_iteration: ;
	}

	*in_len = e - p;
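As a worked example of the new lookup-index computation: each range of
(c << 8) | c2 values which the old code matched against the PUA range
table maps onto a contiguous block of the new indices, so a single
subtraction addresses the auxiliary table. For the first range
(0xA2AB-0xA9FE, now served by cp936_pua_tbl1):

    w(0xA2, 0xAB) = (0xA2 - 0x81)*192 + (0xAB - 0x40) = 33*192 + 107 = 0x192B  (first entry)
    w(0xA9, 0xFE) = (0xA9 - 0x81)*192 + (0xFE - 0x40) = 40*192 + 190 = 0x1EBE  (last entry)

These are exactly the 0x192B and 0x1EBE bounds tested in the new code.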

[The diff of the second changed file (presumably the new lookup tables) is suppressed because it is too large.]