Implement fast text conversion interface for Base64

This commit is contained in:
Alex Dowad 2022-04-23 20:57:04 +02:00
parent 7c2587b1f6
commit 85690ae26d
2 changed files with 175 additions and 2 deletions

View file

@ -31,6 +31,9 @@
#include "mbfilter.h"
#include "mbfilter_base64.h"
static size_t mb_base64_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_base64(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
const mbfl_encoding mbfl_encoding_base64 = {
mbfl_no_encoding_base64,
"BASE64",
@ -40,8 +43,8 @@ const mbfl_encoding mbfl_encoding_base64 = {
MBFL_ENCTYPE_GL_UNSAFE,
NULL,
NULL,
NULL,
NULL
mb_base64_to_wchar,
mb_wchar_to_base64
};
const struct mbfl_convert_vtbl vtbl_8bit_b64 = {
@ -212,3 +215,122 @@ int mbfl_filt_conv_base64dec_flush(mbfl_convert_filter *filter)
}
return 0;
}
/* Map one character of the Base64 alphabet to its 6-bit value (0-63).
 * Returns -1 for any character which is not 'A'-'Z', 'a'-'z', '0'-'9', '+' or '/'. */
static int decode_base64(char c)
{
	switch (c) {
	case '+':
		return 62;
	case '/':
		return 63;
	default:
		break;
	}

	if (c >= 'A' && c <= 'Z') {
		/* Upper-case letters take values 0-25 */
		return c - 'A';
	}
	if (c >= 'a' && c <= 'z') {
		/* Lower-case letters take values 26-51 */
		return 26 + (c - 'a');
	}
	if (c >= '0' && c <= '9') {
		/* Digits take values 52-61 */
		return 52 + (c - '0');
	}

	return -1;
}
/* Decode Base64 text from `*in` (`*in_len` bytes) into `buf`, writing one decoded byte
 * (0x00-0xFF) per output element.
 *
 * CR, LF, space, tab and '=' are skipped; any other character outside the Base64 alphabet
 * produces one MBFL_BAD_INPUT element in the output.
 *
 * `*state` carries context between calls: the low 8 bits hold the number of bits currently
 * accumulated, and the remaining bits hold the accumulated bits themselves.
 *
 * On return, `*in` and `*in_len` are advanced past the consumed input.
 * Returns the number of elements written to `buf`. */
static size_t mb_base64_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	unsigned char *src = *in;
	unsigned char *src_end = src + *in_len;
	uint32_t *dst = buf;
	/* Hold back two output slots, so that the (at most two) leftover bytes can always
	 * be emitted when the end of the input string is reached */
	uint32_t *dst_limit = buf + bufsize - 2;
	unsigned int nbits = *state & 0xFF;
	unsigned int accum = *state >> 8;

	while (src < src_end && (dst_limit - dst) >= 3) {
		unsigned char c = *src++;

		switch (c) {
		case '\r':
		case '\n':
		case ' ':
		case '\t':
		case '=':
			/* Whitespace and padding carry no data */
			continue;
		default:
			break;
		}

		int sextet = decode_base64(c);
		if (sextet == -1) {
			*dst++ = MBFL_BAD_INPUT;
			continue;
		}

		accum = (accum << 6) | (sextet & 0x3F);
		nbits += 6;
		if (nbits != 24) {
			continue;
		}

		/* Four Base64 characters collected: they hold three complete bytes */
		dst[0] = (accum >> 16) & 0xFF;
		dst[1] = (accum >> 8) & 0xFF;
		dst[2] = accum & 0xFF;
		dst += 3;
		nbits = 0;
		accum = 0;
	}

	if (src != src_end) {
		/* Stopped before the end of the input; remember the partial group for the next call */
		*state = (accum << 8) | (nbits & 0xFF);
	} else if (nbits == 18) {
		/* Three leftover characters hold two complete bytes */
		*dst++ = (accum >> 10) & 0xFF;
		*dst++ = (accum >> 2) & 0xFF;
	} else if (nbits == 12) {
		/* Two leftover characters hold one complete byte */
		*dst++ = (accum >> 4) & 0xFF;
	}

	*in_len = src_end - src;
	*in = src;
	return dst - buf;
}
/* Encode `len` bytes (one per element of `in`; only the low 8 bits of each element are used)
 * as Base64 text, appending the result to the output held by `buf`.
 *
 * A CR+LF pair is emitted before any 4-character group which would otherwise make the current
 * output line longer than 76 characters. This applies to the final '='-padded group as well.
 *
 * `buf->state` carries context between calls:
 *   bits 0-1: number of input bytes (0-2) waiting in the cache
 *   bits 2-7: number of 4-character groups already emitted on the current output line
 *   bits 8+:  the cached input bytes themselves
 *
 * If `end` is true and 1 or 2 input bytes are left over, they are emitted as a final group
 * padded with '=' characters. */
static void mb_wchar_to_base64(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned int bits = (buf->state & 0x3) * 8;
	unsigned int chars_output = ((buf->state >> 2) & 0x3F) * 4;
	unsigned int cache = buf->state >> 8;

	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Every 3 bytes of input converts to 4 bytes of output... but if the number of input
	 * bytes is not a multiple of 3, we still pad the output out to a multiple of 4
	 * That's `(len + 2) * 4 / 3`, to calculate the amount of space needed in the output buffer
	 *
	 * But also, we add a CR+LF line ending (2 bytes) for every 76 bytes of output
	 * That means we must multiply the above number by 78/76
	 * Use `zend_safe_address_guarded` to check that the multiplication doesn't overflow
	 *
	 * And since we may enter this function multiple times when converting a large string, and
	 * we might already be close to where a CR+LF needs to be emitted, make space for an extra
	 * CR+LF pair in the output buffer
	 *
	 * The bytes already waiting in the cache are counted together with `len`, so the reserved
	 * space also covers the final padded group and any CR+LF which precedes it */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, (zend_safe_address_guarded(len + (bits / 8), 26, 52) / 19) + 2);

	while (len--) {
		uint32_t w = *in++;
		cache = (cache << 8) | (w & 0xFF);
		bits += 8;
		if (bits == 24) {
			/* 76 characters are already on this line; start a new one */
			if (chars_output > 72) {
				out = mb_convert_buf_add2(out, '\r', '\n');
				chars_output = 0;
			}
			out = mb_convert_buf_add4(out,
				mbfl_base64_table[(cache >> 18) & 0x3F],
				mbfl_base64_table[(cache >> 12) & 0x3F],
				mbfl_base64_table[(cache >> 6) & 0x3F],
				mbfl_base64_table[cache & 0x3F]);
			chars_output += 4;
			bits = cache = 0;
		}
	}

	if (end && bits) {
		/* The final padded group must respect the 76-character line limit just like any other
		 * group; without this check, input of 57*k + 1 or 57*k + 2 bytes would end with an
		 * 80-character line */
		if (chars_output > 72) {
			out = mb_convert_buf_add2(out, '\r', '\n');
			chars_output = 0;
		}
		if (bits == 8) {
			/* One leftover byte: two data characters, then '==' */
			out = mb_convert_buf_add4(out, mbfl_base64_table[(cache >> 2) & 0x3F], mbfl_base64_table[(cache & 0x3) << 4], '=', '=');
		} else {
			/* Two leftover bytes: three data characters, then '=' */
			out = mb_convert_buf_add4(out, mbfl_base64_table[(cache >> 10) & 0x3F], mbfl_base64_table[(cache >> 4) & 0x3F], mbfl_base64_table[(cache & 0xF) << 2], '=');
		}
	} else {
		/* Save cached bytes and line position so that a later call can continue from here */
		buf->state = (cache << 8) | (((chars_output / 4) & 0x3F) << 2) | ((bits / 8) & 0x3);
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}

View file

@ -0,0 +1,51 @@
--TEST--
Temporary test of mbstring's Base64 'encoding'
--EXTENSIONS--
mbstring
--FILE--
<?php
/* Using mbstring to convert strings to and from Base64 has already been deprecated
* So this test should be removed when the Base64 'encoding' is */
/**
 * Check that $raw (treated as 8bit data) encodes to exactly $base64, and that
 * $base64 decodes back to exactly $raw. Aborts the script with a message on mismatch.
 */
function testConversion($raw, $base64) {
	$encoded = mb_convert_encoding($raw, 'Base64', '8bit');
	if ($encoded !== $base64) {
		die('Expected ' . bin2hex($raw) . ' to convert to "' . $base64 . '"; actually got "' . $encoded . '"');
	}

	$decoded = mb_convert_encoding($base64, '8bit', 'Base64');
	if ($decoded !== $raw) {
		die('Expected "' . $base64 . '" to convert to ' . bin2hex($raw) . '; actually got ' . bin2hex($decoded));
	}
}
/* Empty input maps to empty output in both directions */
testConversion('', '');
/* One and two input bytes: the encoded form ends with '==' and '=' padding respectively */
testConversion('a', 'YQ==');
testConversion('ab', 'YWI=');
/* Exactly three input bytes: a single unpadded 4-character group */
testConversion("\x01\x02\x03", 'AQID');
/* High byte values; the encoded form uses the '/' character */
testConversion("\xFF\xFE\x11\x22", '//4RIg==');
/* Null bytes, with one, two and three bytes of input */
testConversion("\x00", 'AA==');
testConversion("\x00\x00", 'AAA=');
testConversion("\x00\x00\x00", 'AAAA');
/* 200 bytes of input; the expected output contains CR+LF line breaks */
testConversion(str_repeat("ABCDEFGHIJ", 20), "QUJDREVGR0hJSkFCQ0RFRkdISUpBQkNERUZHSElKQUJDREVGR0hJSkFCQ0RFRkdISUpBQkNERUZH\r\nSElKQUJDREVGR0hJSkFCQ0RFRkdISUpBQkNERUZHSElKQUJDREVGR0hJSkFCQ0RFRkdISUpBQkNE\r\nRUZHSElKQUJDREVGR0hJSkFCQ0RFRkdISUpBQkNERUZHSElKQUJDREVGR0hJSkFCQ0RFRkdISUpB\r\nQkNERUZHSElKQUJDREVGR0hJSkFCQ0RFRkdISUo=");
echo "Done!\n";
?>
--EXPECTF--
Deprecated: mb_convert_encoding(): Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead in %s
Deprecated: mb_convert_encoding(): Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead in %s
Deprecated: mb_convert_encoding(): Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead in %s
Deprecated: mb_convert_encoding(): Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead in %s
Deprecated: mb_convert_encoding(): Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead in %s
Deprecated: mb_convert_encoding(): Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead in %s
Deprecated: mb_convert_encoding(): Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead in %s
Deprecated: mb_convert_encoding(): Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead in %s
Deprecated: mb_convert_encoding(): Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead in %s
Done!