mirror of
https://github.com/nodejs/node.git
synced 2025-08-15 13:48:44 +02:00

PR-URL: https://github.com/nodejs/node/pull/50091 Reviewed-By: Yagiz Nizipli <yagiz@nizipli.com> Reviewed-By: Joyee Cheung <joyeec9h3@gmail.com> Reviewed-By: Tobias Nießen <tniessen@tnie.de>
28614 lines
1.2 MiB
28614 lines
1.2 MiB
/* auto-generated on 2023-10-08 13:48:09 -0400. Do not edit! */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf.cpp
|
|
/* begin file src/simdutf.cpp */
|
|
#include "simdutf.h"
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=implementation.cpp
|
|
/* begin file src/implementation.cpp */
|
|
#include <initializer_list>
|
|
#include <climits>
|
|
|
|
// Useful for debugging purposes
|
|
namespace simdutf {
|
|
namespace {
|
|
|
|
/**
 * Renders the bits of an integer as text, most significant bit first.
 *
 * Debugging helper. The returned string always holds exactly
 * sizeof(T) * CHAR_BIT characters, each of them '0' or '1'.
 *
 * Bits are read by shifting the value rather than by walking a mask of type T:
 * a mask that starts at the top bit is negative when T is signed, so a
 * `mask > 0` loop never runs and signed inputs would produce an empty string.
 *
 * @tparam T an integer type.
 * @param b  the value to render.
 * @return the binary representation of b.
 */
template <typename T>
std::string toBinaryString(T b) {
  const size_t bit_count = sizeof(T) * CHAR_BIT;
  std::string binary;
  binary.reserve(bit_count);
  for (size_t bit = bit_count; bit-- > 0;) {
    // Only the lowest bit of the shifted value is inspected, so it does not
    // matter whether a negative value is shifted arithmetically or logically.
    binary += (((b >> bit) & 1) == 0) ? '0' : '1';
  }
  return binary;
}
|
|
}
|
|
}
|
|
|
|
// Implementations
|
|
// The best choice should always come first!
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64.h
|
|
/* begin file src/simdutf/arm64.h */
|
|
#ifndef SIMDUTF_ARM64_H
|
|
#define SIMDUTF_ARM64_H
|
|
|
|
#ifdef SIMDUTF_FALLBACK_H
|
|
#error "arm64.h must be included before fallback.h"
|
|
#endif
|
|
|
|
|
|
// Honors a SIMDUTF_IMPLEMENTATION_ARM64 value defined earlier; otherwise it
// defaults to SIMDUTF_IS_ARM64.
#ifndef SIMDUTF_IMPLEMENTATION_ARM64
#define SIMDUTF_IMPLEMENTATION_ARM64 (SIMDUTF_IS_ARM64)
#endif
// The replacement list is parenthesized so that the macro behaves as a single
// operand: without the parentheses, `!SIMDUTF_CAN_ALWAYS_RUN_ARM64` would
// negate only SIMDUTF_IMPLEMENTATION_ARM64.
#define SIMDUTF_CAN_ALWAYS_RUN_ARM64 (SIMDUTF_IMPLEMENTATION_ARM64 && SIMDUTF_IS_ARM64)
|
|
|
|
|
|
|
|
#if SIMDUTF_IMPLEMENTATION_ARM64
|
|
|
|
namespace simdutf {
/**
 * Namespace of the NEON (ARMv8) implementation.
 * It is only declared here and left empty; the sections that follow reopen it
 * to add their own declarations.
 */
namespace arm64 {
} // namespace arm64
} // namespace simdutf
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/implementation.h
|
|
/* begin file src/simdutf/arm64/implementation.h */
|
|
#ifndef SIMDUTF_ARM64_IMPLEMENTATION_H
|
|
#define SIMDUTF_ARM64_IMPLEMENTATION_H
|
|
|
|
|
|
namespace simdutf {
|
|
namespace arm64 {
|
|
|
|
namespace {
|
|
using namespace simdutf;
|
|
}
|
|
|
|
class implementation final : public simdutf::implementation {
|
|
public:
|
|
simdutf_really_inline implementation() : simdutf::implementation("arm64", "ARM NEON", internal::instruction_set::NEON) {}
|
|
simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
|
|
simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
|
|
simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
|
|
simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
|
|
simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
|
|
simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
|
|
};
|
|
|
|
} // namespace arm64
|
|
} // namespace simdutf
|
|
|
|
#endif // SIMDUTF_ARM64_IMPLEMENTATION_H
|
|
/* end file src/simdutf/arm64/implementation.h */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/begin.h
|
|
/* begin file src/simdutf/arm64/begin.h */
|
|
// redefining SIMDUTF_IMPLEMENTATION to "arm64"
|
|
// #define SIMDUTF_IMPLEMENTATION arm64
|
|
/* end file src/simdutf/arm64/begin.h */
|
|
|
|
// Declarations
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/intrinsics.h
|
|
/* begin file src/simdutf/arm64/intrinsics.h */
|
|
#ifndef SIMDUTF_ARM64_INTRINSICS_H
|
|
#define SIMDUTF_ARM64_INTRINSICS_H
|
|
|
|
|
|
// This should be the correct header whether
|
|
// you use visual studio or other compilers.
|
|
#include <arm_neon.h>
|
|
|
|
#endif // SIMDUTF_ARM64_INTRINSICS_H
|
|
/* end file src/simdutf/arm64/intrinsics.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/bitmanipulation.h
|
|
/* begin file src/simdutf/arm64/bitmanipulation.h */
|
|
#ifndef SIMDUTF_ARM64_BITMANIPULATION_H
|
|
#define SIMDUTF_ARM64_BITMANIPULATION_H
|
|
|
|
namespace simdutf {
|
|
namespace arm64 {
|
|
namespace {
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdutf_really_inline int count_ones(uint64_t input_num) {
|
|
return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
|
|
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdutf
|
|
|
|
#endif // SIMDUTF_ARM64_BITMANIPULATION_H
|
|
/* end file src/simdutf/arm64/bitmanipulation.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/simd.h
|
|
/* begin file src/simdutf/arm64/simd.h */
|
|
#ifndef SIMDUTF_ARM64_SIMD_H
|
|
#define SIMDUTF_ARM64_SIMD_H
|
|
|
|
#include <type_traits>
|
|
|
|
|
|
namespace simdutf {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace simd {
|
|
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
namespace {
|
|
// Start of private section with Visual Studio workaround
|
|
|
|
|
|
/**
|
|
* make_uint8x16_t initializes a SIMD register (uint8x16_t).
|
|
* This is needed because, incredibly, the syntax uint8x16_t x = {1,2,3...}
|
|
* is not recognized under Visual Studio! This is a workaround.
|
|
* Using a std::initializer_list<uint8_t> as a parameter resulted in
|
|
* inefficient code. With the current approach, if the parameters are
|
|
* compile-time constants,
|
|
* GNU GCC compiles it to ldr, the same as uint8x16_t x = {1,2,3...}.
|
|
* You should not use this function except for compile-time constants:
|
|
* it is not efficient.
|
|
*/
|
|
simdutf_really_inline uint8x16_t make_uint8x16_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4,
|
|
uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8,
|
|
uint8_t x9, uint8_t x10, uint8_t x11, uint8_t x12,
|
|
uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16) {
|
|
// Doing a load like so end ups generating worse code.
|
|
// uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
|
|
// x9, x10,x11,x12,x13,x14,x15,x16};
|
|
// return vld1q_u8(array);
|
|
uint8x16_t x{};
|
|
// incredibly, Visual Studio does not allow x[0] = x1
|
|
x = vsetq_lane_u8(x1, x, 0);
|
|
x = vsetq_lane_u8(x2, x, 1);
|
|
x = vsetq_lane_u8(x3, x, 2);
|
|
x = vsetq_lane_u8(x4, x, 3);
|
|
x = vsetq_lane_u8(x5, x, 4);
|
|
x = vsetq_lane_u8(x6, x, 5);
|
|
x = vsetq_lane_u8(x7, x, 6);
|
|
x = vsetq_lane_u8(x8, x, 7);
|
|
x = vsetq_lane_u8(x9, x, 8);
|
|
x = vsetq_lane_u8(x10, x, 9);
|
|
x = vsetq_lane_u8(x11, x, 10);
|
|
x = vsetq_lane_u8(x12, x, 11);
|
|
x = vsetq_lane_u8(x13, x, 12);
|
|
x = vsetq_lane_u8(x14, x, 13);
|
|
x = vsetq_lane_u8(x15, x, 14);
|
|
x = vsetq_lane_u8(x16, x, 15);
|
|
return x;
|
|
}
|
|
|
|
// We have to do the same work for make_int8x16_t
|
|
simdutf_really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_t x4,
|
|
int8_t x5, int8_t x6, int8_t x7, int8_t x8,
|
|
int8_t x9, int8_t x10, int8_t x11, int8_t x12,
|
|
int8_t x13, int8_t x14, int8_t x15, int8_t x16) {
|
|
// Doing a load like so end ups generating worse code.
|
|
// int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
|
|
// x9, x10,x11,x12,x13,x14,x15,x16};
|
|
// return vld1q_s8(array);
|
|
int8x16_t x{};
|
|
// incredibly, Visual Studio does not allow x[0] = x1
|
|
x = vsetq_lane_s8(x1, x, 0);
|
|
x = vsetq_lane_s8(x2, x, 1);
|
|
x = vsetq_lane_s8(x3, x, 2);
|
|
x = vsetq_lane_s8(x4, x, 3);
|
|
x = vsetq_lane_s8(x5, x, 4);
|
|
x = vsetq_lane_s8(x6, x, 5);
|
|
x = vsetq_lane_s8(x7, x, 6);
|
|
x = vsetq_lane_s8(x8, x, 7);
|
|
x = vsetq_lane_s8(x9, x, 8);
|
|
x = vsetq_lane_s8(x10, x, 9);
|
|
x = vsetq_lane_s8(x11, x, 10);
|
|
x = vsetq_lane_s8(x12, x, 11);
|
|
x = vsetq_lane_s8(x13, x, 12);
|
|
x = vsetq_lane_s8(x14, x, 13);
|
|
x = vsetq_lane_s8(x15, x, 14);
|
|
x = vsetq_lane_s8(x16, x, 15);
|
|
return x;
|
|
}
|
|
|
|
// Builds a 64-bit uint8x8_t from eight scalar values (x1 -> lane 0 ...
// x8 -> lane 7), one lane intrinsic per element.
simdutf_really_inline uint8x8_t make_uint8x8_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4,
                                               uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8) {
  uint8x8_t reg{};
  reg = vset_lane_u8(x1, reg, 0);
  reg = vset_lane_u8(x2, reg, 1);
  reg = vset_lane_u8(x3, reg, 2);
  reg = vset_lane_u8(x4, reg, 3);
  reg = vset_lane_u8(x5, reg, 4);
  reg = vset_lane_u8(x6, reg, 5);
  reg = vset_lane_u8(x7, reg, 6);
  reg = vset_lane_u8(x8, reg, 7);
  return reg;
}
|
|
|
|
// Builds a uint16x8_t from eight scalar values (x1 -> lane 0 ... x8 -> lane 7),
// one lane intrinsic per element.
simdutf_really_inline uint16x8_t make_uint16x8_t(uint16_t x1, uint16_t x2, uint16_t x3, uint16_t x4,
                                                 uint16_t x5, uint16_t x6, uint16_t x7, uint16_t x8) {
  uint16x8_t reg{};
  reg = vsetq_lane_u16(x1, reg, 0);
  reg = vsetq_lane_u16(x2, reg, 1);
  reg = vsetq_lane_u16(x3, reg, 2);
  reg = vsetq_lane_u16(x4, reg, 3);
  reg = vsetq_lane_u16(x5, reg, 4);
  reg = vsetq_lane_u16(x6, reg, 5);
  reg = vsetq_lane_u16(x7, reg, 6);
  reg = vsetq_lane_u16(x8, reg, 7);
  return reg;
}
|
|
|
|
// Builds an int16x8_t from eight scalar values (x1 -> lane 0 ... x8 -> lane 7),
// one lane intrinsic per element.
simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1, int16_t x2, int16_t x3, int16_t x4,
                                               int16_t x5, int16_t x6, int16_t x7, int16_t x8) {
  // The accumulator has to be the signed vector type: it is the operand and
  // result type of vsetq_lane_s16 and the type this function returns.
  int16x8_t reg{};
  reg = vsetq_lane_s16(x1, reg, 0);
  reg = vsetq_lane_s16(x2, reg, 1);
  reg = vsetq_lane_s16(x3, reg, 2);
  reg = vsetq_lane_s16(x4, reg, 3);
  reg = vsetq_lane_s16(x5, reg, 4);
  reg = vsetq_lane_s16(x6, reg, 5);
  reg = vsetq_lane_s16(x7, reg, 6);
  reg = vsetq_lane_s16(x8, reg, 7);
  return reg;
}
|
|
|
|
|
|
// End of private section with Visual Studio workaround
|
|
} // namespace
|
|
#endif // SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
|
|
|
|
template<typename T>
|
|
struct simd8;
|
|
|
|
//
|
|
// Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t internally.
|
|
//
|
|
template<typename T, typename Mask=simd8<bool>>
|
|
struct base_u8 {
|
|
uint8x16_t value;
|
|
static const int SIZE = sizeof(value);
|
|
|
|
// Conversion from/to SIMD register
|
|
simdutf_really_inline base_u8(const uint8x16_t _value) : value(_value) {}
|
|
simdutf_really_inline operator const uint8x16_t&() const { return this->value; }
|
|
simdutf_really_inline operator uint8x16_t&() { return this->value; }
|
|
simdutf_really_inline T first() const { return vgetq_lane_u8(*this,0); }
|
|
simdutf_really_inline T last() const { return vgetq_lane_u8(*this,15); }
|
|
|
|
// Bit operations
|
|
simdutf_really_inline simd8<T> operator|(const simd8<T> other) const { return vorrq_u8(*this, other); }
|
|
simdutf_really_inline simd8<T> operator&(const simd8<T> other) const { return vandq_u8(*this, other); }
|
|
simdutf_really_inline simd8<T> operator^(const simd8<T> other) const { return veorq_u8(*this, other); }
|
|
simdutf_really_inline simd8<T> bit_andnot(const simd8<T> other) const { return vbicq_u8(*this, other); }
|
|
simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
|
|
simdutf_really_inline simd8<T>& operator|=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast | other; return *this_cast; }
|
|
simdutf_really_inline simd8<T>& operator&=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast & other; return *this_cast; }
|
|
simdutf_really_inline simd8<T>& operator^=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
|
|
|
|
friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return vceqq_u8(lhs, rhs); }
|
|
|
|
template<int N=1>
|
|
simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
|
|
return vextq_u8(prev_chunk, *this, 16 - N);
|
|
}
|
|
};
|
|
|
|
// SIMD byte mask type (returned by things like eq and gt)
|
|
template<>
|
|
struct simd8<bool>: base_u8<bool> {
|
|
typedef uint16_t bitmask_t;
|
|
typedef uint32_t bitmask2_t;
|
|
|
|
static simdutf_really_inline simd8<bool> splat(bool _value) { return vmovq_n_u8(uint8_t(-(!!_value))); }
|
|
|
|
simdutf_really_inline simd8(const uint8x16_t _value) : base_u8<bool>(_value) {}
|
|
// False constructor
|
|
simdutf_really_inline simd8() : simd8(vdupq_n_u8(0)) {}
|
|
// Splat constructor
|
|
simdutf_really_inline simd8(bool _value) : simd8(splat(_value)) {}
|
|
simdutf_really_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); }
|
|
|
|
// We return uint32_t instead of uint16_t because that seems to be more efficient for most
|
|
// purposes (cutting it down to uint16_t costs performance in some compilers).
|
|
simdutf_really_inline uint32_t to_bitmask() const {
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint8x16_t bit_mask = make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
|
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
|
|
#else
|
|
const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
|
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
|
|
#endif
|
|
auto minput = *this & bit_mask;
|
|
uint8x16_t tmp = vpaddq_u8(minput, minput);
|
|
tmp = vpaddq_u8(tmp, tmp);
|
|
tmp = vpaddq_u8(tmp, tmp);
|
|
return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
|
|
}
|
|
|
|
// Returns 4-bit out of each byte, alternating between the high 4 bits and low bits
|
|
// result it is 64 bit.
|
|
// This method is expected to be faster than none() and is equivalent
|
|
// when the vector register is the result of a comparison, with byte
|
|
// values 0xff and 0x00.
|
|
simdutf_really_inline uint64_t to_bitmask64() const {
|
|
return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(*this), 4)), 0);
|
|
}
|
|
|
|
simdutf_really_inline bool any() const { return vmaxvq_u8(*this) != 0; }
|
|
simdutf_really_inline bool none() const { return vmaxvq_u8(*this) == 0; }
|
|
simdutf_really_inline bool all() const { return vminvq_u8(*this) == 0xFF; }
|
|
|
|
|
|
};
|
|
|
|
// Unsigned bytes
|
|
template<>
|
|
struct simd8<uint8_t>: base_u8<uint8_t> {
|
|
static simdutf_really_inline simd8<uint8_t> splat(uint8_t _value) { return vmovq_n_u8(_value); }
|
|
static simdutf_really_inline simd8<uint8_t> zero() { return vdupq_n_u8(0); }
|
|
static simdutf_really_inline simd8<uint8_t> load(const uint8_t* values) { return vld1q_u8(values); }
|
|
simdutf_really_inline simd8(const uint8x16_t _value) : base_u8<uint8_t>(_value) {}
|
|
// Zero constructor
|
|
simdutf_really_inline simd8() : simd8(zero()) {}
|
|
// Array constructor
|
|
simdutf_really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
|
|
// Splat constructor
|
|
simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
|
|
// Member-by-member initialization
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
simdutf_really_inline simd8(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) : simd8(make_uint8x16_t(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
)) {}
|
|
#else
|
|
simdutf_really_inline simd8(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) : simd8(uint8x16_t{
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
}) {}
|
|
#endif
|
|
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdutf_really_inline static simd8<uint8_t> repeat_16(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) {
|
|
return simd8<uint8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Store to array
|
|
simdutf_really_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); }
|
|
|
|
// Saturated math
|
|
simdutf_really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return vqaddq_u8(*this, other); }
|
|
simdutf_really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return vqsubq_u8(*this, other); }
|
|
|
|
// Addition/subtraction are the same for signed and unsigned
|
|
simdutf_really_inline simd8<uint8_t> operator+(const simd8<uint8_t> other) const { return vaddq_u8(*this, other); }
|
|
simdutf_really_inline simd8<uint8_t> operator-(const simd8<uint8_t> other) const { return vsubq_u8(*this, other); }
|
|
simdutf_really_inline simd8<uint8_t>& operator+=(const simd8<uint8_t> other) { *this = *this + other; return *this; }
|
|
simdutf_really_inline simd8<uint8_t>& operator-=(const simd8<uint8_t> other) { *this = *this - other; return *this; }
|
|
|
|
// Order-specific operations
|
|
simdutf_really_inline uint8_t max_val() const { return vmaxvq_u8(*this); }
|
|
simdutf_really_inline uint8_t min_val() const { return vminvq_u8(*this); }
|
|
simdutf_really_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return vmaxq_u8(*this, other); }
|
|
simdutf_really_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return vminq_u8(*this, other); }
|
|
simdutf_really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return vcleq_u8(*this, other); }
|
|
simdutf_really_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return vcgeq_u8(*this, other); }
|
|
simdutf_really_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return vcltq_u8(*this, other); }
|
|
simdutf_really_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return vcgtq_u8(*this, other); }
|
|
// Same as >, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's.
|
|
simdutf_really_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this > other); }
|
|
// Same as <, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's.
|
|
simdutf_really_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this < other); }
|
|
|
|
// Bit-specific operations
|
|
simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return vtstq_u8(*this, bits); }
|
|
simdutf_really_inline bool is_ascii() const { return this->max_val() < 0b10000000u; }
|
|
|
|
simdutf_really_inline bool any_bits_set_anywhere() const { return this->max_val() != 0; }
|
|
simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set_anywhere(); }
|
|
template<int N>
|
|
simdutf_really_inline simd8<uint8_t> shr() const { return vshrq_n_u8(*this, N); }
|
|
template<int N>
|
|
simdutf_really_inline simd8<uint8_t> shl() const { return vshlq_n_u8(*this, N); }
|
|
|
|
// Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
|
|
template<typename L>
|
|
simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
|
|
return lookup_table.apply_lookup_16_to(*this);
|
|
}
|
|
|
|
|
|
template<typename L>
|
|
simdutf_really_inline simd8<L> lookup_16(
|
|
L replace0, L replace1, L replace2, L replace3,
|
|
L replace4, L replace5, L replace6, L replace7,
|
|
L replace8, L replace9, L replace10, L replace11,
|
|
L replace12, L replace13, L replace14, L replace15) const {
|
|
return lookup_16(simd8<L>::repeat_16(
|
|
replace0, replace1, replace2, replace3,
|
|
replace4, replace5, replace6, replace7,
|
|
replace8, replace9, replace10, replace11,
|
|
replace12, replace13, replace14, replace15
|
|
));
|
|
}
|
|
|
|
template<typename T>
|
|
simdutf_really_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) const {
|
|
return vqtbl1q_u8(*this, simd8<uint8_t>(original));
|
|
}
|
|
};
|
|
|
|
// Signed bytes
|
|
template<>
|
|
struct simd8<int8_t> {
|
|
int8x16_t value;
|
|
|
|
static simdutf_really_inline simd8<int8_t> splat(int8_t _value) { return vmovq_n_s8(_value); }
|
|
static simdutf_really_inline simd8<int8_t> zero() { return vdupq_n_s8(0); }
|
|
static simdutf_really_inline simd8<int8_t> load(const int8_t values[16]) { return vld1q_s8(values); }
|
|
template <endianness big_endian>
|
|
simdutf_really_inline void store_ascii_as_utf16(char16_t * p) const {
|
|
uint16x8_t first = vmovl_u8(vget_low_u8 (vreinterpretq_u8_s8(this->value)));
|
|
uint16x8_t second = vmovl_high_u8(vreinterpretq_u8_s8(this->value));
|
|
if (!match_system(big_endian)) {
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
#else
|
|
const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
|
|
#endif
|
|
first = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(first), swap));
|
|
second = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(second), swap));
|
|
}
|
|
vst1q_u16(reinterpret_cast<uint16_t*>(p), first);
|
|
vst1q_u16(reinterpret_cast<uint16_t*>(p + 8), second);
|
|
}
|
|
simdutf_really_inline void store_ascii_as_utf32(char32_t * p) const {
|
|
vst1q_u32(reinterpret_cast<uint32_t*>(p), vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8 (vreinterpretq_u8_s8(this->value))))));
|
|
vst1q_u32(reinterpret_cast<uint32_t*>(p + 4), vmovl_high_u16(vmovl_u8(vget_low_u8 (vreinterpretq_u8_s8(this->value)))));
|
|
vst1q_u32(reinterpret_cast<uint32_t*>(p + 8), vmovl_u16(vget_low_u16(vmovl_high_u8(vreinterpretq_u8_s8(this->value)))));
|
|
vst1q_u32(reinterpret_cast<uint32_t*>(p + 12), vmovl_high_u16(vmovl_high_u8(vreinterpretq_u8_s8(this->value))));
|
|
}
|
|
// Conversion from/to SIMD register
|
|
simdutf_really_inline simd8(const int8x16_t _value) : value{_value} {}
|
|
simdutf_really_inline operator const int8x16_t&() const { return this->value; }
|
|
simdutf_really_inline operator const uint8x16_t() const { return vreinterpretq_u8_s8(this->value); }
|
|
simdutf_really_inline operator int8x16_t&() { return this->value; }
|
|
|
|
// Zero constructor
|
|
simdutf_really_inline simd8() : simd8(zero()) {}
|
|
// Splat constructor
|
|
simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdutf_really_inline simd8(const int8_t* values) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
simdutf_really_inline simd8(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) : simd8(make_int8x16_t(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
)) {}
|
|
#else
|
|
simdutf_really_inline simd8(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) : simd8(int8x16_t{
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
}) {}
|
|
#endif
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdutf_really_inline static simd8<int8_t> repeat_16(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) {
|
|
return simd8<int8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Store to array
|
|
simdutf_really_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, value); }
|
|
// Explicit conversion to/from unsigned
|
|
//
|
|
// Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same type.
|
|
// In theory, we could check this occurrence with std::same_as and std::enabled_if but it is C++14
|
|
// and relatively ugly and hard to read.
|
|
#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
simdutf_really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
|
|
#endif
|
|
simdutf_really_inline operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }
|
|
|
|
simdutf_really_inline simd8<int8_t> operator|(const simd8<int8_t> other) const { return vorrq_s8(value, other.value); }
|
|
simdutf_really_inline simd8<int8_t> operator&(const simd8<int8_t> other) const { return vandq_s8(value, other.value); }
|
|
simdutf_really_inline simd8<int8_t> operator^(const simd8<int8_t> other) const { return veorq_s8(value, other.value); }
|
|
simdutf_really_inline simd8<int8_t> bit_andnot(const simd8<int8_t> other) const { return vbicq_s8(value, other.value); }
|
|
|
|
// Math
|
|
simdutf_really_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(value, other.value); }
|
|
simdutf_really_inline simd8<int8_t> operator-(const simd8<int8_t> other) const { return vsubq_s8(value, other.value); }
|
|
simdutf_really_inline simd8<int8_t>& operator+=(const simd8<int8_t> other) { *this = *this + other; return *this; }
|
|
simdutf_really_inline simd8<int8_t>& operator-=(const simd8<int8_t> other) { *this = *this - other; return *this; }
|
|
|
|
simdutf_really_inline int8_t max_val() const { return vmaxvq_s8(value); }
|
|
simdutf_really_inline int8_t min_val() const { return vminvq_s8(value); }
|
|
simdutf_really_inline bool is_ascii() const { return this->min_val() >= 0; }
|
|
|
|
// Order-sensitive comparisons
|
|
simdutf_really_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return vmaxq_s8(value, other.value); }
|
|
simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return vminq_s8(value, other.value); }
|
|
simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return vcgtq_s8(value, other.value); }
|
|
simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return vcltq_s8(value, other.value); }
|
|
simdutf_really_inline simd8<bool> operator==(const simd8<int8_t> other) const { return vceqq_s8(value, other.value); }
|
|
|
|
template<int N=1>
|
|
simdutf_really_inline simd8<int8_t> prev(const simd8<int8_t> prev_chunk) const {
|
|
return vextq_s8(prev_chunk, *this, 16 - N);
|
|
}
|
|
|
|
// Perform a lookup assuming no value is larger than 16
|
|
template<typename L>
|
|
simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
|
|
return lookup_table.apply_lookup_16_to(*this);
|
|
}
|
|
template<typename L>
|
|
simdutf_really_inline simd8<L> lookup_16(
|
|
L replace0, L replace1, L replace2, L replace3,
|
|
L replace4, L replace5, L replace6, L replace7,
|
|
L replace8, L replace9, L replace10, L replace11,
|
|
L replace12, L replace13, L replace14, L replace15) const {
|
|
return lookup_16(simd8<L>::repeat_16(
|
|
replace0, replace1, replace2, replace3,
|
|
replace4, replace5, replace6, replace7,
|
|
replace8, replace9, replace10, replace11,
|
|
replace12, replace13, replace14, replace15
|
|
));
|
|
}
|
|
|
|
template<typename T>
|
|
simdutf_really_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original) {
|
|
return vqtbl1q_s8(*this, simd8<uint8_t>(original));
|
|
}
|
|
};
|
|
|
|
template<typename T>
|
|
struct simd8x64 {
|
|
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
|
|
static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
|
|
simd8<T> chunks[NUM_CHUNKS];
|
|
|
|
simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
|
|
simd8x64<T>& operator=(const simd8<T> other) = delete; // no assignment allowed
|
|
simd8x64() = delete; // no default constructor allowed
|
|
|
|
simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
|
|
simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+2*sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+3*sizeof(simd8<T>)/sizeof(T))} {}
|
|
|
|
simdutf_really_inline void store(T* ptr) const {
|
|
this->chunks[0].store(ptr+sizeof(simd8<T>)*0/sizeof(T));
|
|
this->chunks[1].store(ptr+sizeof(simd8<T>)*1/sizeof(T));
|
|
this->chunks[2].store(ptr+sizeof(simd8<T>)*2/sizeof(T));
|
|
this->chunks[3].store(ptr+sizeof(simd8<T>)*3/sizeof(T));
|
|
}
|
|
|
|
|
|
simdutf_really_inline simd8x64<T>& operator |=(const simd8x64<T> &other) {
|
|
this->chunks[0] |= other.chunks[0];
|
|
this->chunks[1] |= other.chunks[1];
|
|
this->chunks[2] |= other.chunks[2];
|
|
this->chunks[3] |= other.chunks[3];
|
|
return *this;
|
|
}
|
|
|
|
simdutf_really_inline simd8<T> reduce_or() const {
|
|
return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
|
|
}
|
|
|
|
simdutf_really_inline bool is_ascii() const {
|
|
return reduce_or().is_ascii();
|
|
}
|
|
|
|
template <endianness endian>
|
|
simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
|
|
this->chunks[0].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*0);
|
|
this->chunks[1].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*1);
|
|
this->chunks[2].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*2);
|
|
this->chunks[3].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*3);
|
|
}
|
|
|
|
simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
|
|
this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*0);
|
|
this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*1);
|
|
this->chunks[2].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*2);
|
|
this->chunks[3].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*3);
|
|
}
|
|
|
|
simdutf_really_inline uint64_t to_bitmask() const {
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint8x16_t bit_mask = make_uint8x16_t(
|
|
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
|
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
|
|
);
|
|
#else
|
|
const uint8x16_t bit_mask = {
|
|
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
|
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
|
|
};
|
|
#endif
|
|
// Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
|
|
uint8x16_t sum0 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[0]), bit_mask), vandq_u8(uint8x16_t(this->chunks[1]), bit_mask));
|
|
uint8x16_t sum1 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[2]), bit_mask), vandq_u8(uint8x16_t(this->chunks[3]), bit_mask));
|
|
sum0 = vpaddq_u8(sum0, sum1);
|
|
sum0 = vpaddq_u8(sum0, sum0);
|
|
return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
|
|
}
|
|
|
|
simdutf_really_inline uint64_t eq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == mask,
|
|
this->chunks[1] == mask,
|
|
this->chunks[2] == mask,
|
|
this->chunks[3] == mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t lteq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] <= mask,
|
|
this->chunks[1] <= mask,
|
|
this->chunks[2] <= mask,
|
|
this->chunks[3] <= mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t in_range(const T low, const T high) const {
|
|
const simd8<T> mask_low = simd8<T>::splat(low);
|
|
const simd8<T> mask_high = simd8<T>::splat(high);
|
|
|
|
return simd8x64<bool>(
|
|
(this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
|
|
(this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
|
|
(this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
|
|
(this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
|
|
).to_bitmask();
|
|
}
|
|
simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
|
|
const simd8<T> mask_low = simd8<T>::splat(low);
|
|
const simd8<T> mask_high = simd8<T>::splat(high);
|
|
return simd8x64<bool>(
|
|
(this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
|
|
(this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
|
|
(this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
|
|
(this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)
|
|
).to_bitmask();
|
|
}
|
|
simdutf_really_inline uint64_t lt(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] < mask,
|
|
this->chunks[1] < mask,
|
|
this->chunks[2] < mask,
|
|
this->chunks[3] < mask
|
|
).to_bitmask();
|
|
}
|
|
simdutf_really_inline uint64_t gt(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] > mask,
|
|
this->chunks[1] > mask,
|
|
this->chunks[2] > mask,
|
|
this->chunks[3] > mask
|
|
).to_bitmask();
|
|
}
|
|
simdutf_really_inline uint64_t gteq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] >= mask,
|
|
this->chunks[1] >= mask,
|
|
this->chunks[2] >= mask,
|
|
this->chunks[3] >= mask
|
|
).to_bitmask();
|
|
}
|
|
simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
|
|
const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
|
|
return simd8x64<bool>(
|
|
simd8<uint8_t>(uint8x16_t(this->chunks[0])) >= mask,
|
|
simd8<uint8_t>(uint8x16_t(this->chunks[1])) >= mask,
|
|
simd8<uint8_t>(uint8x16_t(this->chunks[2])) >= mask,
|
|
simd8<uint8_t>(uint8x16_t(this->chunks[3])) >= mask
|
|
).to_bitmask();
|
|
}
|
|
}; // struct simd8x64<T>
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/simd16-inl.h
|
|
/* begin file src/simdutf/arm64/simd16-inl.h */
|
|
template<typename T>
|
|
struct simd16;
|
|
|
|
template<typename T, typename Mask=simd16<bool>>
|
|
struct base_u16 {
|
|
uint16x8_t value;
|
|
static const int SIZE = sizeof(value);
|
|
|
|
// Conversion from/to SIMD register
|
|
simdutf_really_inline base_u16() = default;
|
|
simdutf_really_inline base_u16(const uint16x8_t _value) : value(_value) {}
|
|
simdutf_really_inline operator const uint16x8_t&() const { return this->value; }
|
|
simdutf_really_inline operator uint16x8_t&() { return this->value; }
|
|
// Bit operations
|
|
simdutf_really_inline simd16<T> operator|(const simd16<T> other) const { return vorrq_u16(*this, other); }
|
|
simdutf_really_inline simd16<T> operator&(const simd16<T> other) const { return vandq_u16(*this, other); }
|
|
simdutf_really_inline simd16<T> operator^(const simd16<T> other) const { return veorq_u16(*this, other); }
|
|
simdutf_really_inline simd16<T> bit_andnot(const simd16<T> other) const { return vbicq_u16(*this, other); }
|
|
simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }
|
|
simdutf_really_inline simd16<T>& operator|=(const simd16<T> other) { auto this_cast = static_cast<simd16<T>*>(this); *this_cast = *this_cast | other; return *this_cast; }
|
|
simdutf_really_inline simd16<T>& operator&=(const simd16<T> other) { auto this_cast = static_cast<simd16<T>*>(this); *this_cast = *this_cast & other; return *this_cast; }
|
|
simdutf_really_inline simd16<T>& operator^=(const simd16<T> other) { auto this_cast = static_cast<simd16<T>*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
|
|
|
|
friend simdutf_really_inline Mask operator==(const simd16<T> lhs, const simd16<T> rhs) { return vceqq_u16(lhs, rhs); }
|
|
|
|
template<int N=1>
|
|
simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
|
|
return vextq_u18(prev_chunk, *this, 8 - N);
|
|
}
|
|
};
|
|
|
|
template<typename T, typename Mask=simd16<bool>>
|
|
struct base16: base_u16<T> {
|
|
typedef uint16_t bitmask_t;
|
|
typedef uint32_t bitmask2_t;
|
|
|
|
simdutf_really_inline base16() : base_u16<T>() {}
|
|
simdutf_really_inline base16(const uint16x8_t _value) : base_u16<T>(_value) {}
|
|
template <typename Pointer>
|
|
simdutf_really_inline base16(const Pointer* ptr) : base16(vld1q_u16(ptr)) {}
|
|
|
|
static const int SIZE = sizeof(base_u16<T>::value);
|
|
|
|
template<int N=1>
|
|
simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
|
|
return vextq_u18(prev_chunk, *this, 8 - N);
|
|
}
|
|
};
|
|
|
|
// SIMD byte mask type (returned by things like eq and gt)
|
|
template<>
|
|
struct simd16<bool>: base16<bool> {
|
|
static simdutf_really_inline simd16<bool> splat(bool _value) { return vmovq_n_u16(uint16_t(-(!!_value))); }
|
|
|
|
simdutf_really_inline simd16<bool>() : base16() {}
|
|
simdutf_really_inline simd16<bool>(const uint16x8_t _value) : base16<bool>(_value) {}
|
|
// Splat constructor
|
|
simdutf_really_inline simd16<bool>(bool _value) : base16<bool>(splat(_value)) {}
|
|
|
|
};
|
|
|
|
template<typename T>
|
|
struct base16_numeric: base16<T> {
|
|
static simdutf_really_inline simd16<T> splat(T _value) { return vmovq_n_u16(_value); }
|
|
static simdutf_really_inline simd16<T> zero() { return vdupq_n_u16(0); }
|
|
static simdutf_really_inline simd16<T> load(const T values[8]) {
|
|
return vld1q_u16(reinterpret_cast<const uint16_t*>(values));
|
|
}
|
|
|
|
simdutf_really_inline base16_numeric() : base16<T>() {}
|
|
simdutf_really_inline base16_numeric(const uint16x8_t _value) : base16<T>(_value) {}
|
|
|
|
// Store to array
|
|
simdutf_really_inline void store(T dst[8]) const { return vst1q_u16(dst, *this); }
|
|
|
|
// Override to distinguish from bool version
|
|
simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }
|
|
|
|
// Addition/subtraction are the same for signed and unsigned
|
|
simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return vaddq_u8(*this, other); }
|
|
simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return vsubq_u8(*this, other); }
|
|
simdutf_really_inline simd16<T>& operator+=(const simd16<T> other) { *this = *this + other; return *static_cast<simd16<T>*>(this); }
|
|
simdutf_really_inline simd16<T>& operator-=(const simd16<T> other) { *this = *this - other; return *static_cast<simd16<T>*>(this); }
|
|
};
|
|
|
|
// Signed words
|
|
template<>
|
|
struct simd16<int16_t> : base16_numeric<int16_t> {
|
|
simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
|
|
#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
simdutf_really_inline simd16(const uint16x8_t _value) : base16_numeric<int16_t>(_value) {}
|
|
#endif
|
|
simdutf_really_inline simd16(const int16x8_t _value) : base16_numeric<int16_t>(vreinterpretq_u16_s16(_value)) {}
|
|
|
|
// Splat constructor
|
|
simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
|
|
// Array constructor
|
|
simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {}
|
|
simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const int16_t*>(values))) {}
|
|
simdutf_really_inline operator simd16<uint16_t>() const;
|
|
simdutf_really_inline operator const uint16x8_t&() const { return this->value; }
|
|
simdutf_really_inline operator const int16x8_t() const { return vreinterpretq_s16_u16(this->value); }
|
|
|
|
simdutf_really_inline int16_t max_val() const { return vmaxvq_s16(vreinterpretq_s16_u16(this->value)); }
|
|
simdutf_really_inline int16_t min_val() const { return vminvq_s16(vreinterpretq_s16_u16(this->value)); }
|
|
// Order-sensitive comparisons
|
|
simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
|
|
simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
|
|
simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return vcgtq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
|
|
simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return vcltq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
|
|
};
|
|
|
|
|
|
|
|
|
|
// Unsigned words
|
|
template<>
|
|
struct simd16<uint16_t>: base16_numeric<uint16_t> {
|
|
simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
|
|
simdutf_really_inline simd16(const uint16x8_t _value) : base16_numeric<uint16_t>(_value) {}
|
|
|
|
// Splat constructor
|
|
simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
|
|
// Array constructor
|
|
simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {}
|
|
simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const uint16_t*>(values))) {}
|
|
|
|
|
|
simdutf_really_inline int16_t max_val() const { return vmaxvq_u16(*this); }
|
|
simdutf_really_inline int16_t min_val() const { return vminvq_u16(*this); }
|
|
// Saturated math
|
|
simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return vqaddq_u16(*this, other); }
|
|
simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return vqsubq_u16(*this, other); }
|
|
|
|
// Order-specific operations
|
|
simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return vmaxq_u16(*this, other); }
|
|
simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return vminq_u16(*this, other); }
|
|
// Same as >, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
|
|
// Same as <, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
|
|
simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return vcleq_u16(*this, other); }
|
|
simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return vcgeq_u16(*this, other); }
|
|
simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return vcgtq_u16(*this, other); }
|
|
simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return vcltq_u16(*this, other); }
|
|
|
|
// Bit-specific operations
|
|
simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
|
|
template<int N>
|
|
simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(vshrq_n_u16(*this, N)); }
|
|
template<int N>
|
|
simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(vshlq_n_u16(*this, N)); }
|
|
|
|
// logical operations
|
|
simdutf_really_inline simd16<uint16_t> operator|(const simd16<uint16_t> other) const { return vorrq_u16(*this, other); }
|
|
simdutf_really_inline simd16<uint16_t> operator&(const simd16<uint16_t> other) const { return vandq_u16(*this, other); }
|
|
simdutf_really_inline simd16<uint16_t> operator^(const simd16<uint16_t> other) const { return veorq_u16(*this, other); }
|
|
|
|
// Pack with the unsigned saturation two uint16_t words into single uint8_t vector
|
|
static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
|
|
return vqmovn_high_u16(vqmovn_u16(v0), v1);
|
|
}
|
|
|
|
// Change the endianness
|
|
simdutf_really_inline simd16<uint16_t> swap_bytes() const {
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
#else
|
|
const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
|
|
#endif
|
|
return vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(*this), swap));
|
|
}
|
|
};
|
|
// Views the signed vector as unsigned; the stored uint16x8_t register is passed through unchanged.
simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const {
  return simd16<uint16_t>(this->value);
}
|
|
|
|
|
|
template<typename T>
|
|
struct simd16x32 {
|
|
static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
|
|
static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
|
|
simd16<T> chunks[NUM_CHUNKS];
|
|
|
|
simd16x32(const simd16x32<T>& o) = delete; // no copy allowed
|
|
simd16x32<T>& operator=(const simd16<T> other) = delete; // no assignment allowed
|
|
simd16x32() = delete; // no default constructor allowed
|
|
|
|
simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1, const simd16<T> chunk2, const simd16<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
|
|
simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16<T>::load(ptr), simd16<T>::load(ptr+sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+2*sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+3*sizeof(simd16<T>)/sizeof(T))} {}
|
|
|
|
simdutf_really_inline void store(T* ptr) const {
|
|
this->chunks[0].store(ptr+sizeof(simd16<T>)*0/sizeof(T));
|
|
this->chunks[1].store(ptr+sizeof(simd16<T>)*1/sizeof(T));
|
|
this->chunks[2].store(ptr+sizeof(simd16<T>)*2/sizeof(T));
|
|
this->chunks[3].store(ptr+sizeof(simd16<T>)*3/sizeof(T));
|
|
}
|
|
|
|
simdutf_really_inline simd16<T> reduce_or() const {
|
|
return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
|
|
}
|
|
|
|
simdutf_really_inline bool is_ascii() const {
|
|
return reduce_or().is_ascii();
|
|
}
|
|
|
|
simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
|
|
this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*0);
|
|
this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*1);
|
|
this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*2);
|
|
this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*3);
|
|
}
|
|
|
|
simdutf_really_inline uint64_t to_bitmask() const {
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint8x16_t bit_mask = make_uint8x16_t(
|
|
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
|
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
|
|
);
|
|
#else
|
|
const uint8x16_t bit_mask = {
|
|
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
|
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
|
|
};
|
|
#endif
|
|
// Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
|
|
uint8x16_t sum0 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[0] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[1] & vreinterpretq_u16_u8(bit_mask)));
|
|
uint8x16_t sum1 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[2] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[3] & vreinterpretq_u16_u8(bit_mask)));
|
|
sum0 = vpaddq_u8(sum0, sum1);
|
|
sum0 = vpaddq_u8(sum0, sum0);
|
|
return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
|
|
}
|
|
|
|
simdutf_really_inline void swap_bytes() {
|
|
this->chunks[0] = this->chunks[0].swap_bytes();
|
|
this->chunks[1] = this->chunks[1].swap_bytes();
|
|
this->chunks[2] = this->chunks[2].swap_bytes();
|
|
this->chunks[3] = this->chunks[3].swap_bytes();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t eq(const T m) const {
|
|
const simd16<T> mask = simd16<T>::splat(m);
|
|
return simd16x32<bool>(
|
|
this->chunks[0] == mask,
|
|
this->chunks[1] == mask,
|
|
this->chunks[2] == mask,
|
|
this->chunks[3] == mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t lteq(const T m) const {
|
|
const simd16<T> mask = simd16<T>::splat(m);
|
|
return simd16x32<bool>(
|
|
this->chunks[0] <= mask,
|
|
this->chunks[1] <= mask,
|
|
this->chunks[2] <= mask,
|
|
this->chunks[3] <= mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t in_range(const T low, const T high) const {
|
|
const simd16<T> mask_low = simd16<T>::splat(low);
|
|
const simd16<T> mask_high = simd16<T>::splat(high);
|
|
|
|
return simd16x32<bool>(
|
|
(this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
|
|
(this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
|
|
(this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
|
|
(this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
|
|
).to_bitmask();
|
|
}
|
|
simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
|
|
const simd16<T> mask_low = simd16<T>::splat(low);
|
|
const simd16<T> mask_high = simd16<T>::splat(high);
|
|
return simd16x32<bool>(
|
|
(this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
|
|
(this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
|
|
(this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
|
|
(this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)
|
|
).to_bitmask();
|
|
}
|
|
simdutf_really_inline uint64_t lt(const T m) const {
|
|
const simd16<T> mask = simd16<T>::splat(m);
|
|
return simd16x32<bool>(
|
|
this->chunks[0] < mask,
|
|
this->chunks[1] < mask,
|
|
this->chunks[2] < mask,
|
|
this->chunks[3] < mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
}; // struct simd16x32<T>
|
|
template<>
|
|
simdutf_really_inline uint64_t simd16x32<uint16_t>::not_in_range(const uint16_t low, const uint16_t high) const {
|
|
const simd16<uint16_t> mask_low = simd16<uint16_t>::splat(low);
|
|
const simd16<uint16_t> mask_high = simd16<uint16_t>::splat(high);
|
|
simd16x32<uint16_t> x(
|
|
simd16<uint16_t>((this->chunks[0] > mask_high) | (this->chunks[0] < mask_low)),
|
|
simd16<uint16_t>((this->chunks[1] > mask_high) | (this->chunks[1] < mask_low)),
|
|
simd16<uint16_t>((this->chunks[2] > mask_high) | (this->chunks[2] < mask_low)),
|
|
simd16<uint16_t>((this->chunks[3] > mask_high) | (this->chunks[3] < mask_low))
|
|
);
|
|
return x.to_bitmask();
|
|
}
|
|
/* end file src/simdutf/arm64/simd16-inl.h */
|
|
} // namespace simd
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdutf
|
|
|
|
#endif // SIMDUTF_ARM64_SIMD_H
|
|
/* end file src/simdutf/arm64/simd.h */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/end.h
|
|
/* begin file src/simdutf/arm64/end.h */
|
|
/* end file src/simdutf/arm64/end.h */
|
|
|
|
#endif // SIMDUTF_IMPLEMENTATION_ARM64
|
|
|
|
#endif // SIMDUTF_ARM64_H
|
|
/* end file src/simdutf/arm64.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake.h
|
|
/* begin file src/simdutf/icelake.h */
|
|
#ifndef SIMDUTF_ICELAKE_H
|
|
#define SIMDUTF_ICELAKE_H
|
|
|
|
|
|
|
|
#ifdef __has_include
|
|
// How do we detect that a compiler supports vbmi2?
|
|
// For sure if the following header is found, we are ok?
|
|
#if __has_include(<avx512vbmi2intrin.h>)
|
|
#define SIMDUTF_COMPILER_SUPPORTS_VBMI2 1
|
|
#endif
|
|
#endif
|
|
|
|
#ifdef _MSC_VER
|
|
#if _MSC_VER >= 1930
|
|
// Visual Studio 2022 and up support VBMI2 under x64 even if the header
|
|
// avx512vbmi2intrin.h is not found.
|
|
// Visual Studio 2019 technically supports VBMI2, but the implementation
|
|
// might be unreliable. Search for visualstudio2019icelakeissue in our
|
|
// tests.
|
|
#define SIMDUTF_COMPILER_SUPPORTS_VBMI2 1
|
|
#endif
|
|
#endif
|
|
|
|
// We allow icelake on x64 as long as the compiler is known to support VBMI2.
|
|
#ifndef SIMDUTF_IMPLEMENTATION_ICELAKE
|
|
#define SIMDUTF_IMPLEMENTATION_ICELAKE ((SIMDUTF_IS_X86_64) && (SIMDUTF_COMPILER_SUPPORTS_VBMI2))
|
|
#endif
|
|
|
|
// To see why (__BMI__) && (__LZCNT__) are not part of this next line, see
|
|
// https://github.com/simdutf/simdutf/issues/1247
|
|
#define SIMDUTF_CAN_ALWAYS_RUN_ICELAKE ((SIMDUTF_IMPLEMENTATION_ICELAKE) && (SIMDUTF_IS_X86_64) && (__AVX2__) && (SIMDUTF_HAS_AVX512F && \
|
|
SIMDUTF_HAS_AVX512DQ && \
|
|
SIMDUTF_HAS_AVX512VL && \
|
|
SIMDUTF_HAS_AVX512VBMI2) && (!SIMDUTF_IS_32BITS))
|
|
|
|
#if SIMDUTF_IMPLEMENTATION_ICELAKE
|
|
#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
|
|
#define SIMDUTF_TARGET_ICELAKE
|
|
#else
|
|
#define SIMDUTF_TARGET_ICELAKE SIMDUTF_TARGET_REGION("avx512f,avx512dq,avx512cd,avx512bw,avx512vbmi,avx512vbmi2,avx512vl,avx2,bmi,bmi2,pclmul,lzcnt,popcnt")
|
|
#endif
|
|
|
|
namespace simdutf {
|
|
namespace icelake {
|
|
} // namespace icelake
|
|
} // namespace simdutf
|
|
|
|
|
|
|
|
//
|
|
// These two need to be included outside SIMDUTF_TARGET_REGION
|
|
//
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/intrinsics.h
|
|
/* begin file src/simdutf/icelake/intrinsics.h */
|
|
#ifndef SIMDUTF_ICELAKE_INTRINSICS_H
|
|
#define SIMDUTF_ICELAKE_INTRINSICS_H
|
|
|
|
|
|
#ifdef SIMDUTF_VISUAL_STUDIO
|
|
// under clang within visual studio, this will include <x86intrin.h>
|
|
#include <intrin.h> // visual studio or clang
|
|
#include <immintrin.h>
|
|
#else
|
|
|
|
#if SIMDUTF_GCC11ORMORE
|
|
// We should not get warnings while including <x86intrin.h> yet we do
|
|
// under some versions of GCC.
|
|
// If the x86intrin.h header has uninitialized values that are problematic,
|
|
// it is a GCC issue, we want to ignore these warnings.
|
|
SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
|
|
#endif
|
|
|
|
#include <x86intrin.h> // elsewhere
|
|
|
|
|
|
#if SIMDUTF_GCC11ORMORE
|
|
// cancels the suppression of the -Wuninitialized
|
|
SIMDUTF_POP_DISABLE_WARNINGS
|
|
#endif
|
|
|
|
#ifndef _tzcnt_u64
|
|
#define _tzcnt_u64(x) __tzcnt_u64(x)
|
|
#endif // _tzcnt_u64
|
|
#endif // SIMDUTF_VISUAL_STUDIO
|
|
|
|
#ifdef SIMDUTF_CLANG_VISUAL_STUDIO
|
|
/**
|
|
* You are not supposed, normally, to include these
|
|
* headers directly. Instead you should either include intrin.h
|
|
* or x86intrin.h. However, when compiling with clang
|
|
* under Windows (i.e., when _MSC_VER is set), these headers
|
|
* only get included *if* the corresponding features are detected
|
|
* from macros:
|
|
* e.g., if __AVX2__ is set... in turn, we normally set these
|
|
* macros by compiling against the corresponding architecture
|
|
* (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole
|
|
* software with these advanced instructions. In simdutf, we
|
|
* want to compile the whole program for a generic target,
|
|
* and only target our specific kernels. As a workaround,
|
|
* we directly include the needed headers. These headers would
|
|
* normally guard against such usage, but we carefully included
|
|
* <x86intrin.h> (or <intrin.h>) before, so the headers
|
|
* are fooled.
|
|
*/
|
|
#include <bmiintrin.h> // for _blsr_u64
|
|
#include <bmi2intrin.h> // for _pext_u64, _pdep_u64
|
|
#include <lzcntintrin.h> // for __lzcnt64
|
|
#include <immintrin.h> // for most things (AVX2, AVX512, _popcnt64)
|
|
#include <smmintrin.h>
|
|
#include <tmmintrin.h>
|
|
#include <avxintrin.h>
|
|
#include <avx2intrin.h>
|
|
// Important: we need the AVX-512 headers:
|
|
#include <avx512fintrin.h>
|
|
#include <avx512dqintrin.h>
|
|
#include <avx512cdintrin.h>
|
|
#include <avx512bwintrin.h>
|
|
#include <avx512vlintrin.h>
|
|
#include <avx512vlbwintrin.h>
|
|
#include <avx512vbmiintrin.h>
|
|
#include <avx512vbmi2intrin.h>
|
|
// unfortunately, we may not get _blsr_u64, but, thankfully, clang
|
|
// has it as a macro.
|
|
#ifndef _blsr_u64
|
|
// we roll our own
|
|
// Fallback for BLSR: clears the lowest set bit of n. The argument is
// parenthesized so that expression arguments such as `a | b` expand
// correctly; note that n is still evaluated twice.
#define _blsr_u64(n) (((n) - 1) & (n))
|
|
#endif // _blsr_u64
|
|
#endif // SIMDUTF_CLANG_VISUAL_STUDIO
|
|
|
|
|
|
|
|
#if defined(__GNUC__) && !defined(__clang__)
|
|
|
|
#if __GNUC__ == 8
|
|
#define SIMDUTF_GCC8 1
|
|
#elif __GNUC__ == 9
|
|
#define SIMDUTF_GCC9 1
|
|
#endif // __GNUC__ == 8 || __GNUC__ == 9
|
|
|
|
#endif // defined(__GNUC__) && !defined(__clang__)
|
|
|
|
#if SIMDUTF_GCC8
|
|
#pragma GCC push_options
|
|
#pragma GCC target("avx512f")
|
|
/**
|
|
* GCC 8 fails to provide _mm512_set_epi8. We roll our own.
|
|
*/
|
|
// Replacement for the missing _mm512_set_epi8: the 64 byte arguments are
// grouped eight at a time into 64-bit words (first byte of each group in the
// most significant position) and handed to _mm512_set_epi64 in the same order.
inline __m512i _mm512_set_epi8(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8, uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16, uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24, uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32, uint8_t a33, uint8_t a34, uint8_t a35, uint8_t a36, uint8_t a37, uint8_t a38, uint8_t a39, uint8_t a40, uint8_t a41, uint8_t a42, uint8_t a43, uint8_t a44, uint8_t a45, uint8_t a46, uint8_t a47, uint8_t a48, uint8_t a49, uint8_t a50, uint8_t a51, uint8_t a52, uint8_t a53, uint8_t a54, uint8_t a55, uint8_t a56, uint8_t a57, uint8_t a58, uint8_t a59, uint8_t a60, uint8_t a61, uint8_t a62, uint8_t a63) {
  // Packs eight bytes into one 64-bit word; m7 lands in bits 56..63, m0 in bits 0..7.
  const auto pack = [](uint8_t m7, uint8_t m6, uint8_t m5, uint8_t m4, uint8_t m3, uint8_t m2, uint8_t m1, uint8_t m0) -> uint64_t {
    return uint64_t(m0) + (uint64_t(m1) << 8) + (uint64_t(m2) << 16) + (uint64_t(m3) << 24) + (uint64_t(m4) << 32) + (uint64_t(m5) << 40) + (uint64_t(m6) << 48) + (uint64_t(m7) << 56);
  };
  return _mm512_set_epi64(
    pack(a0,  a1,  a2,  a3,  a4,  a5,  a6,  a7),
    pack(a8,  a9,  a10, a11, a12, a13, a14, a15),
    pack(a16, a17, a18, a19, a20, a21, a22, a23),
    pack(a24, a25, a26, a27, a28, a29, a30, a31),
    pack(a32, a33, a34, a35, a36, a37, a38, a39),
    pack(a40, a41, a42, a43, a44, a45, a46, a47),
    pack(a48, a49, a50, a51, a52, a53, a54, a55),
    pack(a56, a57, a58, a59, a60, a61, a62, a63));
}
|
|
#pragma GCC pop_options
|
|
#endif // SIMDUTF_GCC8
|
|
|
|
#endif // SIMDUTF_ICELAKE_INTRINSICS_H
|
|
/* end file src/simdutf/icelake/intrinsics.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/implementation.h
|
|
/* begin file src/simdutf/icelake/implementation.h */
|
|
#ifndef SIMDUTF_ICELAKE_IMPLEMENTATION_H
|
|
#define SIMDUTF_ICELAKE_IMPLEMENTATION_H
|
|
|
|
|
|
namespace simdutf {
|
|
namespace icelake {
|
|
|
|
namespace {
|
|
using namespace simdutf;
|
|
}
|
|
|
|
class implementation final : public simdutf::implementation {
|
|
public:
|
|
simdutf_really_inline implementation() : simdutf::implementation(
|
|
"icelake",
|
|
"Intel AVX512 (AVX-512BW, AVX-512CD, AVX-512VL, AVX-512VBMI2 extensions)",
|
|
internal::instruction_set::AVX2 | internal::instruction_set::BMI1 | internal::instruction_set::BMI2 | internal::instruction_set::AVX512BW | internal::instruction_set::AVX512CD | internal::instruction_set::AVX512VL | internal::instruction_set::AVX512VBMI2 ) {}
|
|
simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
|
|
simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
|
|
simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
|
|
simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
|
|
simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
|
|
simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
|
|
};
|
|
|
|
} // namespace icelake
|
|
} // namespace simdutf
|
|
|
|
#endif // SIMDUTF_ICELAKE_IMPLEMENTATION_H
|
|
/* end file src/simdutf/icelake/implementation.h */
|
|
|
|
//
|
|
// The rest need to be inside the region
|
|
//
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/begin.h
|
|
/* begin file src/simdutf/icelake/begin.h */
|
|
// redefining SIMDUTF_IMPLEMENTATION to "icelake"
|
|
// #define SIMDUTF_IMPLEMENTATION icelake
|
|
|
|
#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
|
|
// nothing needed.
|
|
#else
|
|
SIMDUTF_TARGET_ICELAKE
|
|
#endif
|
|
|
|
#if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
|
|
SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
|
|
#endif // end of workaround
|
|
/* end file src/simdutf/icelake/begin.h */
|
|
// Declarations
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/bitmanipulation.h
|
|
/* begin file src/simdutf/icelake/bitmanipulation.h */
|
|
#ifndef SIMDUTF_ICELAKE_BITMANIPULATION_H
|
|
#define SIMDUTF_ICELAKE_BITMANIPULATION_H
|
|
|
|
namespace simdutf {
|
|
namespace icelake {
|
|
namespace {
|
|
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
|
|
// note: we do not support legacy 32-bit Windows
|
|
return __popcnt64(input_num);// Visual Studio wants two underscores
|
|
}
|
|
#else
|
|
simdutf_really_inline long long int count_ones(uint64_t input_num) {
|
|
return _popcnt64(input_num);
|
|
}
|
|
#endif
|
|
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdutf
|
|
|
|
#endif // SIMDUTF_ICELAKE_BITMANIPULATION_H
|
|
/* end file src/simdutf/icelake/bitmanipulation.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/end.h
|
|
/* begin file src/simdutf/icelake/end.h */
|
|
#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
|
|
// nothing needed.
|
|
#else
|
|
SIMDUTF_UNTARGET_REGION
|
|
#endif
|
|
|
|
|
|
#if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
|
|
SIMDUTF_POP_DISABLE_WARNINGS
|
|
#endif // end of workaround
|
|
/* end file src/simdutf/icelake/end.h */
|
|
|
|
|
|
|
|
#endif // SIMDUTF_IMPLEMENTATION_ICELAKE
|
|
#endif // SIMDUTF_ICELAKE_H
|
|
/* end file src/simdutf/icelake.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell.h
|
|
/* begin file src/simdutf/haswell.h */
|
|
#ifndef SIMDUTF_HASWELL_H
|
|
#define SIMDUTF_HASWELL_H
|
|
|
|
#ifdef SIMDUTF_WESTMERE_H
|
|
#error "haswell.h must be included before westmere.h"
|
|
#endif
|
|
#ifdef SIMDUTF_FALLBACK_H
|
|
#error "haswell.h must be included before fallback.h"
|
|
#endif
|
|
|
|
|
|
// Default Haswell to on if this is x86-64. Even if we're not compiled for it, it could be selected
|
|
// at runtime.
|
|
#ifndef SIMDUTF_IMPLEMENTATION_HASWELL
|
|
//
|
|
// You do not want to restrict it like so: SIMDUTF_IS_X86_64 && __AVX2__
|
|
// because we want to rely on *runtime dispatch*.
|
|
//
|
|
#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
|
|
#define SIMDUTF_IMPLEMENTATION_HASWELL 0
|
|
#else
|
|
#define SIMDUTF_IMPLEMENTATION_HASWELL (SIMDUTF_IS_X86_64)
|
|
#endif
|
|
|
|
#endif
|
|
// To see why (__BMI__) && (__LZCNT__) are not part of this next line, see
|
|
// https://github.com/simdutf/simdutf/issues/1247
|
|
#define SIMDUTF_CAN_ALWAYS_RUN_HASWELL ((SIMDUTF_IMPLEMENTATION_HASWELL) && (SIMDUTF_IS_X86_64) && (__AVX2__))
|
|
|
|
#if SIMDUTF_IMPLEMENTATION_HASWELL
|
|
|
|
#define SIMDUTF_TARGET_HASWELL SIMDUTF_TARGET_REGION("avx2,bmi,lzcnt,popcnt")
|
|
|
|
namespace simdutf {
|
|
/**
|
|
* Implementation for Haswell (Intel AVX2).
|
|
*/
|
|
namespace haswell {
|
|
} // namespace haswell
|
|
} // namespace simdutf
|
|
|
|
//
|
|
// These two need to be included outside SIMDUTF_TARGET_REGION
|
|
//
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/implementation.h
|
|
/* begin file src/simdutf/haswell/implementation.h */
|
|
#ifndef SIMDUTF_HASWELL_IMPLEMENTATION_H
|
|
#define SIMDUTF_HASWELL_IMPLEMENTATION_H
|
|
|
|
|
|
// The constructor may be executed on any host, so we take care not to use SIMDUTF_TARGET_REGION
|
|
namespace simdutf {
|
|
namespace haswell {
|
|
|
|
using namespace simdutf;
|
|
|
|
class implementation final : public simdutf::implementation {
|
|
public:
|
|
simdutf_really_inline implementation() : simdutf::implementation(
|
|
"haswell",
|
|
"Intel/AMD AVX2",
|
|
internal::instruction_set::AVX2 | internal::instruction_set::BMI1 | internal::instruction_set::BMI2
|
|
) {}
|
|
simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
|
|
simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
|
|
simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
|
|
simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
|
|
simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
|
|
simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
|
|
};
|
|
|
|
} // namespace haswell
|
|
} // namespace simdutf
|
|
|
|
#endif // SIMDUTF_HASWELL_IMPLEMENTATION_H
|
|
/* end file src/simdutf/haswell/implementation.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/intrinsics.h
|
|
/* begin file src/simdutf/haswell/intrinsics.h */
|
|
#ifndef SIMDUTF_HASWELL_INTRINSICS_H
|
|
#define SIMDUTF_HASWELL_INTRINSICS_H
|
|
|
|
|
|
#ifdef SIMDUTF_VISUAL_STUDIO
|
|
// under clang within visual studio, this will include <x86intrin.h>
|
|
#include <intrin.h> // visual studio or clang
|
|
#else
|
|
|
|
#if SIMDUTF_GCC11ORMORE
|
|
// We should not get warnings while including <x86intrin.h> yet we do
|
|
// under some versions of GCC.
|
|
// If the x86intrin.h header has uninitialized values that are problematic,
|
|
// it is a GCC issue; we want to ignore these warnings.
|
|
SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
|
|
#endif
|
|
|
|
#include <x86intrin.h> // elsewhere
|
|
|
|
|
|
#if SIMDUTF_GCC11ORMORE
|
|
// cancels the suppression of the -Wuninitialized
|
|
SIMDUTF_POP_DISABLE_WARNINGS
|
|
#endif
|
|
|
|
#endif // SIMDUTF_VISUAL_STUDIO
|
|
|
|
#ifdef SIMDUTF_CLANG_VISUAL_STUDIO
|
|
/**
|
|
* You are not supposed, normally, to include these
|
|
* headers directly. Instead you should either include intrin.h
|
|
* or x86intrin.h. However, when compiling with clang
|
|
* under Windows (i.e., when _MSC_VER is set), these headers
|
|
* only get included *if* the corresponding features are detected
|
|
* from macros:
|
|
* e.g., if __AVX2__ is set... in turn, we normally set these
|
|
* macros by compiling against the corresponding architecture
|
|
* (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole
|
|
* software with these advanced instructions. In simdutf, we
|
|
* want to compile the whole program for a generic target,
|
|
* and only target our specific kernels. As a workaround,
|
|
* we directly include the needed headers. These headers would
|
|
* normally guard against such usage, but we carefully included
|
|
* <x86intrin.h> (or <intrin.h>) before, so the headers
|
|
* are fooled.
|
|
*/
|
|
#include <bmiintrin.h> // for _blsr_u64
|
|
#include <lzcntintrin.h> // for __lzcnt64
|
|
#include <immintrin.h> // for most things (AVX2, AVX512, _popcnt64)
|
|
#include <smmintrin.h>
|
|
#include <tmmintrin.h>
|
|
#include <avxintrin.h>
|
|
#include <avx2intrin.h>
|
|
// unfortunately, we may not get _blsr_u64, but, thankfully, clang
|
|
// has it as a macro.
|
|
#ifndef _blsr_u64
// Fallback used only when no _blsr_u64 macro is already defined: clears the
// lowest set bit of n. The argument is fully parenthesized so that an
// expression argument such as `a | b` keeps its meaning after expansion.
// Note that n is still evaluated twice, so it must be free of side effects.
#define _blsr_u64(n) (((n) - 1) & (n))
#endif // _blsr_u64
|
|
#endif // SIMDUTF_CLANG_VISUAL_STUDIO
|
|
|
|
#endif // SIMDUTF_HASWELL_INTRINSICS_H
|
|
/* end file src/simdutf/haswell/intrinsics.h */
|
|
|
|
//
|
|
// The rest need to be inside the region
|
|
//
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/begin.h
|
|
/* begin file src/simdutf/haswell/begin.h */
|
|
// redefining SIMDUTF_IMPLEMENTATION to "haswell"
|
|
// #define SIMDUTF_IMPLEMENTATION haswell
|
|
|
|
#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
|
|
// nothing needed.
|
|
#else
|
|
SIMDUTF_TARGET_HASWELL
|
|
#endif
|
|
|
|
#if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
|
|
SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
|
|
#endif // end of workaround
|
|
/* end file src/simdutf/haswell/begin.h */
|
|
// Declarations
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/bitmanipulation.h
|
|
/* begin file src/simdutf/haswell/bitmanipulation.h */
|
|
#ifndef SIMDUTF_HASWELL_BITMANIPULATION_H
|
|
#define SIMDUTF_HASWELL_BITMANIPULATION_H
|
|
|
|
namespace simdutf {
|
|
namespace haswell {
|
|
namespace {
|
|
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
|
|
// note: we do not support legacy 32-bit Windows
|
|
return __popcnt64(input_num);// Visual Studio wants two underscores
|
|
}
|
|
#else
|
|
simdutf_really_inline long long int count_ones(uint64_t input_num) {
|
|
return _popcnt64(input_num);
|
|
}
|
|
#endif
|
|
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdutf
|
|
|
|
#endif // SIMDUTF_HASWELL_BITMANIPULATION_H
|
|
/* end file src/simdutf/haswell/bitmanipulation.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/simd.h
|
|
/* begin file src/simdutf/haswell/simd.h */
|
|
#ifndef SIMDUTF_HASWELL_SIMD_H
|
|
#define SIMDUTF_HASWELL_SIMD_H
|
|
|
|
|
|
namespace simdutf {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace simd {
|
|
|
|
// Forward-declared so they can be used by splat and friends.
|
|
template<typename Child>
|
|
struct base {
|
|
__m256i value;
|
|
|
|
// Zero constructor
|
|
simdutf_really_inline base() : value{__m256i()} {}
|
|
|
|
// Conversion from SIMD register
|
|
simdutf_really_inline base(const __m256i _value) : value(_value) {}
|
|
// Conversion to SIMD register
|
|
simdutf_really_inline operator const __m256i&() const { return this->value; }
|
|
simdutf_really_inline operator __m256i&() { return this->value; }
|
|
template <endianness big_endian>
|
|
simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
|
|
__m256i first = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(*this));
|
|
__m256i second = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(*this,1));
|
|
if (big_endian) {
|
|
const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
|
|
17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
|
|
first = _mm256_shuffle_epi8(first, swap);
|
|
second = _mm256_shuffle_epi8(second, swap);
|
|
}
|
|
_mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), first);
|
|
_mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), second);
|
|
}
|
|
simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
|
|
_mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(*this)));
|
|
_mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr+8), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(_mm256_srli_si256(*this,8))));
|
|
_mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), _mm256_cvtepu8_epi32(_mm256_extractf128_si256(*this,1)));
|
|
_mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 24), _mm256_cvtepu8_epi32(_mm_srli_si128(_mm256_extractf128_si256(*this,1),8)));
|
|
}
|
|
// Bit operations
|
|
simdutf_really_inline Child operator|(const Child other) const { return _mm256_or_si256(*this, other); }
|
|
simdutf_really_inline Child operator&(const Child other) const { return _mm256_and_si256(*this, other); }
|
|
simdutf_really_inline Child operator^(const Child other) const { return _mm256_xor_si256(*this, other); }
|
|
simdutf_really_inline Child bit_andnot(const Child other) const { return _mm256_andnot_si256(other, *this); }
|
|
simdutf_really_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
|
|
simdutf_really_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
|
|
simdutf_really_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
|
|
};
|
|
|
|
// Forward-declared so they can be used by splat and friends.
|
|
template<typename T>
|
|
struct simd8;
|
|
|
|
template<typename T, typename Mask=simd8<bool>>
|
|
struct base8: base<simd8<T>> {
|
|
typedef uint32_t bitmask_t;
|
|
typedef uint64_t bitmask2_t;
|
|
|
|
simdutf_really_inline base8() : base<simd8<T>>() {}
|
|
simdutf_really_inline base8(const __m256i _value) : base<simd8<T>>(_value) {}
|
|
simdutf_really_inline T first() const { return _mm256_extract_epi8(*this,0); }
|
|
simdutf_really_inline T last() const { return _mm256_extract_epi8(*this,31); }
|
|
friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return _mm256_cmpeq_epi8(lhs, rhs); }
|
|
|
|
static const int SIZE = sizeof(base<T>::value);
|
|
|
|
template<int N=1>
|
|
simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
|
|
return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N);
|
|
}
|
|
};
|
|
|
|
// SIMD byte mask type (returned by things like eq and gt)
|
|
template<>
|
|
struct simd8<bool>: base8<bool> {
|
|
static simdutf_really_inline simd8<bool> splat(bool _value) { return _mm256_set1_epi8(uint8_t(-(!!_value))); }
|
|
|
|
simdutf_really_inline simd8<bool>() : base8() {}
|
|
simdutf_really_inline simd8<bool>(const __m256i _value) : base8<bool>(_value) {}
|
|
// Splat constructor
|
|
simdutf_really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {}
|
|
|
|
simdutf_really_inline uint32_t to_bitmask() const { return uint32_t(_mm256_movemask_epi8(*this)); }
|
|
simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); }
|
|
simdutf_really_inline bool none() const { return _mm256_testz_si256(*this, *this); }
|
|
simdutf_really_inline bool all() const { return static_cast<uint32_t>(_mm256_movemask_epi8(*this)) == 0xFFFFFFFF; }
|
|
simdutf_really_inline simd8<bool> operator~() const { return *this ^ true; }
|
|
};
|
|
|
|
template<typename T>
|
|
struct base8_numeric: base8<T> {
|
|
static simdutf_really_inline simd8<T> splat(T _value) { return _mm256_set1_epi8(_value); }
|
|
static simdutf_really_inline simd8<T> zero() { return _mm256_setzero_si256(); }
|
|
static simdutf_really_inline simd8<T> load(const T values[32]) {
|
|
return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
|
|
}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
static simdutf_really_inline simd8<T> repeat_16(
|
|
T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
|
|
T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
|
|
) {
|
|
return simd8<T>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
simdutf_really_inline base8_numeric() : base8<T>() {}
|
|
simdutf_really_inline base8_numeric(const __m256i _value) : base8<T>(_value) {}
|
|
|
|
// Store to array
|
|
simdutf_really_inline void store(T dst[32]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); }
|
|
|
|
// Addition/subtraction are the same for signed and unsigned
|
|
simdutf_really_inline simd8<T> operator+(const simd8<T> other) const { return _mm256_add_epi8(*this, other); }
|
|
simdutf_really_inline simd8<T> operator-(const simd8<T> other) const { return _mm256_sub_epi8(*this, other); }
|
|
simdutf_really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
|
|
simdutf_really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
|
|
|
|
// Override to distinguish from bool version
|
|
simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
|
|
|
|
// Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
|
|
template<typename L>
|
|
simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
|
|
return _mm256_shuffle_epi8(lookup_table, *this);
|
|
}
|
|
|
|
template<typename L>
|
|
simdutf_really_inline simd8<L> lookup_16(
|
|
L replace0, L replace1, L replace2, L replace3,
|
|
L replace4, L replace5, L replace6, L replace7,
|
|
L replace8, L replace9, L replace10, L replace11,
|
|
L replace12, L replace13, L replace14, L replace15) const {
|
|
return lookup_16(simd8<L>::repeat_16(
|
|
replace0, replace1, replace2, replace3,
|
|
replace4, replace5, replace6, replace7,
|
|
replace8, replace9, replace10, replace11,
|
|
replace12, replace13, replace14, replace15
|
|
));
|
|
}
|
|
};
|
|
|
|
|
|
// Signed bytes
|
|
template<>
|
|
struct simd8<int8_t> : base8_numeric<int8_t> {
|
|
simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
|
|
simdutf_really_inline simd8(const __m256i _value) : base8_numeric<int8_t>(_value) {}
|
|
|
|
// Splat constructor
|
|
simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdutf_really_inline simd8(const int8_t values[32]) : simd8(load(values)) {}
|
|
simdutf_really_inline operator simd8<uint8_t>() const;
|
|
// Member-by-member initialization
|
|
simdutf_really_inline simd8(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15,
|
|
int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
|
|
int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31
|
|
) : simd8(_mm256_setr_epi8(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v16,v17,v18,v19,v20,v21,v22,v23,
|
|
v24,v25,v26,v27,v28,v29,v30,v31
|
|
)) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdutf_really_inline static simd8<int8_t> repeat_16(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) {
|
|
return simd8<int8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
simdutf_really_inline bool is_ascii() const { return _mm256_movemask_epi8(*this) == 0; }
|
|
// Order-sensitive comparisons
|
|
simdutf_really_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return _mm256_max_epi8(*this, other); }
|
|
simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return _mm256_min_epi8(*this, other); }
|
|
simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(*this, other); }
|
|
simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(other, *this); }
|
|
};
|
|
|
|
// Unsigned bytes
|
|
template<>
|
|
struct simd8<uint8_t>: base8_numeric<uint8_t> {
|
|
simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
|
|
simdutf_really_inline simd8(const __m256i _value) : base8_numeric<uint8_t>(_value) {}
|
|
// Splat constructor
|
|
simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdutf_really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdutf_really_inline simd8(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
|
|
uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23,
|
|
uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31
|
|
) : simd8(_mm256_setr_epi8(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v16,v17,v18,v19,v20,v21,v22,v23,
|
|
v24,v25,v26,v27,v28,v29,v30,v31
|
|
)) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdutf_really_inline static simd8<uint8_t> repeat_16(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) {
|
|
return simd8<uint8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
|
|
// Saturated math
|
|
simdutf_really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm256_adds_epu8(*this, other); }
|
|
simdutf_really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm256_subs_epu8(*this, other); }
|
|
|
|
// Order-specific operations
|
|
simdutf_really_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return _mm256_max_epu8(*this, other); }
|
|
simdutf_really_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return _mm256_min_epu8(other, *this); }
|
|
// Same as >, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdutf_really_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
|
|
// Same as <, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdutf_really_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
|
|
simdutf_really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
|
|
simdutf_really_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
|
|
simdutf_really_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
|
|
simdutf_really_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->lt_bits(other).any_bits_set(); }
|
|
|
|
// Bit-specific operations
|
|
simdutf_really_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); }
|
|
simdutf_really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
|
|
simdutf_really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
|
|
simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
|
|
simdutf_really_inline bool is_ascii() const { return _mm256_movemask_epi8(*this) == 0; }
|
|
simdutf_really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); }
|
|
simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
|
|
simdutf_really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm256_testz_si256(*this, bits); }
|
|
simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
|
|
template<int N>
|
|
simdutf_really_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); }
|
|
template<int N>
|
|
simdutf_really_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm256_slli_epi16(*this, N)) & uint8_t(0xFFu << N); }
|
|
// Get one of the bits and make a bitmask out of it.
|
|
// e.g. value.get_bit<7>() gets the high bit
|
|
template<int N>
|
|
simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7-N)); }
|
|
};
|
|
// Signed -> unsigned conversion: hands over the same underlying register
// (this->value), so the bytes are reinterpreted rather than converted.
simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const { return this->value; }
|
|
|
|
|
|
template<typename T>
|
|
struct simd8x64 {
|
|
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
|
|
static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block.");
|
|
simd8<T> chunks[NUM_CHUNKS];
|
|
|
|
simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
|
|
simd8x64<T>& operator=(const simd8<T> other) = delete; // no assignment allowed
|
|
simd8x64() = delete; // no default constructor allowed
|
|
|
|
simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
|
|
simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T))} {}
|
|
|
|
simdutf_really_inline void store(T* ptr) const {
|
|
this->chunks[0].store(ptr+sizeof(simd8<T>)*0/sizeof(T));
|
|
this->chunks[1].store(ptr+sizeof(simd8<T>)*1/sizeof(T));
|
|
}
|
|
|
|
simdutf_really_inline uint64_t to_bitmask() const {
|
|
uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
|
|
uint64_t r_hi = this->chunks[1].to_bitmask();
|
|
return r_lo | (r_hi << 32);
|
|
}
|
|
|
|
simdutf_really_inline simd8x64<T>& operator|=(const simd8x64<T> &other) {
|
|
this->chunks[0] |= other.chunks[0];
|
|
this->chunks[1] |= other.chunks[1];
|
|
return *this;
|
|
}
|
|
|
|
simdutf_really_inline simd8<T> reduce_or() const {
|
|
return this->chunks[0] | this->chunks[1];
|
|
}
|
|
|
|
simdutf_really_inline bool is_ascii() const {
|
|
return this->reduce_or().is_ascii();
|
|
}
|
|
|
|
template <endianness endian>
|
|
simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
|
|
this->chunks[0].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*0);
|
|
this->chunks[1].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*1);
|
|
}
|
|
|
|
simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
|
|
this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*0);
|
|
this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*1);
|
|
}
|
|
|
|
simdutf_really_inline simd8x64<T> bit_or(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<T>(
|
|
this->chunks[0] | mask,
|
|
this->chunks[1] | mask
|
|
);
|
|
}
|
|
|
|
simdutf_really_inline uint64_t eq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == mask,
|
|
this->chunks[1] == mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == other.chunks[0],
|
|
this->chunks[1] == other.chunks[1]
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t lteq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] <= mask,
|
|
this->chunks[1] <= mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t in_range(const T low, const T high) const {
|
|
const simd8<T> mask_low = simd8<T>::splat(low);
|
|
const simd8<T> mask_high = simd8<T>::splat(high);
|
|
|
|
return simd8x64<bool>(
|
|
(this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
|
|
(this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
|
|
(this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
|
|
(this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
|
|
).to_bitmask();
|
|
}
|
|
simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
|
|
const simd8<T> mask_low = simd8<T>::splat(low);
|
|
const simd8<T> mask_high = simd8<T>::splat(high);
|
|
return simd8x64<bool>(
|
|
(this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
|
|
(this->chunks[1] > mask_high) | (this->chunks[1] < mask_low)
|
|
).to_bitmask();
|
|
}
|
|
simdutf_really_inline uint64_t lt(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] < mask,
|
|
this->chunks[1] < mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t gt(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] > mask,
|
|
this->chunks[1] > mask
|
|
).to_bitmask();
|
|
}
|
|
simdutf_really_inline uint64_t gteq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] >= mask,
|
|
this->chunks[1] >= mask
|
|
).to_bitmask();
|
|
}
|
|
simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
|
|
const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
|
|
return simd8x64<bool>(
|
|
(simd8<uint8_t>(__m256i(this->chunks[0])) >= mask),
|
|
(simd8<uint8_t>(__m256i(this->chunks[1])) >= mask)
|
|
).to_bitmask();
|
|
}
|
|
}; // struct simd8x64<T>
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/simd16-inl.h
|
|
/* begin file src/simdutf/haswell/simd16-inl.h */
|
|
// Fallback definitions of _mm256_set_m128i / _mm256_setr_m128i for compilers
// that report __GNUC__ < 8 (presumably because older GCC headers lack them).
// _mm256_permute2f128_si256(a, b, 2) places the low half of b in the low lane
// and the low half of a in the high lane, so:
//   _mm256_set_m128i(hi, lo)  -> { low lane = lo, high lane = hi }
//   _mm256_setr_m128i(lo, hi) -> { low lane = lo, high lane = hi }
#if defined(__GNUC__) && (__GNUC__ < 8)
#define _mm256_set_m128i(hi, lo) _mm256_permute2f128_si256(_mm256_castsi128_si256(hi), _mm256_castsi128_si256(lo), 2)
#define _mm256_setr_m128i(lo, hi) _mm256_permute2f128_si256(_mm256_castsi128_si256(hi), _mm256_castsi128_si256(lo), 2)
#endif
|
|
|
|
template<typename T>
|
|
struct simd16;
|
|
|
|
template<typename T, typename Mask=simd16<bool>>
|
|
struct base16: base<simd16<T>> {
|
|
using bitmask_type = uint32_t;
|
|
|
|
simdutf_really_inline base16() : base<simd16<T>>() {}
|
|
simdutf_really_inline base16(const __m256i _value) : base<simd16<T>>(_value) {}
|
|
template <typename Pointer>
|
|
simdutf_really_inline base16(const Pointer* ptr) : base16(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr))) {}
|
|
friend simdutf_really_inline Mask operator==(const simd16<T> lhs, const simd16<T> rhs) { return _mm256_cmpeq_epi16(lhs, rhs); }
|
|
|
|
/// the size of vector in bytes
|
|
static const int SIZE = sizeof(base<simd16<T>>::value);
|
|
|
|
/// the number of elements of type T a vector can hold
|
|
static const int ELEMENTS = SIZE / sizeof(T);
|
|
|
|
template<int N=1>
|
|
simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
|
|
return _mm256_alignr_epi8(*this, prev_chunk, 16 - N);
|
|
}
|
|
};
|
|
|
|
// SIMD byte mask type (returned by things like eq and gt)
|
|
template<>
|
|
struct simd16<bool>: base16<bool> {
|
|
static simdutf_really_inline simd16<bool> splat(bool _value) { return _mm256_set1_epi16(uint16_t(-(!!_value))); }
|
|
|
|
simdutf_really_inline simd16<bool>() : base16() {}
|
|
simdutf_really_inline simd16<bool>(const __m256i _value) : base16<bool>(_value) {}
|
|
// Splat constructor
|
|
simdutf_really_inline simd16<bool>(bool _value) : base16<bool>(splat(_value)) {}
|
|
|
|
simdutf_really_inline bitmask_type to_bitmask() const { return _mm256_movemask_epi8(*this); }
|
|
simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); }
|
|
simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
|
|
};
|
|
|
|
template<typename T>
|
|
struct base16_numeric: base16<T> {
|
|
static simdutf_really_inline simd16<T> splat(T _value) { return _mm256_set1_epi16(_value); }
|
|
static simdutf_really_inline simd16<T> zero() { return _mm256_setzero_si256(); }
|
|
static simdutf_really_inline simd16<T> load(const T values[8]) {
|
|
return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
|
|
}
|
|
|
|
simdutf_really_inline base16_numeric() : base16<T>() {}
|
|
simdutf_really_inline base16_numeric(const __m256i _value) : base16<T>(_value) {}
|
|
|
|
// Store to array
|
|
simdutf_really_inline void store(T dst[8]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); }
|
|
|
|
// Override to distinguish from bool version
|
|
simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFFFu; }
|
|
|
|
// Addition/subtraction are the same for signed and unsigned
|
|
simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return _mm256_add_epi16(*this, other); }
|
|
simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return _mm256_sub_epi16(*this, other); }
|
|
simdutf_really_inline simd16<T>& operator+=(const simd16<T> other) { *this = *this + other; return *static_cast<simd16<T>*>(this); }
|
|
simdutf_really_inline simd16<T>& operator-=(const simd16<T> other) { *this = *this - other; return *static_cast<simd16<T>*>(this); }
|
|
};
|
|
|
|
// Signed words
|
|
template<>
|
|
struct simd16<int16_t> : base16_numeric<int16_t> {
|
|
simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
|
|
simdutf_really_inline simd16(const __m256i _value) : base16_numeric<int16_t>(_value) {}
|
|
// Splat constructor
|
|
simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
|
|
// Array constructor
|
|
simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {}
|
|
simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const int16_t*>(values))) {}
|
|
// Order-sensitive comparisons
|
|
simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return _mm256_max_epi16(*this, other); }
|
|
simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return _mm256_min_epi16(*this, other); }
|
|
simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return _mm256_cmpgt_epi16(*this, other); }
|
|
simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return _mm256_cmpgt_epi16(other, *this); }
|
|
};
|
|
|
|
// Unsigned words
|
|
template<>
|
|
struct simd16<uint16_t>: base16_numeric<uint16_t> {
|
|
simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
|
|
simdutf_really_inline simd16(const __m256i _value) : base16_numeric<uint16_t>(_value) {}
|
|
|
|
// Splat constructor
|
|
simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
|
|
// Array constructor
|
|
simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {}
|
|
simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const uint16_t*>(values))) {}
|
|
|
|
// Saturated math
|
|
simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return _mm256_adds_epu16(*this, other); }
|
|
simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return _mm256_subs_epu16(*this, other); }
|
|
|
|
// Order-specific operations
|
|
simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return _mm256_max_epu16(*this, other); }
|
|
simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return _mm256_min_epu16(*this, other); }
|
|
// Same as >, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
|
|
// Same as <, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
|
|
simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return other.max_val(*this) == other; }
|
|
simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return other.min_val(*this) == other; }
|
|
simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
|
|
simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
|
|
|
|
// Bit-specific operations
|
|
simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
|
|
simdutf_really_inline simd16<bool> bits_not_set(simd16<uint16_t> bits) const { return (*this & bits).bits_not_set(); }
|
|
simdutf_really_inline simd16<bool> any_bits_set() const { return ~this->bits_not_set(); }
|
|
simdutf_really_inline simd16<bool> any_bits_set(simd16<uint16_t> bits) const { return ~this->bits_not_set(bits); }
|
|
|
|
simdutf_really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); }
|
|
simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
|
|
simdutf_really_inline bool bits_not_set_anywhere(simd16<uint16_t> bits) const { return _mm256_testz_si256(*this, bits); }
|
|
simdutf_really_inline bool any_bits_set_anywhere(simd16<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
|
|
template<int N>
|
|
simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(_mm256_srli_epi16(*this, N)); }
|
|
template<int N>
|
|
simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(_mm256_slli_epi16(*this, N)); }
|
|
// Get one of the bits and make a bitmask out of it.
|
|
// e.g. value.get_bit<7>() gets the high bit
|
|
template<int N>
|
|
simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 15-N)); }
|
|
|
|
// Change the endianness
|
|
simdutf_really_inline simd16<uint16_t> swap_bytes() const {
|
|
const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
|
|
17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
|
|
return _mm256_shuffle_epi8(*this, swap);
|
|
}
|
|
|
|
// Pack with the unsigned saturation two uint16_t words into single uint8_t vector
|
|
static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
|
|
// Note: the AVX2 variant of pack operates on 128-bit lanes, thus
|
|
// we have to shuffle lanes in order to produce bytes in the
|
|
// correct order.
|
|
|
|
// get the 0th lanes
|
|
const __m128i lo_0 = _mm256_extracti128_si256(v0, 0);
|
|
const __m128i lo_1 = _mm256_extracti128_si256(v1, 0);
|
|
|
|
// get the 1st lanes
|
|
const __m128i hi_0 = _mm256_extracti128_si256(v0, 1);
|
|
const __m128i hi_1 = _mm256_extracti128_si256(v1, 1);
|
|
|
|
// build new vectors (shuffle lanes)
|
|
const __m256i t0 = _mm256_set_m128i(lo_1, lo_0);
|
|
const __m256i t1 = _mm256_set_m128i(hi_1, hi_0);
|
|
|
|
// pack words in linear order from v0 and v1
|
|
return _mm256_packus_epi16(t0, t1);
|
|
}
|
|
};
|
|
|
|
|
|
template<typename T>
|
|
struct simd16x32 {
|
|
static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
|
|
static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block.");
|
|
simd16<T> chunks[NUM_CHUNKS];
|
|
|
|
simd16x32(const simd16x32<T>& o) = delete; // no copy allowed
|
|
simd16x32<T>& operator=(const simd16<T> other) = delete; // no assignment allowed
|
|
simd16x32() = delete; // no default constructor allowed
|
|
|
|
simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1) : chunks{chunk0, chunk1} {}
|
|
simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16<T>::load(ptr), simd16<T>::load(ptr+sizeof(simd16<T>)/sizeof(T))} {}
|
|
|
|
simdutf_really_inline void store(T* ptr) const {
|
|
this->chunks[0].store(ptr+sizeof(simd16<T>)*0/sizeof(T));
|
|
this->chunks[1].store(ptr+sizeof(simd16<T>)*1/sizeof(T));
|
|
}
|
|
|
|
simdutf_really_inline uint64_t to_bitmask() const {
|
|
uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
|
|
uint64_t r_hi = this->chunks[1].to_bitmask();
|
|
return r_lo | (r_hi << 32);
|
|
}
|
|
|
|
simdutf_really_inline simd16<T> reduce_or() const {
|
|
return this->chunks[0] | this->chunks[1];
|
|
}
|
|
|
|
simdutf_really_inline bool is_ascii() const {
|
|
return this->reduce_or().is_ascii();
|
|
}
|
|
|
|
simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
|
|
this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*0);
|
|
this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16<T>));
|
|
}
|
|
|
|
simdutf_really_inline simd16x32<T> bit_or(const T m) const {
|
|
const simd16<T> mask = simd16<T>::splat(m);
|
|
return simd16x32<T>(
|
|
this->chunks[0] | mask,
|
|
this->chunks[1] | mask
|
|
);
|
|
}
|
|
|
|
simdutf_really_inline void swap_bytes() {
|
|
this->chunks[0] = this->chunks[0].swap_bytes();
|
|
this->chunks[1] = this->chunks[1].swap_bytes();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t eq(const T m) const {
|
|
const simd16<T> mask = simd16<T>::splat(m);
|
|
return simd16x32<bool>(
|
|
this->chunks[0] == mask,
|
|
this->chunks[1] == mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t eq(const simd16x32<uint16_t> &other) const {
|
|
return simd16x32<bool>(
|
|
this->chunks[0] == other.chunks[0],
|
|
this->chunks[1] == other.chunks[1]
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t lteq(const T m) const {
|
|
const simd16<T> mask = simd16<T>::splat(m);
|
|
return simd16x32<bool>(
|
|
this->chunks[0] <= mask,
|
|
this->chunks[1] <= mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t in_range(const T low, const T high) const {
|
|
const simd16<T> mask_low = simd16<T>::splat(low);
|
|
const simd16<T> mask_high = simd16<T>::splat(high);
|
|
|
|
return simd16x32<bool>(
|
|
(this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
|
|
(this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
|
|
(this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
|
|
(this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
|
|
).to_bitmask();
|
|
}
|
|
simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
|
|
const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low-1));
|
|
const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high+1));
|
|
return simd16x32<bool>(
|
|
(this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
|
|
(this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low)
|
|
).to_bitmask();
|
|
}
|
|
simdutf_really_inline uint64_t lt(const T m) const {
|
|
const simd16<T> mask = simd16<T>::splat(m);
|
|
return simd16x32<bool>(
|
|
this->chunks[0] < mask,
|
|
this->chunks[1] < mask
|
|
).to_bitmask();
|
|
}
|
|
}; // struct simd16x32<T>
|
|
/* end file src/simdutf/haswell/simd16-inl.h */
|
|
|
|
} // namespace simd
|
|
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdutf
|
|
|
|
#endif // SIMDUTF_HASWELL_SIMD_H
|
|
/* end file src/simdutf/haswell/simd.h */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/end.h
|
|
/* begin file src/simdutf/haswell/end.h */
|
|
#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
|
|
// nothing needed.
|
|
#else
|
|
SIMDUTF_UNTARGET_REGION
|
|
#endif
|
|
|
|
|
|
#if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
|
|
SIMDUTF_POP_DISABLE_WARNINGS
|
|
#endif // end of workaround
|
|
/* end file src/simdutf/haswell/end.h */
|
|
|
|
#endif // SIMDUTF_IMPLEMENTATION_HASWELL
|
|
#endif // SIMDUTF_HASWELL_COMMON_H
|
|
/* end file src/simdutf/haswell.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere.h
|
|
/* begin file src/simdutf/westmere.h */
|
|
#ifndef SIMDUTF_WESTMERE_H
|
|
#define SIMDUTF_WESTMERE_H
|
|
|
|
#ifdef SIMDUTF_FALLBACK_H
|
|
#error "westmere.h must be included before fallback.h"
|
|
#endif
|
|
|
|
|
|
// Westmere is enabled by default on x86-64 unless a newer kernel (Icelake or
// Haswell) is already guaranteed to run on the target, in which case it is
// compiled out. A value supplied by the build system always wins.
#ifndef SIMDUTF_IMPLEMENTATION_WESTMERE
//
// Do not define this as (SIMDUTF_IS_X86_64 && !SIMDUTF_REQUIRES_HASWELL):
// the choice between kernels is meant to be made by runtime dispatch.
//
#if !(SIMDUTF_CAN_ALWAYS_RUN_ICELAKE || SIMDUTF_CAN_ALWAYS_RUN_HASWELL)
#define SIMDUTF_IMPLEMENTATION_WESTMERE (SIMDUTF_IS_X86_64)
#else
#define SIMDUTF_IMPLEMENTATION_WESTMERE 0
#endif

#endif

// Non-zero when the Westmere kernel is compiled in and the build itself
// already targets x86-64 with SSE4.2.
#define SIMDUTF_CAN_ALWAYS_RUN_WESTMERE (SIMDUTF_IMPLEMENTATION_WESTMERE && SIMDUTF_IS_X86_64 && __SSE4_2__)
|
|
|
|
#if SIMDUTF_IMPLEMENTATION_WESTMERE
|
|
|
|
#define SIMDUTF_TARGET_WESTMERE SIMDUTF_TARGET_REGION("sse4.2,popcnt")
|
|
|
|
namespace simdutf {
|
|
/**
|
|
* Implementation for Westmere (Intel SSE4.2).
|
|
*/
|
|
namespace westmere {
|
|
} // namespace westmere
|
|
} // namespace simdutf
|
|
|
|
//
|
|
// These two need to be included outside SIMDUTF_TARGET_REGION
|
|
//
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/implementation.h
|
|
/* begin file src/simdutf/westmere/implementation.h */
|
|
#ifndef SIMDUTF_WESTMERE_IMPLEMENTATION_H
|
|
#define SIMDUTF_WESTMERE_IMPLEMENTATION_H
|
|
|
|
|
|
// The constructor may be executed on any host, so we take care not to use SIMDUTF_TARGET_REGION
|
|
namespace simdutf {
|
|
namespace westmere {
|
|
|
|
namespace {
|
|
using namespace simdutf;
|
|
}
|
|
|
|
class implementation final : public simdutf::implementation {
|
|
public:
|
|
simdutf_really_inline implementation() : simdutf::implementation("westmere", "Intel/AMD SSE4.2", internal::instruction_set::SSE42) {}
|
|
simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
|
|
simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
|
|
simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
|
|
simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
|
|
simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
|
|
simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
|
|
};
|
|
|
|
} // namespace westmere
|
|
} // namespace simdutf
|
|
|
|
#endif // SIMDUTF_WESTMERE_IMPLEMENTATION_H
|
|
/* end file src/simdutf/westmere/implementation.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/intrinsics.h
|
|
/* begin file src/simdutf/westmere/intrinsics.h */
|
|
#ifndef SIMDUTF_WESTMERE_INTRINSICS_H
|
|
#define SIMDUTF_WESTMERE_INTRINSICS_H
|
|
|
|
#ifdef SIMDUTF_VISUAL_STUDIO
|
|
// under clang within visual studio, this will include <x86intrin.h>
|
|
#include <intrin.h> // visual studio or clang
|
|
#else
|
|
|
|
#if SIMDUTF_GCC11ORMORE
|
|
// We should not get warnings while including <x86intrin.h> yet we do
|
|
// under some versions of GCC.
|
|
// If the x86intrin.h header has uninitialized values that are problematic,
|
|
// it is a GCC issue, we want to ignore these warnings.
|
|
SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
|
|
#endif
|
|
|
|
#include <x86intrin.h> // elsewhere
|
|
|
|
|
|
#if SIMDUTF_GCC11ORMORE
|
|
// cancels the suppression of the -Wuninitialized
|
|
SIMDUTF_POP_DISABLE_WARNINGS
|
|
#endif
|
|
|
|
#endif // SIMDUTF_VISUAL_STUDIO
|
|
|
|
|
|
#ifdef SIMDUTF_CLANG_VISUAL_STUDIO
|
|
/**
|
|
* You are not supposed, normally, to include these
|
|
* headers directly. Instead you should either include intrin.h
|
|
* or x86intrin.h. However, when compiling with clang
|
|
* under Windows (i.e., when _MSC_VER is set), these headers
|
|
* only get included *if* the corresponding features are detected
|
|
* from macros:
|
|
*/
|
|
#include <smmintrin.h> // for _mm_alignr_epi8
|
|
#endif
|
|
|
|
|
|
|
|
#endif // SIMDUTF_WESTMERE_INTRINSICS_H
|
|
/* end file src/simdutf/westmere/intrinsics.h */
|
|
|
|
//
|
|
// The rest need to be inside the region
|
|
//
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/begin.h
|
|
/* begin file src/simdutf/westmere/begin.h */
|
|
// redefining SIMDUTF_IMPLEMENTATION to "westmere"
|
|
// #define SIMDUTF_IMPLEMENTATION westmere
|
|
|
|
#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
|
|
// nothing needed.
|
|
#else
|
|
SIMDUTF_TARGET_WESTMERE
|
|
#endif
|
|
/* end file src/simdutf/westmere/begin.h */
|
|
|
|
// Declarations
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/bitmanipulation.h
|
|
/* begin file src/simdutf/westmere/bitmanipulation.h */
|
|
#ifndef SIMDUTF_WESTMERE_BITMANIPULATION_H
|
|
#define SIMDUTF_WESTMERE_BITMANIPULATION_H
|
|
|
|
namespace simdutf {
|
|
namespace westmere {
|
|
namespace {
|
|
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
|
|
// note: we do not support legacy 32-bit Windows
|
|
return __popcnt64(input_num);// Visual Studio wants two underscores
|
|
}
|
|
#else
|
|
simdutf_really_inline long long int count_ones(uint64_t input_num) {
|
|
return _popcnt64(input_num);
|
|
}
|
|
#endif
|
|
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdutf
|
|
|
|
#endif // SIMDUTF_WESTMERE_BITMANIPULATION_H
|
|
/* end file src/simdutf/westmere/bitmanipulation.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/simd.h
|
|
/* begin file src/simdutf/westmere/simd.h */
|
|
#ifndef SIMDUTF_WESTMERE_SIMD_H
|
|
#define SIMDUTF_WESTMERE_SIMD_H
|
|
|
|
namespace simdutf {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace simd {
|
|
|
|
template<typename Child>
|
|
struct base {
|
|
__m128i value;
|
|
|
|
// Zero constructor
|
|
simdutf_really_inline base() : value{__m128i()} {}
|
|
|
|
// Conversion from SIMD register
|
|
simdutf_really_inline base(const __m128i _value) : value(_value) {}
|
|
// Conversion to SIMD register
|
|
simdutf_really_inline operator const __m128i&() const { return this->value; }
|
|
simdutf_really_inline operator __m128i&() { return this->value; }
|
|
template <endianness big_endian>
|
|
simdutf_really_inline void store_ascii_as_utf16(char16_t * p) const {
|
|
__m128i first = _mm_cvtepu8_epi16(*this);
|
|
__m128i second = _mm_cvtepu8_epi16(_mm_srli_si128(*this,8));
|
|
if (big_endian) {
|
|
const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
first = _mm_shuffle_epi8(first, swap);
|
|
second = _mm_shuffle_epi8(second, swap);
|
|
}
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(p), first);
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(p+8), second);
|
|
}
|
|
simdutf_really_inline void store_ascii_as_utf32(char32_t * p) const {
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(p), _mm_cvtepu8_epi32(*this));
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(p+4), _mm_cvtepu8_epi32(_mm_srli_si128(*this,4)));
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(p+8), _mm_cvtepu8_epi32(_mm_srli_si128(*this,8)));
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(p+12), _mm_cvtepu8_epi32(_mm_srli_si128(*this,12)));
|
|
}
|
|
// Bit operations
|
|
simdutf_really_inline Child operator|(const Child other) const { return _mm_or_si128(*this, other); }
|
|
simdutf_really_inline Child operator&(const Child other) const { return _mm_and_si128(*this, other); }
|
|
simdutf_really_inline Child operator^(const Child other) const { return _mm_xor_si128(*this, other); }
|
|
simdutf_really_inline Child bit_andnot(const Child other) const { return _mm_andnot_si128(other, *this); }
|
|
simdutf_really_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
|
|
simdutf_really_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
|
|
simdutf_really_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
|
|
};
|
|
|
|
// Forward-declared so they can be used by splat and friends.
|
|
template<typename T>
|
|
struct simd8;
|
|
|
|
template<typename T, typename Mask=simd8<bool>>
|
|
struct base8: base<simd8<T>> {
|
|
typedef uint16_t bitmask_t;
|
|
typedef uint32_t bitmask2_t;
|
|
|
|
simdutf_really_inline T first() const { return _mm_extract_epi8(*this,0); }
|
|
simdutf_really_inline T last() const { return _mm_extract_epi8(*this,15); }
|
|
simdutf_really_inline base8() : base<simd8<T>>() {}
|
|
simdutf_really_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
|
|
|
|
friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return _mm_cmpeq_epi8(lhs, rhs); }
|
|
|
|
static const int SIZE = sizeof(base<simd8<T>>::value);
|
|
|
|
template<int N=1>
|
|
simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
|
|
return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
|
|
}
|
|
};
|
|
|
|
// SIMD byte mask type (returned by things like eq and gt)
|
|
template<>
|
|
struct simd8<bool>: base8<bool> {
|
|
static simdutf_really_inline simd8<bool> splat(bool _value) { return _mm_set1_epi8(uint8_t(-(!!_value))); }
|
|
|
|
simdutf_really_inline simd8<bool>() : base8() {}
|
|
simdutf_really_inline simd8<bool>(const __m128i _value) : base8<bool>(_value) {}
|
|
// Splat constructor
|
|
simdutf_really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {}
|
|
|
|
simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); }
|
|
simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); }
|
|
simdutf_really_inline bool none() const { return _mm_testz_si128(*this, *this); }
|
|
simdutf_really_inline bool all() const { return _mm_movemask_epi8(*this) == 0xFFFF; }
|
|
simdutf_really_inline simd8<bool> operator~() const { return *this ^ true; }
|
|
};
|
|
|
|
template<typename T>
|
|
struct base8_numeric: base8<T> {
|
|
static simdutf_really_inline simd8<T> splat(T _value) { return _mm_set1_epi8(_value); }
|
|
static simdutf_really_inline simd8<T> zero() { return _mm_setzero_si128(); }
|
|
static simdutf_really_inline simd8<T> load(const T values[16]) {
|
|
return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
|
|
}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
static simdutf_really_inline simd8<T> repeat_16(
|
|
T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
|
|
T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
|
|
) {
|
|
return simd8<T>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
simdutf_really_inline base8_numeric() : base8<T>() {}
|
|
simdutf_really_inline base8_numeric(const __m128i _value) : base8<T>(_value) {}
|
|
|
|
// Store to array
|
|
simdutf_really_inline void store(T dst[16]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); }
|
|
|
|
// Override to distinguish from bool version
|
|
simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
|
|
|
|
// Addition/subtraction are the same for signed and unsigned
|
|
simdutf_really_inline simd8<T> operator+(const simd8<T> other) const { return _mm_add_epi8(*this, other); }
|
|
simdutf_really_inline simd8<T> operator-(const simd8<T> other) const { return _mm_sub_epi8(*this, other); }
|
|
simdutf_really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
|
|
simdutf_really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
|
|
|
|
// Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
|
|
template<typename L>
|
|
simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
|
|
return _mm_shuffle_epi8(lookup_table, *this);
|
|
}
|
|
|
|
template<typename L>
|
|
simdutf_really_inline simd8<L> lookup_16(
|
|
L replace0, L replace1, L replace2, L replace3,
|
|
L replace4, L replace5, L replace6, L replace7,
|
|
L replace8, L replace9, L replace10, L replace11,
|
|
L replace12, L replace13, L replace14, L replace15) const {
|
|
return lookup_16(simd8<L>::repeat_16(
|
|
replace0, replace1, replace2, replace3,
|
|
replace4, replace5, replace6, replace7,
|
|
replace8, replace9, replace10, replace11,
|
|
replace12, replace13, replace14, replace15
|
|
));
|
|
}
|
|
};
|
|
|
|
// Signed bytes
|
|
template<>
|
|
struct simd8<int8_t> : base8_numeric<int8_t> {
|
|
simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
|
|
simdutf_really_inline simd8(const __m128i _value) : base8_numeric<int8_t>(_value) {}
|
|
// Splat constructor
|
|
simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdutf_really_inline simd8(const int8_t* values) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdutf_really_inline simd8(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) : simd8(_mm_setr_epi8(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
)) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdutf_really_inline static simd8<int8_t> repeat_16(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) {
|
|
return simd8<int8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
simdutf_really_inline operator simd8<uint8_t>() const;
|
|
simdutf_really_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; }
|
|
|
|
// Order-sensitive comparisons
|
|
simdutf_really_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return _mm_max_epi8(*this, other); }
|
|
simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return _mm_min_epi8(*this, other); }
|
|
simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(*this, other); }
|
|
simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(other, *this); }
|
|
};
|
|
|
|
// Unsigned bytes
|
|
template<>
|
|
struct simd8<uint8_t>: base8_numeric<uint8_t> {
|
|
simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
|
|
simdutf_really_inline simd8(const __m128i _value) : base8_numeric<uint8_t>(_value) {}
|
|
|
|
// Splat constructor
|
|
simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdutf_really_inline simd8(const uint8_t* values) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdutf_really_inline simd8(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) : simd8(_mm_setr_epi8(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
)) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdutf_really_inline static simd8<uint8_t> repeat_16(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) {
|
|
return simd8<uint8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Saturated math
|
|
simdutf_really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm_adds_epu8(*this, other); }
|
|
simdutf_really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm_subs_epu8(*this, other); }
|
|
|
|
// Order-specific operations
|
|
simdutf_really_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return _mm_max_epu8(*this, other); }
|
|
simdutf_really_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return _mm_min_epu8(*this, other); }
|
|
// Same as >, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdutf_really_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
|
|
// Same as <, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdutf_really_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
|
|
simdutf_really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
|
|
simdutf_really_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
|
|
simdutf_really_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
|
|
simdutf_really_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
|
|
|
|
// Bit-specific operations
|
|
simdutf_really_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); }
|
|
simdutf_really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
|
|
simdutf_really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
|
|
simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
|
|
simdutf_really_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; }
|
|
|
|
simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); }
|
|
simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
|
|
simdutf_really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm_testz_si128(*this, bits); }
|
|
simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
|
|
template<int N>
|
|
simdutf_really_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); }
|
|
template<int N>
|
|
simdutf_really_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N); }
|
|
// Get one of the bits and make a bitmask out of it.
|
|
// e.g. value.get_bit<7>() gets the high bit
|
|
template<int N>
|
|
simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); }
|
|
};
|
|
// Signed-to-unsigned view: the 128 bits are passed through unchanged.
simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const {
  return simd8<uint8_t>(this->value);
}
|
|
|
|
// Unsigned bytes
|
|
template<>
|
|
struct simd8<uint16_t>: base<uint16_t> {
|
|
static simdutf_really_inline simd8<uint16_t> splat(uint16_t _value) { return _mm_set1_epi16(_value); }
|
|
static simdutf_really_inline simd8<uint16_t> load(const uint16_t values[8]) {
|
|
return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
|
|
}
|
|
|
|
simdutf_really_inline simd8() : base<uint16_t>() {}
|
|
simdutf_really_inline simd8(const __m128i _value) : base<uint16_t>(_value) {}
|
|
// Splat constructor
|
|
simdutf_really_inline simd8(uint16_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdutf_really_inline simd8(const uint16_t* values) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdutf_really_inline simd8(
|
|
uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7
|
|
) : simd8(_mm_setr_epi16(
|
|
v0, v1, v2, v3, v4, v5, v6, v7
|
|
)) {}
|
|
|
|
// Saturated math
|
|
simdutf_really_inline simd8<uint16_t> saturating_add(const simd8<uint16_t> other) const { return _mm_adds_epu16(*this, other); }
|
|
simdutf_really_inline simd8<uint16_t> saturating_sub(const simd8<uint16_t> other) const { return _mm_subs_epu16(*this, other); }
|
|
|
|
// Order-specific operations
|
|
simdutf_really_inline simd8<uint16_t> max_val(const simd8<uint16_t> other) const { return _mm_max_epu16(*this, other); }
|
|
simdutf_really_inline simd8<uint16_t> min_val(const simd8<uint16_t> other) const { return _mm_min_epu16(*this, other); }
|
|
// Same as >, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdutf_really_inline simd8<uint16_t> gt_bits(const simd8<uint16_t> other) const { return this->saturating_sub(other); }
|
|
// Same as <, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdutf_really_inline simd8<uint16_t> lt_bits(const simd8<uint16_t> other) const { return other.saturating_sub(*this); }
|
|
simdutf_really_inline simd8<bool> operator<=(const simd8<uint16_t> other) const { return other.max_val(*this) == other; }
|
|
simdutf_really_inline simd8<bool> operator>=(const simd8<uint16_t> other) const { return other.min_val(*this) == other; }
|
|
simdutf_really_inline simd8<bool> operator==(const simd8<uint16_t> other) const { return _mm_cmpeq_epi16(*this, other); }
|
|
simdutf_really_inline simd8<bool> operator&(const simd8<uint16_t> other) const { return _mm_and_si128(*this, other); }
|
|
simdutf_really_inline simd8<bool> operator|(const simd8<uint16_t> other) const { return _mm_or_si128(*this, other); }
|
|
|
|
// Bit-specific operations
|
|
simdutf_really_inline simd8<bool> bits_not_set() const { return *this == uint16_t(0); }
|
|
simdutf_really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
|
|
|
|
simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); }
|
|
simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
|
|
simdutf_really_inline bool bits_not_set_anywhere(simd8<uint16_t> bits) const { return _mm_testz_si128(*this, bits); }
|
|
simdutf_really_inline bool any_bits_set_anywhere(simd8<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
|
|
};
|
|
template<typename T>
|
|
struct simd8x64 {
|
|
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
|
|
static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block.");
|
|
simd8<T> chunks[NUM_CHUNKS];
|
|
|
|
simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
|
|
simd8x64<T>& operator=(const simd8<T> other) = delete; // no assignment allowed
|
|
simd8x64() = delete; // no default constructor allowed
|
|
|
|
simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
|
|
simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+2*sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+3*sizeof(simd8<T>)/sizeof(T))} {}
|
|
|
|
simdutf_really_inline void store(T* ptr) const {
|
|
this->chunks[0].store(ptr+sizeof(simd8<T>)*0/sizeof(T));
|
|
this->chunks[1].store(ptr+sizeof(simd8<T>)*1/sizeof(T));
|
|
this->chunks[2].store(ptr+sizeof(simd8<T>)*2/sizeof(T));
|
|
this->chunks[3].store(ptr+sizeof(simd8<T>)*3/sizeof(T));
|
|
}
|
|
|
|
simdutf_really_inline simd8x64<T>& operator |=(const simd8x64<T> &other) {
|
|
this->chunks[0] |= other.chunks[0];
|
|
this->chunks[1] |= other.chunks[1];
|
|
this->chunks[2] |= other.chunks[2];
|
|
this->chunks[3] |= other.chunks[3];
|
|
return *this;
|
|
}
|
|
|
|
simdutf_really_inline simd8<T> reduce_or() const {
|
|
return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
|
|
}
|
|
|
|
simdutf_really_inline bool is_ascii() const {
|
|
return this->reduce_or().is_ascii();
|
|
}
|
|
|
|
template <endianness endian>
|
|
simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
|
|
this->chunks[0].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*0);
|
|
this->chunks[1].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*1);
|
|
this->chunks[2].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*2);
|
|
this->chunks[3].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*3);
|
|
}
|
|
|
|
simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
|
|
this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*0);
|
|
this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*1);
|
|
this->chunks[2].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*2);
|
|
this->chunks[3].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*3);
|
|
}
|
|
|
|
simdutf_really_inline uint64_t to_bitmask() const {
|
|
uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() );
|
|
uint64_t r1 = this->chunks[1].to_bitmask() ;
|
|
uint64_t r2 = this->chunks[2].to_bitmask() ;
|
|
uint64_t r3 = this->chunks[3].to_bitmask() ;
|
|
return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
|
|
}
|
|
|
|
simdutf_really_inline uint64_t eq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == mask,
|
|
this->chunks[1] == mask,
|
|
this->chunks[2] == mask,
|
|
this->chunks[3] == mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == other.chunks[0],
|
|
this->chunks[1] == other.chunks[1],
|
|
this->chunks[2] == other.chunks[2],
|
|
this->chunks[3] == other.chunks[3]
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t lteq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] <= mask,
|
|
this->chunks[1] <= mask,
|
|
this->chunks[2] <= mask,
|
|
this->chunks[3] <= mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t in_range(const T low, const T high) const {
|
|
const simd8<T> mask_low = simd8<T>::splat(low);
|
|
const simd8<T> mask_high = simd8<T>::splat(high);
|
|
|
|
return simd8x64<bool>(
|
|
(this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
|
|
(this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
|
|
(this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
|
|
(this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
|
|
).to_bitmask();
|
|
}
|
|
simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
|
|
const simd8<T> mask_low = simd8<T>::splat(low-1);
|
|
const simd8<T> mask_high = simd8<T>::splat(high+1);
|
|
return simd8x64<bool>(
|
|
(this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
|
|
(this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low),
|
|
(this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low),
|
|
(this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low)
|
|
).to_bitmask();
|
|
}
|
|
simdutf_really_inline uint64_t lt(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] < mask,
|
|
this->chunks[1] < mask,
|
|
this->chunks[2] < mask,
|
|
this->chunks[3] < mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t gt(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] > mask,
|
|
this->chunks[1] > mask,
|
|
this->chunks[2] > mask,
|
|
this->chunks[3] > mask
|
|
).to_bitmask();
|
|
}
|
|
simdutf_really_inline uint64_t gteq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] >= mask,
|
|
this->chunks[1] >= mask,
|
|
this->chunks[2] >= mask,
|
|
this->chunks[3] >= mask
|
|
).to_bitmask();
|
|
}
|
|
simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
|
|
const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
|
|
return simd8x64<bool>(
|
|
simd8<uint8_t>(__m128i(this->chunks[0])) >= mask,
|
|
simd8<uint8_t>(__m128i(this->chunks[1])) >= mask,
|
|
simd8<uint8_t>(__m128i(this->chunks[2])) >= mask,
|
|
simd8<uint8_t>(__m128i(this->chunks[3])) >= mask
|
|
).to_bitmask();
|
|
}
|
|
}; // struct simd8x64<T>
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/simd16-inl.h
|
|
/* begin file src/simdutf/westmere/simd16-inl.h */
|
|
template<typename T>
|
|
struct simd16;
|
|
|
|
template<typename T, typename Mask=simd16<bool>>
|
|
struct base16: base<simd16<T>> {
|
|
typedef uint16_t bitmask_t;
|
|
typedef uint32_t bitmask2_t;
|
|
|
|
simdutf_really_inline base16() : base<simd16<T>>() {}
|
|
simdutf_really_inline base16(const __m128i _value) : base<simd16<T>>(_value) {}
|
|
template <typename Pointer>
|
|
simdutf_really_inline base16(const Pointer* ptr) : base16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr))) {}
|
|
|
|
friend simdutf_really_inline Mask operator==(const simd16<T> lhs, const simd16<T> rhs) { return _mm_cmpeq_epi16(lhs, rhs); }
|
|
|
|
static const int SIZE = sizeof(base<simd16<T>>::value);
|
|
|
|
template<int N=1>
|
|
simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
|
|
return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
|
|
}
|
|
};
|
|
|
|
// SIMD byte mask type (returned by things like eq and gt)
|
|
template<>
|
|
struct simd16<bool>: base16<bool> {
|
|
static simdutf_really_inline simd16<bool> splat(bool _value) { return _mm_set1_epi16(uint16_t(-(!!_value))); }
|
|
|
|
simdutf_really_inline simd16<bool>() : base16() {}
|
|
simdutf_really_inline simd16<bool>(const __m128i _value) : base16<bool>(_value) {}
|
|
// Splat constructor
|
|
simdutf_really_inline simd16<bool>(bool _value) : base16<bool>(splat(_value)) {}
|
|
|
|
simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); }
|
|
simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); }
|
|
simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
|
|
};
|
|
|
|
template<typename T>
|
|
struct base16_numeric: base16<T> {
|
|
static simdutf_really_inline simd16<T> splat(T _value) { return _mm_set1_epi16(_value); }
|
|
static simdutf_really_inline simd16<T> zero() { return _mm_setzero_si128(); }
|
|
static simdutf_really_inline simd16<T> load(const T values[8]) {
|
|
return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
|
|
}
|
|
|
|
simdutf_really_inline base16_numeric() : base16<T>() {}
|
|
simdutf_really_inline base16_numeric(const __m128i _value) : base16<T>(_value) {}
|
|
|
|
// Store to array
|
|
simdutf_really_inline void store(T dst[8]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); }
|
|
|
|
// Override to distinguish from bool version
|
|
simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }
|
|
|
|
// Addition/subtraction are the same for signed and unsigned
|
|
simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return _mm_add_epi16(*this, other); }
|
|
simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return _mm_sub_epi16(*this, other); }
|
|
simdutf_really_inline simd16<T>& operator+=(const simd16<T> other) { *this = *this + other; return *static_cast<simd16<T>*>(this); }
|
|
simdutf_really_inline simd16<T>& operator-=(const simd16<T> other) { *this = *this - other; return *static_cast<simd16<T>*>(this); }
|
|
};
|
|
|
|
// Signed words
|
|
template<>
|
|
struct simd16<int16_t> : base16_numeric<int16_t> {
|
|
simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
|
|
simdutf_really_inline simd16(const __m128i _value) : base16_numeric<int16_t>(_value) {}
|
|
// Splat constructor
|
|
simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
|
|
// Array constructor
|
|
simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {}
|
|
simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const int16_t*>(values))) {}
|
|
// Member-by-member initialization
|
|
simdutf_really_inline simd16(
|
|
int16_t v0, int16_t v1, int16_t v2, int16_t v3, int16_t v4, int16_t v5, int16_t v6, int16_t v7)
|
|
: simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {}
|
|
simdutf_really_inline operator simd16<uint16_t>() const;
|
|
|
|
// Order-sensitive comparisons
|
|
simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return _mm_max_epi16(*this, other); }
|
|
simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return _mm_min_epi16(*this, other); }
|
|
simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return _mm_cmpgt_epi16(*this, other); }
|
|
simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return _mm_cmpgt_epi16(other, *this); }
|
|
};
|
|
|
|
// Unsigned words
|
|
template<>
|
|
struct simd16<uint16_t>: base16_numeric<uint16_t> {
|
|
simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
|
|
simdutf_really_inline simd16(const __m128i _value) : base16_numeric<uint16_t>(_value) {}
|
|
|
|
// Splat constructor
|
|
simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
|
|
// Array constructor
|
|
simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {}
|
|
simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const uint16_t*>(values))) {}
|
|
// Member-by-member initialization
|
|
simdutf_really_inline simd16(
|
|
uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7)
|
|
: simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdutf_really_inline static simd16<uint16_t> repeat_16(
|
|
uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7
|
|
) {
|
|
return simd16<uint16_t>(v0, v1, v2, v3, v4, v5, v6, v7);
|
|
}
|
|
|
|
// Saturated math
|
|
simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return _mm_adds_epu16(*this, other); }
|
|
simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return _mm_subs_epu16(*this, other); }
|
|
|
|
// Order-specific operations
|
|
simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return _mm_max_epu16(*this, other); }
|
|
simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return _mm_min_epu16(*this, other); }
|
|
// Same as >, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
|
|
// Same as <, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
|
|
simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return other.max_val(*this) == other; }
|
|
simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return other.min_val(*this) == other; }
|
|
simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
|
|
simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
|
|
|
|
// Bit-specific operations
|
|
simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
|
|
simdutf_really_inline simd16<bool> bits_not_set(simd16<uint16_t> bits) const { return (*this & bits).bits_not_set(); }
|
|
simdutf_really_inline simd16<bool> any_bits_set() const { return ~this->bits_not_set(); }
|
|
simdutf_really_inline simd16<bool> any_bits_set(simd16<uint16_t> bits) const { return ~this->bits_not_set(bits); }
|
|
|
|
simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); }
|
|
simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
|
|
simdutf_really_inline bool bits_not_set_anywhere(simd16<uint16_t> bits) const { return _mm_testz_si128(*this, bits); }
|
|
simdutf_really_inline bool any_bits_set_anywhere(simd16<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
|
|
template<int N>
|
|
simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(_mm_srli_epi16(*this, N)); }
|
|
template<int N>
|
|
simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(_mm_slli_epi16(*this, N)); }
|
|
// Get one of the bits and make a bitmask out of it.
|
|
// e.g. value.get_bit<7>() gets the high bit
|
|
template<int N>
|
|
simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); }
|
|
|
|
// Change the endianness
|
|
simdutf_really_inline simd16<uint16_t> swap_bytes() const {
|
|
const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
return _mm_shuffle_epi8(*this, swap);
|
|
}
|
|
|
|
// Pack with the unsigned saturation two uint16_t words into single uint8_t vector
|
|
static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
|
|
return _mm_packus_epi16(v0, v1);
|
|
}
|
|
};
|
|
// Signed -> unsigned conversion: the stored register is handed to the
// simd16<uint16_t> constructor as-is.
simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const {
  return simd16<uint16_t>(this->value);
}
|
|
|
|
template<typename T>
|
|
struct simd16x32 {
|
|
static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
|
|
static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block.");
|
|
simd16<T> chunks[NUM_CHUNKS];
|
|
|
|
simd16x32(const simd16x32<T>& o) = delete; // no copy allowed
|
|
simd16x32<T>& operator=(const simd16<T> other) = delete; // no assignment allowed
|
|
simd16x32() = delete; // no default constructor allowed
|
|
|
|
simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1, const simd16<T> chunk2, const simd16<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
|
|
simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16<T>::load(ptr), simd16<T>::load(ptr+sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+2*sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+3*sizeof(simd16<T>)/sizeof(T))} {}
|
|
|
|
simdutf_really_inline void store(T* ptr) const {
|
|
this->chunks[0].store(ptr+sizeof(simd16<T>)*0/sizeof(T));
|
|
this->chunks[1].store(ptr+sizeof(simd16<T>)*1/sizeof(T));
|
|
this->chunks[2].store(ptr+sizeof(simd16<T>)*2/sizeof(T));
|
|
this->chunks[3].store(ptr+sizeof(simd16<T>)*3/sizeof(T));
|
|
}
|
|
|
|
simdutf_really_inline simd16<T> reduce_or() const {
|
|
return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
|
|
}
|
|
|
|
simdutf_really_inline bool is_ascii() const {
|
|
return this->reduce_or().is_ascii();
|
|
}
|
|
|
|
simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
|
|
this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*0);
|
|
this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*1);
|
|
this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*2);
|
|
this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*3);
|
|
}
|
|
|
|
simdutf_really_inline uint64_t to_bitmask() const {
|
|
uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() );
|
|
uint64_t r1 = this->chunks[1].to_bitmask() ;
|
|
uint64_t r2 = this->chunks[2].to_bitmask() ;
|
|
uint64_t r3 = this->chunks[3].to_bitmask() ;
|
|
return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
|
|
}
|
|
|
|
simdutf_really_inline void swap_bytes() {
|
|
this->chunks[0] = this->chunks[0].swap_bytes();
|
|
this->chunks[1] = this->chunks[1].swap_bytes();
|
|
this->chunks[2] = this->chunks[2].swap_bytes();
|
|
this->chunks[3] = this->chunks[3].swap_bytes();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t eq(const T m) const {
|
|
const simd16<T> mask = simd16<T>::splat(m);
|
|
return simd16x32<bool>(
|
|
this->chunks[0] == mask,
|
|
this->chunks[1] == mask,
|
|
this->chunks[2] == mask,
|
|
this->chunks[3] == mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t eq(const simd16x32<uint16_t> &other) const {
|
|
return simd16x32<bool>(
|
|
this->chunks[0] == other.chunks[0],
|
|
this->chunks[1] == other.chunks[1],
|
|
this->chunks[2] == other.chunks[2],
|
|
this->chunks[3] == other.chunks[3]
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t lteq(const T m) const {
|
|
const simd16<T> mask = simd16<T>::splat(m);
|
|
return simd16x32<bool>(
|
|
this->chunks[0] <= mask,
|
|
this->chunks[1] <= mask,
|
|
this->chunks[2] <= mask,
|
|
this->chunks[3] <= mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t in_range(const T low, const T high) const {
|
|
const simd16<T> mask_low = simd16<T>::splat(low);
|
|
const simd16<T> mask_high = simd16<T>::splat(high);
|
|
|
|
return simd16x32<bool>(
|
|
(this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
|
|
(this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
|
|
(this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
|
|
(this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
|
|
).to_bitmask();
|
|
}
|
|
simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
|
|
const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low-1));
|
|
const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high+1));
|
|
return simd16x32<bool>(
|
|
(this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
|
|
(this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low),
|
|
(this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low),
|
|
(this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low)
|
|
).to_bitmask();
|
|
}
|
|
simdutf_really_inline uint64_t lt(const T m) const {
|
|
const simd16<T> mask = simd16<T>::splat(m);
|
|
return simd16x32<bool>(
|
|
this->chunks[0] < mask,
|
|
this->chunks[1] < mask,
|
|
this->chunks[2] < mask,
|
|
this->chunks[3] < mask
|
|
).to_bitmask();
|
|
}
|
|
}; // struct simd16x32<T>
|
|
/* end file src/simdutf/westmere/simd16-inl.h */
|
|
|
|
} // namespace simd
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdutf
|
|
|
|
#endif // SIMDUTF_WESTMERE_SIMD_INPUT_H
|
|
/* end file src/simdutf/westmere/simd.h */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/end.h
|
|
/* begin file src/simdutf/westmere/end.h */
|
|
#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
|
|
// nothing needed.
|
|
#else
|
|
SIMDUTF_UNTARGET_REGION
|
|
#endif
|
|
|
|
/* end file src/simdutf/westmere/end.h */
|
|
|
|
#endif // SIMDUTF_IMPLEMENTATION_WESTMERE
|
|
#endif // SIMDUTF_WESTMERE_COMMON_H
|
|
/* end file src/simdutf/westmere.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64.h
|
|
/* begin file src/simdutf/ppc64.h */
|
|
#ifndef SIMDUTF_PPC64_H
|
|
#define SIMDUTF_PPC64_H
|
|
|
|
#ifdef SIMDUTF_FALLBACK_H
|
|
#error "ppc64.h must be included before fallback.h"
|
|
#endif
|
|
|
|
|
|
#ifndef SIMDUTF_IMPLEMENTATION_PPC64
|
|
#define SIMDUTF_IMPLEMENTATION_PPC64 (SIMDUTF_IS_PPC64)
|
|
#endif
|
|
// Parenthesized so the expansion behaves as one operand, e.g. under `!` or
// when combined with `||`.
#define SIMDUTF_CAN_ALWAYS_RUN_PPC64 (SIMDUTF_IMPLEMENTATION_PPC64 && SIMDUTF_IS_PPC64)
|
|
|
|
|
|
|
|
#if SIMDUTF_IMPLEMENTATION_PPC64
|
|
|
|
namespace simdutf {
|
|
/**
|
|
* Implementation for ALTIVEC (PPC64).
|
|
*/
|
|
namespace ppc64 {
|
|
} // namespace ppc64
|
|
} // namespace simdutf
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/implementation.h
|
|
/* begin file src/simdutf/ppc64/implementation.h */
|
|
#ifndef SIMDUTF_PPC64_IMPLEMENTATION_H
|
|
#define SIMDUTF_PPC64_IMPLEMENTATION_H
|
|
|
|
|
|
namespace simdutf {
|
|
namespace ppc64 {
|
|
|
|
namespace {
|
|
using namespace simdutf;
|
|
} // namespace
|
|
|
|
class implementation final : public simdutf::implementation {
|
|
public:
|
|
simdutf_really_inline implementation()
|
|
: simdutf::implementation("ppc64", "PPC64 ALTIVEC",
|
|
internal::instruction_set::ALTIVEC) {}
|
|
simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
|
|
simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
|
|
simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
|
|
simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
|
|
simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
|
|
simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
|
|
};
|
|
|
|
} // namespace ppc64
|
|
} // namespace simdutf
|
|
|
|
#endif // SIMDUTF_PPC64_IMPLEMENTATION_H
|
|
/* end file src/simdutf/ppc64/implementation.h */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/begin.h
|
|
/* begin file src/simdutf/ppc64/begin.h */
|
|
// redefining SIMDUTF_IMPLEMENTATION to "ppc64"
|
|
// #define SIMDUTF_IMPLEMENTATION ppc64
|
|
/* end file src/simdutf/ppc64/begin.h */
|
|
|
|
// Declarations
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/intrinsics.h
|
|
/* begin file src/simdutf/ppc64/intrinsics.h */
|
|
#ifndef SIMDUTF_PPC64_INTRINSICS_H
|
|
#define SIMDUTF_PPC64_INTRINSICS_H
|
|
|
|
|
|
// This should be the correct header whether
|
|
// you use visual studio or other compilers.
|
|
#include <altivec.h>
|
|
|
|
// These are defined by altivec.h in GCC toolchain, it is safe to undef them.
|
|
#ifdef bool
|
|
#undef bool
|
|
#endif
|
|
|
|
#ifdef vector
|
|
#undef vector
|
|
#endif
|
|
|
|
#endif // SIMDUTF_PPC64_INTRINSICS_H
|
|
/* end file src/simdutf/ppc64/intrinsics.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/bitmanipulation.h
|
|
/* begin file src/simdutf/ppc64/bitmanipulation.h */
|
|
#ifndef SIMDUTF_PPC64_BITMANIPULATION_H
|
|
#define SIMDUTF_PPC64_BITMANIPULATION_H
|
|
|
|
namespace simdutf {
|
|
namespace ppc64 {
|
|
namespace {
|
|
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
simdutf_really_inline int count_ones(uint64_t input_num) {
|
|
// note: we do not support legacy 32-bit Windows
|
|
return __popcnt64(input_num); // Visual Studio wants two underscores
|
|
}
|
|
#else
|
|
simdutf_really_inline int count_ones(uint64_t input_num) {
|
|
return __builtin_popcountll(input_num);
|
|
}
|
|
#endif
|
|
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdutf
|
|
|
|
#endif // SIMDUTF_PPC64_BITMANIPULATION_H
|
|
/* end file src/simdutf/ppc64/bitmanipulation.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/simd.h
|
|
/* begin file src/simdutf/ppc64/simd.h */
|
|
#ifndef SIMDUTF_PPC64_SIMD_H
|
|
#define SIMDUTF_PPC64_SIMD_H
|
|
|
|
#include <type_traits>
|
|
|
|
namespace simdutf {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace simd {
|
|
|
|
// Register type used throughout the PPC64 SIMD wrappers below: a vector of
// unsigned chars, named __m128i like the register type of the x86 kernels.
using __m128i = __vector unsigned char;
|
|
|
|
template <typename Child> struct base {
|
|
__m128i value;
|
|
|
|
// Zero constructor
|
|
simdutf_really_inline base() : value{__m128i()} {}
|
|
|
|
// Conversion from SIMD register
|
|
simdutf_really_inline base(const __m128i _value) : value(_value) {}
|
|
|
|
// Conversion to SIMD register
|
|
simdutf_really_inline operator const __m128i &() const {
|
|
return this->value;
|
|
}
|
|
simdutf_really_inline operator __m128i &() { return this->value; }
|
|
|
|
// Bit operations
|
|
simdutf_really_inline Child operator|(const Child other) const {
|
|
return vec_or(this->value, (__m128i)other);
|
|
}
|
|
simdutf_really_inline Child operator&(const Child other) const {
|
|
return vec_and(this->value, (__m128i)other);
|
|
}
|
|
simdutf_really_inline Child operator^(const Child other) const {
|
|
return vec_xor(this->value, (__m128i)other);
|
|
}
|
|
simdutf_really_inline Child bit_andnot(const Child other) const {
|
|
return vec_andc(this->value, (__m128i)other);
|
|
}
|
|
simdutf_really_inline Child &operator|=(const Child other) {
|
|
auto this_cast = static_cast<Child*>(this);
|
|
*this_cast = *this_cast | other;
|
|
return *this_cast;
|
|
}
|
|
simdutf_really_inline Child &operator&=(const Child other) {
|
|
auto this_cast = static_cast<Child*>(this);
|
|
*this_cast = *this_cast & other;
|
|
return *this_cast;
|
|
}
|
|
simdutf_really_inline Child &operator^=(const Child other) {
|
|
auto this_cast = static_cast<Child*>(this);
|
|
*this_cast = *this_cast ^ other;
|
|
return *this_cast;
|
|
}
|
|
};
|
|
|
|
// Forward-declared so they can be used by splat and friends.
|
|
template <typename T> struct simd8;
|
|
|
|
template <typename T, typename Mask = simd8<bool>>
|
|
struct base8 : base<simd8<T>> {
|
|
typedef uint16_t bitmask_t;
|
|
typedef uint32_t bitmask2_t;
|
|
|
|
simdutf_really_inline base8() : base<simd8<T>>() {}
|
|
simdutf_really_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
|
|
|
|
friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) {
|
|
return (__m128i)vec_cmpeq(lhs.value, (__m128i)rhs);
|
|
}
|
|
|
|
static const int SIZE = sizeof(base<simd8<T>>::value);
|
|
|
|
template <int N = 1>
|
|
simdutf_really_inline simd8<T> prev(simd8<T> prev_chunk) const {
|
|
__m128i chunk = this->value;
|
|
#ifdef __LITTLE_ENDIAN__
|
|
chunk = (__m128i)vec_reve(this->value);
|
|
prev_chunk = (__m128i)vec_reve((__m128i)prev_chunk);
|
|
#endif
|
|
chunk = (__m128i)vec_sld((__m128i)prev_chunk, (__m128i)chunk, 16 - N);
|
|
#ifdef __LITTLE_ENDIAN__
|
|
chunk = (__m128i)vec_reve((__m128i)chunk);
|
|
#endif
|
|
return chunk;
|
|
}
|
|
};
|
|
|
|
// SIMD byte mask type (returned by things like eq and gt)
|
|
template <> struct simd8<bool> : base8<bool> {
|
|
static simdutf_really_inline simd8<bool> splat(bool _value) {
|
|
return (__m128i)vec_splats((unsigned char)(-(!!_value)));
|
|
}
|
|
|
|
simdutf_really_inline simd8<bool>() : base8() {}
|
|
simdutf_really_inline simd8<bool>(const __m128i _value)
|
|
: base8<bool>(_value) {}
|
|
// Splat constructor
|
|
simdutf_really_inline simd8<bool>(bool _value)
|
|
: base8<bool>(splat(_value)) {}
|
|
|
|
simdutf_really_inline int to_bitmask() const {
|
|
__vector unsigned long long result;
|
|
const __m128i perm_mask = {0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
|
|
0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};
|
|
|
|
result = ((__vector unsigned long long)vec_vbpermq((__m128i)this->value,
|
|
(__m128i)perm_mask));
|
|
#ifdef __LITTLE_ENDIAN__
|
|
return static_cast<int>(result[1]);
|
|
#else
|
|
return static_cast<int>(result[0]);
|
|
#endif
|
|
}
|
|
simdutf_really_inline bool any() const {
|
|
return !vec_all_eq(this->value, (__m128i)vec_splats(0));
|
|
}
|
|
simdutf_really_inline simd8<bool> operator~() const {
|
|
return this->value ^ (__m128i)splat(true);
|
|
}
|
|
};
|
|
|
|
// Operations common to the signed and unsigned byte-lane vectors: splat,
// load/store, wrapping add/subtract, bitwise NOT and 16-entry table lookup.
template <typename T> struct base8_numeric : base8<T> {
  // Broadcast `value` to every lane.
  static simdutf_really_inline simd8<T> splat(T value) {
    return (__m128i)vec_splats(value);
  }
  static simdutf_really_inline simd8<T> zero() { return splat(0); }
  // Load one register's worth of lanes from `values` via vec_vsx_ld.
  static simdutf_really_inline simd8<T> load(const T values[16]) {
    const uint8_t *bytes = reinterpret_cast<const uint8_t *>(values);
    return (__m128i)(vec_vsx_ld(0, bytes));
  }
  // Repeat 16 values as many times as necessary (usually for lookup tables)
  static simdutf_really_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
                                                  T v5, T v6, T v7, T v8, T v9,
                                                  T v10, T v11, T v12, T v13,
                                                  T v14, T v15) {
    return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7,
                    v8, v9, v10, v11, v12, v13, v14, v15);
  }

  simdutf_really_inline base8_numeric() : base8<T>() {}
  simdutf_really_inline base8_numeric(const __m128i _value)
      : base8<T>(_value) {}

  // Store to array
  simdutf_really_inline void store(T dst[16]) const {
    __m128i *out = reinterpret_cast<__m128i *>(dst);
    vec_vsx_st(this->value, 0, out);
  }

  // Override to distinguish from bool version
  simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }

  // Addition/subtraction are the same for signed and unsigned
  simdutf_really_inline simd8<T> operator+(const simd8<T> other) const {
    const __m128i sum = this->value + other.value;
    return sum;
  }
  simdutf_really_inline simd8<T> operator-(const simd8<T> other) const {
    const __m128i difference = this->value - other.value;
    return difference;
  }
  simdutf_really_inline simd8<T> &operator+=(const simd8<T> other) {
    *this = *this + other;
    return static_cast<simd8<T> &>(*this);
  }
  simdutf_really_inline simd8<T> &operator-=(const simd8<T> other) {
    *this = *this - other;
    return static_cast<simd8<T> &>(*this);
  }

  // Perform a lookup assuming the value is between 0 and 16 (undefined behavior
  // for out of range values). The lanes of this vector are the indices; the
  // table is passed to vec_perm as both of its sources.
  template <typename L>
  simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    const __m128i table = lookup_table.value;
    return (__m128i)vec_perm(table, table, this->value);
  }

  // Convenience overload: builds the table from sixteen scalars, then defers
  // to the vector overload above.
  template <typename L>
  simdutf_really_inline simd8<L>
  lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
            L replace5, L replace6, L replace7, L replace8, L replace9,
            L replace10, L replace11, L replace12, L replace13, L replace14,
            L replace15) const {
    const simd8<L> table = simd8<L>::repeat_16(
        replace0, replace1, replace2, replace3, replace4, replace5, replace6,
        replace7, replace8, replace9, replace10, replace11, replace12,
        replace13, replace14, replace15);
    return lookup_16(table);
  }
};
|
|
|
|
// Signed bytes
|
|
template <> struct simd8<int8_t> : base8_numeric<int8_t> {
|
|
simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
|
|
simdutf_really_inline simd8(const __m128i _value)
|
|
: base8_numeric<int8_t>(_value) {}
|
|
|
|
// Splat constructor
|
|
simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
|
|
int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11,
|
|
int8_t v12, int8_t v13, int8_t v14, int8_t v15)
|
|
: simd8((__m128i)(__vector signed char){v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10, v11, v12, v13, v14,
|
|
v15}) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdutf_really_inline static simd8<int8_t>
|
|
repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
|
|
int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
|
|
int8_t v12, int8_t v13, int8_t v14, int8_t v15) {
|
|
return simd8<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
|
|
v13, v14, v15);
|
|
}
|
|
|
|
// Order-sensitive comparisons
|
|
simdutf_really_inline simd8<int8_t>
|
|
max_val(const simd8<int8_t> other) const {
|
|
return (__m128i)vec_max((__vector signed char)this->value,
|
|
(__vector signed char)(__m128i)other);
|
|
}
|
|
simdutf_really_inline simd8<int8_t>
|
|
min_val(const simd8<int8_t> other) const {
|
|
return (__m128i)vec_min((__vector signed char)this->value,
|
|
(__vector signed char)(__m128i)other);
|
|
}
|
|
simdutf_really_inline simd8<bool>
|
|
operator>(const simd8<int8_t> other) const {
|
|
return (__m128i)vec_cmpgt((__vector signed char)this->value,
|
|
(__vector signed char)(__m128i)other);
|
|
}
|
|
simdutf_really_inline simd8<bool>
|
|
operator<(const simd8<int8_t> other) const {
|
|
return (__m128i)vec_cmplt((__vector signed char)this->value,
|
|
(__vector signed char)(__m128i)other);
|
|
}
|
|
};
|
|
|
|
// Unsigned bytes
|
|
template <> struct simd8<uint8_t> : base8_numeric<uint8_t> {
|
|
simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
|
|
simdutf_really_inline simd8(const __m128i _value)
|
|
: base8_numeric<uint8_t>(_value) {}
|
|
// Splat constructor
|
|
simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdutf_really_inline simd8(const uint8_t *values) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdutf_really_inline
|
|
simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
|
|
uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
|
|
uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
|
|
: simd8((__m128i){v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
|
|
v13, v14, v15}) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdutf_really_inline static simd8<uint8_t>
|
|
repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
|
|
uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
|
|
uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
|
|
uint8_t v15) {
|
|
return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
|
|
v13, v14, v15);
|
|
}
|
|
|
|
// Saturated math
|
|
simdutf_really_inline simd8<uint8_t>
|
|
saturating_add(const simd8<uint8_t> other) const {
|
|
return (__m128i)vec_adds(this->value, (__m128i)other);
|
|
}
|
|
simdutf_really_inline simd8<uint8_t>
|
|
saturating_sub(const simd8<uint8_t> other) const {
|
|
return (__m128i)vec_subs(this->value, (__m128i)other);
|
|
}
|
|
|
|
// Order-specific operations
|
|
simdutf_really_inline simd8<uint8_t>
|
|
max_val(const simd8<uint8_t> other) const {
|
|
return (__m128i)vec_max(this->value, (__m128i)other);
|
|
}
|
|
simdutf_really_inline simd8<uint8_t>
|
|
min_val(const simd8<uint8_t> other) const {
|
|
return (__m128i)vec_min(this->value, (__m128i)other);
|
|
}
|
|
// Same as >, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdutf_really_inline simd8<uint8_t>
|
|
gt_bits(const simd8<uint8_t> other) const {
|
|
return this->saturating_sub(other);
|
|
}
|
|
// Same as <, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdutf_really_inline simd8<uint8_t>
|
|
lt_bits(const simd8<uint8_t> other) const {
|
|
return other.saturating_sub(*this);
|
|
}
|
|
simdutf_really_inline simd8<bool>
|
|
operator<=(const simd8<uint8_t> other) const {
|
|
return other.max_val(*this) == other;
|
|
}
|
|
simdutf_really_inline simd8<bool>
|
|
operator>=(const simd8<uint8_t> other) const {
|
|
return other.min_val(*this) == other;
|
|
}
|
|
simdutf_really_inline simd8<bool>
|
|
operator>(const simd8<uint8_t> other) const {
|
|
return this->gt_bits(other).any_bits_set();
|
|
}
|
|
simdutf_really_inline simd8<bool>
|
|
operator<(const simd8<uint8_t> other) const {
|
|
return this->gt_bits(other).any_bits_set();
|
|
}
|
|
|
|
// Bit-specific operations
|
|
// Per-lane mask: true where the lane compares equal to zero.
simdutf_really_inline simd8<bool> bits_not_set() const {
  const __m128i zeros = (__m128i)vec_splats(uint8_t(0));
  return (__m128i)vec_cmpeq(this->value, zeros);
}
|
|
// Per-lane mask: true where none of the bits selected by `bits` are set.
simdutf_really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const {
  const simd8<uint8_t> selected = *this & bits;
  return selected.bits_not_set();
}
|
|
// Per-lane mask: complement of bits_not_set().
simdutf_really_inline simd8<bool> any_bits_set() const {
  const simd8<bool> all_clear = this->bits_not_set();
  return ~all_clear;
}
|
|
// Per-lane mask: complement of bits_not_set(bits).
simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const {
  const simd8<bool> all_clear = this->bits_not_set(bits);
  return ~all_clear;
}
|
|
|
|
// True when no lane exceeds 0x7F: subtracting 0b01111111 with saturation must
// leave every lane at zero.
simdutf_really_inline bool is_ascii() const {
  const simd8<uint8_t> above_ascii = this->saturating_sub(0b01111111u);
  return above_ascii.bits_not_set_anywhere();
}
|
|
|
|
// Scalar predicate: true when the whole register compares equal to zero.
simdutf_really_inline bool bits_not_set_anywhere() const {
  const __m128i zeros = (__m128i)vec_splats(0);
  return vec_all_eq(this->value, zeros);
}
|
|
// Scalar predicate: negation of bits_not_set_anywhere().
simdutf_really_inline bool any_bits_set_anywhere() const {
  const bool all_clear = bits_not_set_anywhere();
  return !all_clear;
}
|
|
// Scalar predicate: true when (this & bits) compares equal to zero across the
// whole register.
simdutf_really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const {
  const __m128i zeros = (__m128i)vec_splats(0);
  return vec_all_eq(vec_and(this->value, (__m128i)bits), zeros);
}
|
|
// Scalar predicate: negation of bits_not_set_anywhere(bits).
simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const {
  const bool all_clear = bits_not_set_anywhere(bits);
  return !all_clear;
}
|
|
// Shifts every lane right by the compile-time amount N (vec_sr).
template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
  const __m128i amount = (__m128i)vec_splat_u8(N);
  const __m128i shifted = (__m128i)vec_sr(this->value, amount);
  return simd8<uint8_t>(shifted);
}
|
|
// Shifts every lane left by the compile-time amount N (vec_sl).
template <int N> simdutf_really_inline simd8<uint8_t> shl() const {
  const __m128i amount = (__m128i)vec_splat_u8(N);
  const __m128i shifted = (__m128i)vec_sl(this->value, amount);
  return simd8<uint8_t>(shifted);
}
|
|
};
|
|
|
|
template <typename T> struct simd8x64 {
|
|
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
|
|
static_assert(NUM_CHUNKS == 4,
|
|
"PPC64 kernel should use four registers per 64-byte block.");
|
|
simd8<T> chunks[NUM_CHUNKS];
|
|
|
|
simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
|
|
simd8x64<T> &
|
|
operator=(const simd8<T> other) = delete; // no assignment allowed
|
|
simd8x64() = delete; // no default constructor allowed
|
|
|
|
simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
|
|
const simd8<T> chunk2, const simd8<T> chunk3)
|
|
: chunks{chunk0, chunk1, chunk2, chunk3} {}
|
|
|
|
simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+2*sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+3*sizeof(simd8<T>)/sizeof(T))} {}
|
|
|
|
simdutf_really_inline void store(T* ptr) const {
|
|
this->chunks[0].store(ptr + sizeof(simd8<T>) * 0/sizeof(T));
|
|
this->chunks[1].store(ptr + sizeof(simd8<T>) * 1/sizeof(T));
|
|
this->chunks[2].store(ptr + sizeof(simd8<T>) * 2/sizeof(T));
|
|
this->chunks[3].store(ptr + sizeof(simd8<T>) * 3/sizeof(T));
|
|
}
|
|
|
|
|
|
simdutf_really_inline simd8x64<T>& operator |=(const simd8x64<T> &other) {
|
|
this->chunks[0] |= other.chunks[0];
|
|
this->chunks[1] |= other.chunks[1];
|
|
this->chunks[2] |= other.chunks[2];
|
|
this->chunks[3] |= other.chunks[3];
|
|
return *this;
|
|
}
|
|
|
|
simdutf_really_inline simd8<T> reduce_or() const {
|
|
return (this->chunks[0] | this->chunks[1]) |
|
|
(this->chunks[2] | this->chunks[3]);
|
|
}
|
|
|
|
|
|
simdutf_really_inline bool is_ascii() const {
|
|
return input.reduce_or().is_ascii();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t to_bitmask() const {
|
|
uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
|
|
uint64_t r1 = this->chunks[1].to_bitmask();
|
|
uint64_t r2 = this->chunks[2].to_bitmask();
|
|
uint64_t r3 = this->chunks[3].to_bitmask();
|
|
return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
|
|
}
|
|
|
|
simdutf_really_inline uint64_t eq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
|
|
this->chunks[2] == mask, this->chunks[3] == mask)
|
|
.to_bitmask();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
|
|
return simd8x64<bool>(this->chunks[0] == other.chunks[0],
|
|
this->chunks[1] == other.chunks[1],
|
|
this->chunks[2] == other.chunks[2],
|
|
this->chunks[3] == other.chunks[3])
|
|
.to_bitmask();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t lteq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
|
|
this->chunks[2] <= mask, this->chunks[3] <= mask)
|
|
.to_bitmask();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t in_range(const T low, const T high) const {
|
|
const simd8<T> mask_low = simd8<T>::splat(low);
|
|
const simd8<T> mask_high = simd8<T>::splat(high);
|
|
|
|
return simd8x64<bool>(
|
|
(this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
|
|
(this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
|
|
(this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
|
|
(this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
|
|
).to_bitmask();
|
|
}
|
|
simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
|
|
const simd8<T> mask_low = simd8<T>::splat(low);
|
|
const simd8<T> mask_high = simd8<T>::splat(high);
|
|
return simd8x64<bool>(
|
|
(this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
|
|
(this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
|
|
(this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
|
|
(this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)
|
|
).to_bitmask();
|
|
}
|
|
simdutf_really_inline uint64_t lt(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
|
|
this->chunks[2] < mask, this->chunks[3] < mask)
|
|
.to_bitmask();
|
|
}
|
|
|
|
simdutf_really_inline uint64_t gt(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] > mask,
|
|
this->chunks[1] > mask,
|
|
this->chunks[2] > mask,
|
|
this->chunks[3] > mask
|
|
).to_bitmask();
|
|
}
|
|
simdutf_really_inline uint64_t gteq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] >= mask,
|
|
this->chunks[1] >= mask,
|
|
this->chunks[2] >= mask,
|
|
this->chunks[3] >= mask
|
|
).to_bitmask();
|
|
}
|
|
simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
|
|
const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
|
|
return simd8x64<bool>(
|
|
simd8<uint8_t>(this->chunks[0]) >= mask,
|
|
simd8<uint8_t>(this->chunks[1]) >= mask,
|
|
simd8<uint8_t>(this->chunks[2]) >= mask,
|
|
simd8<uint8_t>(this->chunks[3]) >= mask
|
|
).to_bitmask();
|
|
}
|
|
}; // struct simd8x64<T>
|
|
|
|
} // namespace simd
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdutf
|
|
|
|
#endif // SIMDUTF_PPC64_SIMD_INPUT_H
|
|
/* end file src/simdutf/ppc64/simd.h */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/end.h
|
|
/* begin file src/simdutf/ppc64/end.h */
|
|
/* end file src/simdutf/ppc64/end.h */
|
|
|
|
#endif // SIMDUTF_IMPLEMENTATION_PPC64
|
|
|
|
#endif // SIMDUTF_PPC64_H
|
|
/* end file src/simdutf/ppc64.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback.h
|
|
/* begin file src/simdutf/fallback.h */
|
|
#ifndef SIMDUTF_FALLBACK_H
|
|
#define SIMDUTF_FALLBACK_H
|
|
|
|
|
|
// Note that fallback.h is always imported last.
|
|
|
|
// Default Fallback to on unless a builtin implementation has already been selected.
|
|
#ifndef SIMDUTF_IMPLEMENTATION_FALLBACK
|
|
#if SIMDUTF_CAN_ALWAYS_RUN_ARM64 || SIMDUTF_CAN_ALWAYS_RUN_ICELAKE || SIMDUTF_CAN_ALWAYS_RUN_HASWELL || SIMDUTF_CAN_ALWAYS_RUN_WESTMERE || SIMDUTF_CAN_ALWAYS_RUN_PPC64
|
|
#define SIMDUTF_IMPLEMENTATION_FALLBACK 0
|
|
#else
|
|
#define SIMDUTF_IMPLEMENTATION_FALLBACK 1
|
|
#endif
|
|
#endif
|
|
|
|
#define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK (SIMDUTF_IMPLEMENTATION_FALLBACK)
|
|
|
|
#if SIMDUTF_IMPLEMENTATION_FALLBACK
|
|
|
|
namespace simdutf {
|
|
/**
|
|
* Fallback implementation (runs on any machine).
|
|
*/
|
|
namespace fallback {
|
|
} // namespace fallback
|
|
} // namespace simdutf
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/implementation.h
|
|
/* begin file src/simdutf/fallback/implementation.h */
|
|
#ifndef SIMDUTF_FALLBACK_IMPLEMENTATION_H
|
|
#define SIMDUTF_FALLBACK_IMPLEMENTATION_H
|
|
|
|
|
|
namespace simdutf {
|
|
namespace fallback {
|
|
|
|
namespace {
|
|
using namespace simdutf;
|
|
}
|
|
|
|
class implementation final : public simdutf::implementation {
|
|
public:
|
|
simdutf_really_inline implementation() : simdutf::implementation(
|
|
"fallback",
|
|
"Generic fallback implementation",
|
|
0
|
|
) {}
|
|
simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
|
|
simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
|
|
simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
|
|
simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
|
|
simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
|
|
void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
|
|
simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
|
|
simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
|
|
};
|
|
|
|
} // namespace fallback
|
|
} // namespace simdutf
|
|
|
|
#endif // SIMDUTF_FALLBACK_IMPLEMENTATION_H
|
|
/* end file src/simdutf/fallback/implementation.h */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/begin.h
|
|
/* begin file src/simdutf/fallback/begin.h */
|
|
// redefining SIMDUTF_IMPLEMENTATION to "fallback"
|
|
// #define SIMDUTF_IMPLEMENTATION fallback
|
|
/* end file src/simdutf/fallback/begin.h */
|
|
|
|
// Declarations
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/bitmanipulation.h
|
|
/* begin file src/simdutf/fallback/bitmanipulation.h */
|
|
#ifndef SIMDUTF_FALLBACK_BITMANIPULATION_H
|
|
#define SIMDUTF_FALLBACK_BITMANIPULATION_H
|
|
|
|
#include <limits>
|
|
|
|
namespace simdutf {
|
|
namespace fallback {
|
|
namespace {
|
|
|
|
#if defined(_MSC_VER) && !defined(_M_ARM64) && !defined(_M_X64)
|
|
// 64-bit forward bit scan for MSVC targets that only provide the 32-bit
// _BitScanForward intrinsic (see the enclosing #if).
// Stores the index of the lowest set bit of x in *ret and returns nonzero when
// x != 0. For x == 0 it returns 0 and stores 0 in *ret.
// Fix: the previous version scanned both halves unconditionally and, for
// x == 0, read a scan result the intrinsic leaves undefined for a zero input.
// Each half is now scanned only when it is non-zero.
static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) {
  const unsigned long low = (unsigned long)x;
  const unsigned long high = (unsigned long)(x >> 32);
  unsigned long index = 0;
  if (low != 0) {
    _BitScanForward(&index, low);
  } else if (high != 0) {
    _BitScanForward(&index, high);
    index += 32; // the bit lives in the upper 32-bit half
  }
  *ret = index;
  return x != 0;
}
|
|
// 64-bit reverse bit scan for MSVC targets that only provide the 32-bit
// _BitScanReverse intrinsic (see the enclosing #if).
// Stores the index of the highest set bit of x in *ret and returns nonzero
// when x != 0. For x == 0 it returns 0 and stores 0 in *ret.
// Fix: the previous version scanned both halves unconditionally and, for
// x == 0, read a scan result the intrinsic leaves undefined for a zero input.
// Each half is now scanned only when it is non-zero.
static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) {
  const unsigned long high = (unsigned long)(x >> 32);
  const unsigned long low = (unsigned long)x;
  unsigned long index = 0;
  if (high != 0) {
    _BitScanReverse(&index, high);
    index += 32; // the bit lives in the upper 32-bit half
  } else if (low != 0) {
    _BitScanReverse(&index, low);
  }
  *ret = index;
  return x != 0;
}
|
|
#endif
|
|
|
|
} // unnamed namespace
|
|
} // namespace fallback
|
|
} // namespace simdutf
|
|
|
|
#endif // SIMDUTF_FALLBACK_BITMANIPULATION_H
|
|
/* end file src/simdutf/fallback/bitmanipulation.h */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/end.h
|
|
/* begin file src/simdutf/fallback/end.h */
|
|
/* end file src/simdutf/fallback/end.h */
|
|
|
|
#endif // SIMDUTF_IMPLEMENTATION_FALLBACK
|
|
#endif // SIMDUTF_FALLBACK_H
|
|
/* end file src/simdutf/fallback.h */
|
|
|
|
namespace simdutf {
|
|
bool implementation::supported_by_runtime_system() const {
|
|
uint32_t required_instruction_sets = this->required_instruction_sets();
|
|
uint32_t supported_instruction_sets = internal::detect_supported_architectures();
|
|
return ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets);
|
|
}
|
|
|
|
// Guesses the encoding of `input` (`length` bytes). Checks, in order: a BOM,
// valid UTF-8, valid UTF-16LE (even byte count only), valid UTF-32 (byte count
// divisible by four). Returns encoding_type::unspecified when nothing matches.
simdutf_warn_unused encoding_type implementation::autodetect_encoding(const char * input, size_t length) const noexcept {
  // A byte-order mark, when present, is trusted outright.
  const auto from_bom = simdutf::BOM::check_bom(input, length);
  if (from_bom != encoding_type::unspecified) {
    return from_bom;
  }

  // UTF-8 is common, includes ASCII, and usually comes without a BOM, so it
  // is tried first. This is only a guess: UTF-16 data without a BOM can still
  // pass as UTF-8. A possible refinement would be to look for UTF-16 ASCII
  // first (every other byte is zero).
  if (validate_utf8(input, length)) {
    return encoding_type::UTF8;
  }

  // UTF-16LE is probably the next most common BOM-less encoding. The
  // validator takes a count of 16-bit units, hence the division by two.
  const bool whole_utf16_units = (length % 2) == 0;
  if (whole_utf16_units &&
      validate_utf16le(reinterpret_cast<const char16_t*>(input), length/2)) {
    return encoding_type::UTF16_LE;
  }

  // Finally UTF-32, which needs a whole number of 32-bit units.
  const bool whole_utf32_units = (length % 4) == 0;
  if (whole_utf32_units &&
      validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) {
    return encoding_type::UTF32_LE;
  }

  return encoding_type::unspecified;
}
|
|
|
|
namespace internal {
|
|
|
|
// One const instance per kernel that is compiled in. The intent is for these
// objects to be baked into the executable without requiring a static
// initializer.
#if SIMDUTF_IMPLEMENTATION_ICELAKE
const icelake::implementation icelake_singleton{};
#endif
#if SIMDUTF_IMPLEMENTATION_HASWELL
const haswell::implementation haswell_singleton{};
#endif
#if SIMDUTF_IMPLEMENTATION_WESTMERE
const westmere::implementation westmere_singleton{};
#endif
#if SIMDUTF_IMPLEMENTATION_ARM64
const arm64::implementation arm64_singleton{};
#endif
#if SIMDUTF_IMPLEMENTATION_PPC64
const ppc64::implementation ppc64_singleton{};
#endif
#if SIMDUTF_IMPLEMENTATION_FALLBACK
const fallback::implementation fallback_singleton{};
#endif
|
|
|
|
/**
|
|
* @private Detects best supported implementation on first use, and sets it
|
|
*/
|
|
class detect_best_supported_implementation_on_first_use final : public implementation {
|
|
public:
|
|
const std::string &name() const noexcept final { return set_best()->name(); }
|
|
const std::string &description() const noexcept final { return set_best()->description(); }
|
|
uint32_t required_instruction_sets() const noexcept final { return set_best()->required_instruction_sets(); }
|
|
|
|
simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept override {
|
|
return set_best()->detect_encodings(input, length);
|
|
}
|
|
|
|
simdutf_warn_unused bool validate_utf8(const char * buf, size_t len) const noexcept final override {
|
|
return set_best()->validate_utf8(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused result validate_utf8_with_errors(const char * buf, size_t len) const noexcept final override {
|
|
return set_best()->validate_utf8_with_errors(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused bool validate_ascii(const char * buf, size_t len) const noexcept final override {
|
|
return set_best()->validate_ascii(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused result validate_ascii_with_errors(const char * buf, size_t len) const noexcept final override {
|
|
return set_best()->validate_ascii_with_errors(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused bool validate_utf16le(const char16_t * buf, size_t len) const noexcept final override {
|
|
return set_best()->validate_utf16le(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused bool validate_utf16be(const char16_t * buf, size_t len) const noexcept final override {
|
|
return set_best()->validate_utf16be(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused result validate_utf16le_with_errors(const char16_t * buf, size_t len) const noexcept final override {
|
|
return set_best()->validate_utf16le_with_errors(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused result validate_utf16be_with_errors(const char16_t * buf, size_t len) const noexcept final override {
|
|
return set_best()->validate_utf16be_with_errors(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused bool validate_utf32(const char32_t * buf, size_t len) const noexcept final override {
|
|
return set_best()->validate_utf32(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused result validate_utf32_with_errors(const char32_t * buf, size_t len) const noexcept final override {
|
|
return set_best()->validate_utf32_with_errors(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
|
|
return set_best()->convert_utf8_to_utf16le(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
|
|
return set_best()->convert_utf8_to_utf16be(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
|
|
return set_best()->convert_utf8_to_utf16le_with_errors(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
|
|
return set_best()->convert_utf8_to_utf16be_with_errors(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
|
|
return set_best()->convert_valid_utf8_to_utf16le(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
|
|
return set_best()->convert_valid_utf8_to_utf16be(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final override {
|
|
return set_best()->convert_utf8_to_utf32(buf, len, utf32_output);
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final override {
|
|
return set_best()->convert_utf8_to_utf32_with_errors(buf, len, utf32_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final override {
|
|
return set_best()->convert_valid_utf8_to_utf32(buf, len, utf32_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
|
|
return set_best()->convert_utf16le_to_utf8(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
|
|
return set_best()->convert_utf16be_to_utf8(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
|
|
return set_best()->convert_utf16le_to_utf8_with_errors(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
|
|
return set_best()->convert_utf16be_to_utf8_with_errors(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
|
|
return set_best()->convert_valid_utf16le_to_utf8(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
|
|
return set_best()->convert_valid_utf16be_to_utf8(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override {
|
|
return set_best()->convert_utf32_to_utf8(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override {
|
|
return set_best()->convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override {
|
|
return set_best()->convert_valid_utf32_to_utf8(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
|
|
return set_best()->convert_utf32_to_utf16le(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
|
|
return set_best()->convert_utf32_to_utf16be(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
|
|
return set_best()->convert_utf32_to_utf16le_with_errors(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
|
|
return set_best()->convert_utf32_to_utf16be_with_errors(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
|
|
return set_best()->convert_valid_utf32_to_utf16le(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
|
|
return set_best()->convert_valid_utf32_to_utf16be(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
|
|
return set_best()->convert_utf16le_to_utf32(buf, len, utf32_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
|
|
return set_best()->convert_utf16be_to_utf32(buf, len, utf32_output);
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
|
|
return set_best()->convert_utf16le_to_utf32_with_errors(buf, len, utf32_output);
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
|
|
return set_best()->convert_utf16be_to_utf32_with_errors(buf, len, utf32_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
|
|
return set_best()->convert_valid_utf16le_to_utf32(buf, len, utf32_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
|
|
return set_best()->convert_valid_utf16be_to_utf32(buf, len, utf32_output);
|
|
}
|
|
|
|
void change_endianness_utf16(const char16_t * buf, size_t len, char16_t * output) const noexcept final override {
|
|
set_best()->change_endianness_utf16(buf, len, output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t len) const noexcept final override {
|
|
return set_best()->count_utf16le(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t len) const noexcept final override {
|
|
return set_best()->count_utf16be(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused size_t count_utf8(const char * buf, size_t len) const noexcept final override {
|
|
return set_best()->count_utf8(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * buf, size_t len) const noexcept override {
|
|
return set_best()->utf8_length_from_utf16le(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * buf, size_t len) const noexcept override {
|
|
return set_best()->utf8_length_from_utf16be(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * buf, size_t len) const noexcept override {
|
|
return set_best()->utf32_length_from_utf16le(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * buf, size_t len) const noexcept override {
|
|
return set_best()->utf32_length_from_utf16be(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused size_t utf16_length_from_utf8(const char * buf, size_t len) const noexcept override {
|
|
return set_best()->utf16_length_from_utf8(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * buf, size_t len) const noexcept override {
|
|
return set_best()->utf8_length_from_utf32(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * buf, size_t len) const noexcept override {
|
|
return set_best()->utf16_length_from_utf32(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused size_t utf32_length_from_utf8(const char * buf, size_t len) const noexcept override {
|
|
return set_best()->utf32_length_from_utf8(buf, len);
|
|
}
|
|
|
|
simdutf_really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {}
|
|
|
|
private:
|
|
const implementation *set_best() const noexcept;
|
|
};
|
|
|
|
|
|
// Pointers to the kernel singletons that were compiled in, listed in the
// order icelake, haswell, westmere, arm64, ppc64, fallback.
const std::initializer_list<const implementation *> available_implementation_pointers {
#if SIMDUTF_IMPLEMENTATION_ICELAKE
  &icelake_singleton,
#endif
#if SIMDUTF_IMPLEMENTATION_HASWELL
  &haswell_singleton,
#endif
#if SIMDUTF_IMPLEMENTATION_WESTMERE
  &westmere_singleton,
#endif
#if SIMDUTF_IMPLEMENTATION_ARM64
  &arm64_singleton,
#endif
#if SIMDUTF_IMPLEMENTATION_PPC64
  &ppc64_singleton,
#endif
#if SIMDUTF_IMPLEMENTATION_FALLBACK
  &fallback_singleton,
#endif
}; // available_implementation_pointers
|
|
|
|
// So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no support
|
|
class unsupported_implementation final : public implementation {
|
|
public:
|
|
simdutf_warn_unused int detect_encodings(const char *, size_t) const noexcept override {
|
|
return encoding_type::unspecified;
|
|
}
|
|
|
|
simdutf_warn_unused bool validate_utf8(const char *, size_t) const noexcept final override {
|
|
return false; // Just refuse to validate. Given that we have a fallback implementation
|
|
// it seems unlikely that unsupported_implementation will ever be used. If it is used,
|
|
// then it will flag all strings as invalid. The alternative is to return an error_code
|
|
// from which the user has to figure out whether the string is valid UTF-8... which seems
|
|
// like a lot of work just to handle the very unlikely case that we have an unsupported
|
|
// implementation. And, when it does happen (that we have an unsupported implementation),
|
|
// what are the chances that the programmer has a fallback? Given that *we* provide the
|
|
// fallback, it implies that the programmer would need a fallback for our fallback.
|
|
}
|
|
|
|
simdutf_warn_unused result validate_utf8_with_errors(const char *, size_t) const noexcept final override {
|
|
return result(error_code::OTHER, 0);
|
|
}
|
|
|
|
simdutf_warn_unused bool validate_ascii(const char *, size_t) const noexcept final override {
|
|
return false;
|
|
}
|
|
|
|
simdutf_warn_unused result validate_ascii_with_errors(const char *, size_t) const noexcept final override {
|
|
return result(error_code::OTHER, 0);
|
|
}
|
|
|
|
simdutf_warn_unused bool validate_utf16le(const char16_t*, size_t) const noexcept final override {
|
|
return false;
|
|
}
|
|
|
|
simdutf_warn_unused bool validate_utf16be(const char16_t*, size_t) const noexcept final override {
|
|
return false;
|
|
}
|
|
|
|
simdutf_warn_unused result validate_utf16le_with_errors(const char16_t*, size_t) const noexcept final override {
|
|
return result(error_code::OTHER, 0);
|
|
}
|
|
|
|
simdutf_warn_unused result validate_utf16be_with_errors(const char16_t*, size_t) const noexcept final override {
|
|
return result(error_code::OTHER, 0);
|
|
}
|
|
|
|
simdutf_warn_unused bool validate_utf32(const char32_t*, size_t) const noexcept final override {
|
|
return false;
|
|
}
|
|
|
|
simdutf_warn_unused result validate_utf32_with_errors(const char32_t*, size_t) const noexcept final override {
|
|
return result(error_code::OTHER, 0);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_utf8_to_utf16le(const char*, size_t, char16_t*) const noexcept final override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_utf8_to_utf16be(const char*, size_t, char16_t*) const noexcept final override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char*, size_t, char16_t*) const noexcept final override {
|
|
return result(error_code::OTHER, 0);
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char*, size_t, char16_t*) const noexcept final override {
|
|
return result(error_code::OTHER, 0);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char*, size_t, char16_t*) const noexcept final override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char*, size_t, char16_t*) const noexcept final override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_utf8_to_utf32(const char*, size_t, char32_t*) const noexcept final override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char*, size_t, char32_t*) const noexcept final override {
|
|
return result(error_code::OTHER, 0);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char*, size_t, char32_t*) const noexcept final override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t*, size_t, char*) const noexcept final override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t*, size_t, char*) const noexcept final override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t*, size_t, char*) const noexcept final override {
|
|
return result(error_code::OTHER, 0);
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t*, size_t, char*) const noexcept final override {
|
|
return result(error_code::OTHER, 0);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t*, size_t, char*) const noexcept final override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t*, size_t, char*) const noexcept final override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t*, size_t, char*) const noexcept final override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t*, size_t, char*) const noexcept final override {
|
|
return result(error_code::OTHER, 0);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t*, size_t, char*) const noexcept final override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t*, size_t, char16_t*) const noexcept final override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t*, size_t, char16_t*) const noexcept final override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t*, size_t, char16_t*) const noexcept final override {
|
|
return result(error_code::OTHER, 0);
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t*, size_t, char16_t*) const noexcept final override {
|
|
return result(error_code::OTHER, 0);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t*, size_t, char16_t*) const noexcept final override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t*, size_t, char16_t*) const noexcept final override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t*, size_t, char32_t*) const noexcept final override {
|
|
return result(error_code::OTHER, 0);
|
|
}
|
|
|
|
simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t*, size_t, char32_t*) const noexcept final override {
|
|
return result(error_code::OTHER, 0);
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override {
|
|
return 0;
|
|
}
|
|
|
|
void change_endianness_utf16(const char16_t *, size_t, char16_t *) const noexcept final override {
|
|
|
|
}
|
|
|
|
simdutf_warn_unused size_t count_utf16le(const char16_t *, size_t) const noexcept final override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused size_t count_utf16be(const char16_t *, size_t) const noexcept final override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused size_t count_utf8(const char *, size_t) const noexcept final override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *, size_t) const noexcept override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *, size_t) const noexcept override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *, size_t) const noexcept override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *, size_t) const noexcept override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused size_t utf16_length_from_utf8(const char *, size_t) const noexcept override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *, size_t) const noexcept override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *, size_t) const noexcept override {
|
|
return 0;
|
|
}
|
|
|
|
simdutf_warn_unused size_t utf32_length_from_utf8(const char *, size_t) const noexcept override {
|
|
return 0;
|
|
}
|
|
|
|
unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {}
|
|
};
|
|
|
|
// The one shared unsupported_implementation instance; its address is what the
// selection code in this file returns when no other implementation is chosen.
const unsupported_implementation unsupported_singleton{};
|
|
|
|
size_t available_implementation_list::size() const noexcept {
|
|
return internal::available_implementation_pointers.size();
|
|
}
|
|
// Pointer to the first entry of internal::available_implementation_pointers.
const implementation * const *available_implementation_list::begin() const noexcept {
  const auto &pointers = internal::available_implementation_pointers;
  return pointers.begin();
}
|
|
// One-past-the-last entry of internal::available_implementation_pointers.
const implementation * const *available_implementation_list::end() const noexcept {
  const auto &pointers = internal::available_implementation_pointers;
  return pointers.end();
}
|
|
// Returns the first compiled-in implementation whose required instruction sets
// are all present in the detected set, or &unsupported_singleton if none is.
const implementation *available_implementation_list::detect_best_supported() const noexcept {
  // The candidates are listed in priority order, so the first match wins.
  const uint32_t detected = internal::detect_supported_architectures();
  for (const implementation *candidate : internal::available_implementation_pointers) {
    const uint32_t needed = candidate->required_instruction_sets();
    const bool runnable = (detected & needed) == needed;
    if (!runnable) {
      continue;
    }
    return candidate;
  }
  return &unsupported_singleton; // this should never happen?
}
|
|
|
|
// Decides which implementation becomes active, stores it via
// get_active_implementation() and returns it. If the environment variable
// SIMDUTF_FORCE_IMPLEMENTATION is set, the implementation of that name is used
// (or the unsupported placeholder when the name is unknown); otherwise the
// best supported implementation is detected.
const implementation *detect_best_supported_implementation_on_first_use::set_best() const noexcept {
  SIMDUTF_PUSH_DISABLE_WARNINGS
  SIMDUTF_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe
  char *requested_name = getenv("SIMDUTF_FORCE_IMPLEMENTATION");
  SIMDUTF_POP_DISABLE_WARNINGS

  // No override requested: fall back to runtime detection.
  if (!requested_name) {
    return get_active_implementation() = get_available_implementations().detect_best_supported();
  }

  const implementation *chosen = get_available_implementations()[requested_name];
  if (!chosen) {
    // Note: abort() and stderr usage within the library is forbidden.
    chosen = &unsupported_singleton;
  }
  return get_active_implementation() = chosen;
}
|
|
|
|
} // namespace internal
|
|
|
|
|
|
|
|
/**
|
|
* The list of available implementations compiled into simdutf.
|
|
*/
|
|
SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations() {
|
|
static const internal::available_implementation_list available_implementations{};
|
|
return available_implementations;
|
|
}
|
|
|
|
/**
|
|
* The active implementation.
|
|
*/
|
|
SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation>& get_active_implementation() {
|
|
static const internal::detect_best_supported_implementation_on_first_use detect_best_supported_implementation_on_first_use_singleton;
|
|
static internal::atomic_ptr<const implementation> active_implementation{&detect_best_supported_implementation_on_first_use_singleton};
|
|
return active_implementation;
|
|
}
|
|
|
|
// Forwards validate_utf8 to the currently active implementation.
simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept {
  auto &active = get_active_implementation();
  return active->validate_utf8(buf, len);
}
|
|
// Forwards validate_utf8_with_errors to the currently active implementation.
simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) noexcept {
  auto &active = get_active_implementation();
  return active->validate_utf8_with_errors(buf, len);
}
|
|
// Forwards validate_ascii to the currently active implementation.
simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept {
  auto &active = get_active_implementation();
  return active->validate_ascii(buf, len);
}
|
|
// Forwards validate_ascii_with_errors to the currently active implementation.
simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) noexcept {
  auto &active = get_active_implementation();
  return active->validate_ascii_with_errors(buf, len);
}
|
|
// Native-endian entry point: resolved at compile time to the big-endian or
// little-endian routine depending on SIMDUTF_IS_BIG_ENDIAN.
simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf8_to_utf16be(input, length, utf16_output);
#else
  return convert_utf8_to_utf16le(input, length, utf16_output);
#endif
}
|
|
// Forwards convert_utf8_to_utf16le to the currently active implementation.
simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept {
  auto &active = get_active_implementation();
  return active->convert_utf8_to_utf16le(input, length, utf16_output);
}
|
|
// Forwards convert_utf8_to_utf16be to the currently active implementation.
simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept {
  auto &active = get_active_implementation();
  return active->convert_utf8_to_utf16be(input, length, utf16_output);
}
|
|
// Native-endian entry point: resolved at compile time to the big-endian or
// little-endian routine depending on SIMDUTF_IS_BIG_ENDIAN.
simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
#else
  return convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
#endif
}
|
|
// Forwards convert_utf8_to_utf16le_with_errors to the currently active implementation.
simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept {
  auto &active = get_active_implementation();
  return active->convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
}
|
|
// Forwards convert_utf8_to_utf16be_with_errors to the currently active implementation.
simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept {
  auto &active = get_active_implementation();
  return active->convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
}
|
|
// Forwards convert_utf8_to_utf32 to the currently active implementation.
simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) noexcept {
  auto &active = get_active_implementation();
  return active->convert_utf8_to_utf32(input, length, utf32_output);
}
|
|
// Forwards convert_utf8_to_utf32_with_errors to the currently active implementation.
simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept {
  auto &active = get_active_implementation();
  return active->convert_utf8_to_utf32_with_errors(input, length, utf32_output);
}
|
|
// Native-endian entry point: resolved at compile time to the big-endian or
// little-endian routine depending on SIMDUTF_IS_BIG_ENDIAN.
simdutf_warn_unused bool validate_utf16(const char16_t * buf, size_t len) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return validate_utf16be(buf, len);
#else
  return validate_utf16le(buf, len);
#endif
}
|
|
// Forwards validate_utf16le to the currently active implementation.
simdutf_warn_unused bool validate_utf16le(const char16_t * buf, size_t len) noexcept {
  auto &active = get_active_implementation();
  return active->validate_utf16le(buf, len);
}
|
|
// Forwards validate_utf16be to the currently active implementation.
simdutf_warn_unused bool validate_utf16be(const char16_t * buf, size_t len) noexcept {
  auto &active = get_active_implementation();
  return active->validate_utf16be(buf, len);
}
|
|
// Native-endian entry point: resolved at compile time to the big-endian or
// little-endian routine depending on SIMDUTF_IS_BIG_ENDIAN.
simdutf_warn_unused result validate_utf16_with_errors(const char16_t * buf, size_t len) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return validate_utf16be_with_errors(buf, len);
#else
  return validate_utf16le_with_errors(buf, len);
#endif
}
|
|
// Forwards validate_utf16le_with_errors to the currently active implementation.
simdutf_warn_unused result validate_utf16le_with_errors(const char16_t * buf, size_t len) noexcept {
  auto &active = get_active_implementation();
  return active->validate_utf16le_with_errors(buf, len);
}
|
|
// Forwards validate_utf16be_with_errors to the currently active implementation.
simdutf_warn_unused result validate_utf16be_with_errors(const char16_t * buf, size_t len) noexcept {
  auto &active = get_active_implementation();
  return active->validate_utf16be_with_errors(buf, len);
}
|
|
// Forwards validate_utf32 to the currently active implementation.
simdutf_warn_unused bool validate_utf32(const char32_t * buf, size_t len) noexcept {
  auto &active = get_active_implementation();
  return active->validate_utf32(buf, len);
}
|
|
// Forwards validate_utf32_with_errors to the currently active implementation.
simdutf_warn_unused result validate_utf32_with_errors(const char32_t * buf, size_t len) noexcept {
  auto &active = get_active_implementation();
  return active->validate_utf32_with_errors(buf, len);
}
|
|
// Native-endian entry point: resolved at compile time to the big-endian or
// little-endian routine depending on SIMDUTF_IS_BIG_ENDIAN.
simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_valid_utf8_to_utf16be(input, length, utf16_buffer);
#else
  return convert_valid_utf8_to_utf16le(input, length, utf16_buffer);
#endif
}
|
|
// Forwards convert_valid_utf8_to_utf16le to the currently active implementation.
simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) noexcept {
  auto &active = get_active_implementation();
  return active->convert_valid_utf8_to_utf16le(input, length, utf16_buffer);
}
|
|
// Forwards convert_valid_utf8_to_utf16be to the currently active implementation.
simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) noexcept {
  auto &active = get_active_implementation();
  return active->convert_valid_utf8_to_utf16be(input, length, utf16_buffer);
}
|
|
// Forwards convert_valid_utf8_to_utf32 to the currently active implementation.
simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept {
  auto &active = get_active_implementation();
  return active->convert_valid_utf8_to_utf32(input, length, utf32_buffer);
}
|
|
// Native-endian entry point: resolved at compile time to the big-endian or
// little-endian routine depending on SIMDUTF_IS_BIG_ENDIAN.
simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf16be_to_utf8(buf, len, utf8_buffer);
#else
  return convert_utf16le_to_utf8(buf, len, utf8_buffer);
#endif
}
|
|
// Forwards convert_utf16le_to_utf8 to the currently active implementation.
simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
  auto &active = get_active_implementation();
  return active->convert_utf16le_to_utf8(buf, len, utf8_buffer);
}
|
|
// Forwards convert_utf16be_to_utf8 to the currently active implementation.
simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
  auto &active = get_active_implementation();
  return active->convert_utf16be_to_utf8(buf, len, utf8_buffer);
}
|
|
// Native-endian entry point: resolved at compile time to the big-endian or
// little-endian routine depending on SIMDUTF_IS_BIG_ENDIAN.
simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer);
#else
  return convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer);
#endif
}
|
|
// Forwards convert_utf16le_to_utf8_with_errors to the currently active implementation.
simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
  auto &active = get_active_implementation();
  return active->convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer);
}
|
|
// Forwards convert_utf16be_to_utf8_with_errors to the currently active implementation.
simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
  auto &active = get_active_implementation();
  return active->convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer);
}
|
|
// Native-endian entry point: resolved at compile time to the big-endian or
// little-endian routine depending on SIMDUTF_IS_BIG_ENDIAN.
simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_valid_utf16be_to_utf8(buf, len, utf8_buffer);
#else
  return convert_valid_utf16le_to_utf8(buf, len, utf8_buffer);
#endif
}
|
|
// Forwards convert_valid_utf16le_to_utf8 to the currently active implementation.
simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
  auto &active = get_active_implementation();
  return active->convert_valid_utf16le_to_utf8(buf, len, utf8_buffer);
}
|
|
// Forwards convert_valid_utf16be_to_utf8 to the currently active implementation.
simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
  auto &active = get_active_implementation();
  return active->convert_valid_utf16be_to_utf8(buf, len, utf8_buffer);
}
|
|
// Forwards convert_utf32_to_utf8 to the currently active implementation.
simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) noexcept {
  auto &active = get_active_implementation();
  return active->convert_utf32_to_utf8(buf, len, utf8_buffer);
}
|
|
// Forwards convert_utf32_to_utf8_with_errors to the currently active implementation.
simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) noexcept {
  auto &active = get_active_implementation();
  return active->convert_utf32_to_utf8_with_errors(buf, len, utf8_buffer);
}
|
|
// Forwards convert_valid_utf32_to_utf8 to the currently active implementation.
simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) noexcept {
  auto &active = get_active_implementation();
  return active->convert_valid_utf32_to_utf8(buf, len, utf8_buffer);
}
|
|
// Native-endian entry point: resolved at compile time to the big-endian or
// little-endian routine depending on SIMDUTF_IS_BIG_ENDIAN.
simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf32_to_utf16be(buf, len, utf16_buffer);
#else
  return convert_utf32_to_utf16le(buf, len, utf16_buffer);
#endif
}
|
|
// Forwards convert_utf32_to_utf16le to the currently active implementation.
simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
  auto &active = get_active_implementation();
  return active->convert_utf32_to_utf16le(buf, len, utf16_buffer);
}
|
|
// Forwards convert_utf32_to_utf16be to the currently active implementation.
simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
  auto &active = get_active_implementation();
  return active->convert_utf32_to_utf16be(buf, len, utf16_buffer);
}
|
|
// Native-endian entry point: resolved at compile time to the big-endian or
// little-endian routine depending on SIMDUTF_IS_BIG_ENDIAN.
simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer);
#else
  return convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer);
#endif
}
|
|
// Forwards convert_utf32_to_utf16le_with_errors to the currently active implementation.
simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
  auto &active = get_active_implementation();
  return active->convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer);
}
|
|
// Forwards convert_utf32_to_utf16be_with_errors to the currently active implementation.
simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
  auto &active = get_active_implementation();
  return active->convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer);
}
|
|
// Native-endian entry point: resolved at compile time to the big-endian or
// little-endian routine depending on SIMDUTF_IS_BIG_ENDIAN.
simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_valid_utf32_to_utf16be(buf, len, utf16_buffer);
#else
  return convert_valid_utf32_to_utf16le(buf, len, utf16_buffer);
#endif
}
|
|
// Forwards convert_valid_utf32_to_utf16le to the currently active implementation.
simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
  auto &active = get_active_implementation();
  return active->convert_valid_utf32_to_utf16le(buf, len, utf16_buffer);
}
|
|
// Forwards convert_valid_utf32_to_utf16be to the currently active implementation.
simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
  auto &active = get_active_implementation();
  return active->convert_valid_utf32_to_utf16be(buf, len, utf16_buffer);
}
|
|
// Native-endian entry point: resolved at compile time to the big-endian or
// little-endian routine depending on SIMDUTF_IS_BIG_ENDIAN.
simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf16be_to_utf32(buf, len, utf32_buffer);
#else
  return convert_utf16le_to_utf32(buf, len, utf32_buffer);
#endif
}
|
|
// Forwards convert_utf16le_to_utf32 to the currently active implementation.
simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
  auto &active = get_active_implementation();
  return active->convert_utf16le_to_utf32(buf, len, utf32_buffer);
}
|
|
// Forwards convert_utf16be_to_utf32 to the currently active implementation.
simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
  auto &active = get_active_implementation();
  return active->convert_utf16be_to_utf32(buf, len, utf32_buffer);
}
|
|
// Native-endian entry point: resolved at compile time to the big-endian or
// little-endian routine depending on SIMDUTF_IS_BIG_ENDIAN.
simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer);
#else
  return convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer);
#endif
}
|
|
// Forwards convert_utf16le_to_utf32_with_errors to the currently active implementation.
simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
  auto &active = get_active_implementation();
  return active->convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer);
}
|
|
// Forwards convert_utf16be_to_utf32_with_errors to the currently active implementation.
simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
  auto &active = get_active_implementation();
  return active->convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer);
}
|
|
// Native-endian entry point: resolved at compile time to the big-endian or
// little-endian routine depending on SIMDUTF_IS_BIG_ENDIAN.
simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return convert_valid_utf16be_to_utf32(buf, len, utf32_buffer);
#else
  return convert_valid_utf16le_to_utf32(buf, len, utf32_buffer);
#endif
}
|
|
// Forwards convert_valid_utf16le_to_utf32 to the currently active implementation.
simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
  auto &active = get_active_implementation();
  return active->convert_valid_utf16le_to_utf32(buf, len, utf32_buffer);
}
|
|
// Forwards convert_valid_utf16be_to_utf32 to the currently active implementation.
simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
  auto &active = get_active_implementation();
  return active->convert_valid_utf16be_to_utf32(buf, len, utf32_buffer);
}
|
|
// Forwards change_endianness_utf16 to the currently active implementation.
void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) noexcept {
  auto &active = get_active_implementation();
  active->change_endianness_utf16(input, length, output);
}
|
|
// Native-endian entry point: resolved at compile time to the big-endian or
// little-endian routine depending on SIMDUTF_IS_BIG_ENDIAN.
simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return count_utf16be(input, length);
#else
  return count_utf16le(input, length);
#endif
}
|
|
// Forwards count_utf16le to the currently active implementation.
simdutf_warn_unused size_t count_utf16le(const char16_t * input, size_t length) noexcept {
  auto &active = get_active_implementation();
  return active->count_utf16le(input, length);
}
|
|
// Forwards count_utf16be to the currently active implementation.
simdutf_warn_unused size_t count_utf16be(const char16_t * input, size_t length) noexcept {
  auto &active = get_active_implementation();
  return active->count_utf16be(input, length);
}
|
|
// Forwards count_utf8 to the currently active implementation.
simdutf_warn_unused size_t count_utf8(const char * input, size_t length) noexcept {
  auto &active = get_active_implementation();
  return active->count_utf8(input, length);
}
|
|
// Native-endian entry point: resolved at compile time to the big-endian or
// little-endian routine depending on SIMDUTF_IS_BIG_ENDIAN.
simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return utf8_length_from_utf16be(input, length);
#else
  return utf8_length_from_utf16le(input, length);
#endif
}
|
|
// Forwards utf8_length_from_utf16le to the currently active implementation.
simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) noexcept {
  auto &active = get_active_implementation();
  return active->utf8_length_from_utf16le(input, length);
}
|
|
// Forwards utf8_length_from_utf16be to the currently active implementation.
simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) noexcept {
  auto &active = get_active_implementation();
  return active->utf8_length_from_utf16be(input, length);
}
|
|
// Native-endian entry point: resolved at compile time to the big-endian or
// little-endian routine depending on SIMDUTF_IS_BIG_ENDIAN.
simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t * input, size_t length) noexcept {
#if SIMDUTF_IS_BIG_ENDIAN
  return utf32_length_from_utf16be(input, length);
#else
  return utf32_length_from_utf16le(input, length);
#endif
}
|
|
// Forwards utf32_length_from_utf16le to the currently active implementation.
simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) noexcept {
  auto &active = get_active_implementation();
  return active->utf32_length_from_utf16le(input, length);
}
|
|
// Forwards utf32_length_from_utf16be to the currently active implementation.
simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) noexcept {
  auto &active = get_active_implementation();
  return active->utf32_length_from_utf16be(input, length);
}
|
|
// Forwards utf16_length_from_utf8 to the currently active implementation.
simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept {
  auto &active = get_active_implementation();
  return active->utf16_length_from_utf8(input, length);
}
|
|
// Forwards utf8_length_from_utf32 to the currently active implementation.
simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) noexcept {
  auto &active = get_active_implementation();
  return active->utf8_length_from_utf32(input, length);
}
|
|
// Forwards utf16_length_from_utf32 to the currently active implementation.
simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) noexcept {
  auto &active = get_active_implementation();
  return active->utf16_length_from_utf32(input, length);
}
|
|
// Forwards utf32_length_from_utf8 to the currently active implementation.
simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) noexcept {
  auto &active = get_active_implementation();
  return active->utf32_length_from_utf8(input, length);
}
|
|
// Forwards autodetect_encoding to the currently active implementation.
simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * buf, size_t length) noexcept {
  auto &active = get_active_implementation();
  return active->autodetect_encoding(buf, length);
}
|
|
// Forwards detect_encodings to the currently active implementation.
simdutf_warn_unused int detect_encodings(const char * buf, size_t length) noexcept {
  auto &active = get_active_implementation();
  return active->detect_encodings(buf, length);
}
|
|
|
|
// Looks up the implementation named by SIMDUTF_BUILTIN_IMPLEMENTATION in the
// list of available implementations. The lookup result is kept in a
// function-local static, so it is performed only on the first call.
const implementation * builtin_implementation() {
  static const implementation * cached_builtin =
      get_available_implementations()[SIMDUTF_STRINGIFY(SIMDUTF_BUILTIN_IMPLEMENTATION)];
  return cached_builtin;
}
|
|
|
|
|
|
} // namespace simdutf
|
|
|
|
/* end file src/implementation.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=encoding_types.cpp
|
|
/* begin file src/encoding_types.cpp */
|
|
|
|
namespace simdutf {
|
|
// True when `e` equals the byte order selected at compile time by SIMDUTF_IS_BIG_ENDIAN.
bool match_system(endianness e) {
#if SIMDUTF_IS_BIG_ENDIAN
  const endianness native = endianness::BIG;
#else
  const endianness native = endianness::LITTLE;
#endif
  return e == native;
}
|
|
|
|
// Maps an encoding_type value to its display name; any value without a
// dedicated case yields "error".
std::string to_string(encoding_type bom) {
  const char *label;
  switch (bom) {
    case UTF16_LE:    label = "UTF16 little-endian"; break;
    case UTF16_BE:    label = "UTF16 big-endian";    break;
    case UTF32_LE:    label = "UTF32 little-endian"; break;
    case UTF32_BE:    label = "UTF32 big-endian";    break;
    case UTF8:        label = "UTF8";                break;
    case unspecified: label = "unknown";             break;
    default:          label = "error";               break;
  }
  return label;
}
|
|
|
|
namespace BOM {
|
|
// Note that BOM for UTF8 is discouraged.
|
|
encoding_type check_bom(const uint8_t* byte, size_t length) {
|
|
if (length >= 2 && byte[0] == 0xff and byte[1] == 0xfe) {
|
|
if (length >= 4 && byte[2] == 0x00 and byte[3] == 0x0) {
|
|
return encoding_type::UTF32_LE;
|
|
} else {
|
|
return encoding_type::UTF16_LE;
|
|
}
|
|
} else if (length >= 2 && byte[0] == 0xfe and byte[1] == 0xff) {
|
|
return encoding_type::UTF16_BE;
|
|
} else if (length >= 4 && byte[0] == 0x00 and byte[1] == 0x00 and byte[2] == 0xfe and byte[3] == 0xff) {
|
|
return encoding_type::UTF32_BE;
|
|
} else if (length >= 4 && byte[0] == 0xef and byte[1] == 0xbb and byte[3] == 0xbf) {
|
|
return encoding_type::UTF8;
|
|
}
|
|
return encoding_type::unspecified;
|
|
}
|
|
|
|
// Convenience overload: reinterprets the char buffer as bytes and delegates
// to the uint8_t overload.
encoding_type check_bom(const char* byte, size_t length) {
  const uint8_t *raw_bytes = reinterpret_cast<const uint8_t*>(byte);
  return check_bom(raw_bytes, length);
}
|
|
|
|
// Returns the byte-order-mark length, in bytes, for the given encoding:
// 2 for either UTF-16 flavour, 4 for either UTF-32 flavour, 3 for UTF-8,
// and 0 for `unspecified` or any other value.
size_t bom_byte_size(encoding_type bom) {
  switch (bom) {
    case UTF16_LE:
    case UTF16_BE:
      return 2;
    case UTF32_LE:
    case UTF32_BE:
      return 4;
    case UTF8:
      return 3;
    case unspecified:
    default:
      return 0;
  }
}
|
|
|
|
}
|
|
}
|
|
/* end file src/encoding_types.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=error.cpp
|
|
/* begin file src/error.cpp */
|
|
namespace simdutf {
|
|
|
|
simdutf_really_inline result::result() : error{error_code::SUCCESS}, count{0} {};
|
|
|
|
simdutf_really_inline result::result(error_code _err, size_t _pos) : error{_err}, count{_pos} {};
|
|
|
|
}
|
|
/* end file src/error.cpp */
|
|
// The large tables should be included once and they
|
|
// should not depend on a kernel.
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=tables/utf8_to_utf16_tables.h
|
|
/* begin file src/tables/utf8_to_utf16_tables.h */
|
|
#ifndef SIMDUTF_UTF8_TO_UTF16_TABLES_H
|
|
#define SIMDUTF_UTF8_TO_UTF16_TABLES_H
|
|
#include <cstdint>
|
|
|
|
namespace simdutf {
|
|
namespace {
|
|
namespace tables {
|
|
namespace utf8_to_utf16 {
|
|
/**
|
|
* utf8bigindex uses about 8 kB
|
|
* shufutf8 uses about 3344 B
|
|
*
|
|
* So we use a bit over 11 kB. It would be
|
|
* easy to save about 4 kB by only
|
|
* storing the index in utf8bigindex, and
|
|
* deriving the consumed bytes otherwise.
|
|
* However, this may come at a significant (10% to 20%)
|
|
* performance penalty.
|
|
*/
|
|
|
|
// shufutf8: 209 rows of 16 byte values (3344 bytes in total).
//
// Every entry is either 255 or a small byte offset in the range 0..11.
// NOTE(review): the name suggests these rows are byte-shuffle masks and that
// 255 selects a zero output byte -- confirm against the kernels that load
// this table.
//
// The rows fall into three groups (the trailing number on each row is its
// index in the table):
//   rows   0.. 63  six 2-byte lanes in bytes 0..11, bytes 12..15 are 0;
//                  a lane is {i, 255} or {i+1, i}                 (2^6 rows)
//   rows  64..144  four 4-byte lanes; a lane is {i, 255, 255, 255},
//                  {i+1, i, 255, 255} or {i+2, i+1, i, 255}       (3^4 rows)
//   rows 145..208  three 4-byte lanes in bytes 0..11, bytes 12..15 are 0;
//                  as above, plus the lane {i+3, i+2, i+1, i}     (4^3 rows)
// Presumably each lane stands for one code point that occupies that many
// input bytes -- verify against the callers.
const uint8_t shufutf8[209][16] = {
    /* ---- rows 0..63: six 2-byte lanes ---- */
    {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 0, 0, 0, 0},   // 0
    {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 6, 5, 0, 0, 0, 0},     // 1
    {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 6, 255, 0, 0, 0, 0},     // 2
    {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 7, 6, 0, 0, 0, 0},       // 3
    {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 6, 255, 0, 0, 0, 0},     // 4
    {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 7, 6, 0, 0, 0, 0},       // 5
    {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 7, 255, 0, 0, 0, 0},       // 6
    {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 8, 7, 0, 0, 0, 0},         // 7
    {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},     // 8
    {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},       // 9
    {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},       // 10
    {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},         // 11
    {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},       // 12
    {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},         // 13
    {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},         // 14
    {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},           // 15
    {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},     // 16
    {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},       // 17
    {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},       // 18
    {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},         // 19
    {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},       // 20
    {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},         // 21
    {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},         // 22
    {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},           // 23
    {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},       // 24
    {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},         // 25
    {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},         // 26
    {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},           // 27
    {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},         // 28
    {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},           // 29
    {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},           // 30
    {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},            // 31
    {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},     // 32
    {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},       // 33
    {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},       // 34
    {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},         // 35
    {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},       // 36
    {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},         // 37
    {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},         // 38
    {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},           // 39
    {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},       // 40
    {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},         // 41
    {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},         // 42
    {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},           // 43
    {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},         // 44
    {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},           // 45
    {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},           // 46
    {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},            // 47
    {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},       // 48
    {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},         // 49
    {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},         // 50
    {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},           // 51
    {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},         // 52
    {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},           // 53
    {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},           // 54
    {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},            // 55
    {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 8, 255, 0, 0, 0, 0},         // 56
    {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 9, 8, 0, 0, 0, 0},           // 57
    {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 9, 255, 0, 0, 0, 0},           // 58
    {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 10, 9, 0, 0, 0, 0},            // 59
    {1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 9, 255, 0, 0, 0, 0},           // 60
    {1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 10, 9, 0, 0, 0, 0},            // 61
    {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 255, 0, 0, 0, 0},            // 62
    {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 0, 0, 0, 0},             // 63

    /* ---- rows 64..144: four 4-byte lanes ---- */
    {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255},  // 64
    {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255},    // 65
    {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255},      // 66
    {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255},    // 67
    {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255},      // 68
    {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255},        // 69
    {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255},      // 70
    {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255},        // 71
    {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255},          // 72
    {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255},    // 73
    {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255},      // 74
    {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255},        // 75
    {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255},      // 76
    {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255},        // 77
    {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255},          // 78
    {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255},        // 79
    {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255},          // 80
    {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255},            // 81
    {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 5, 255, 255, 255},      // 82
    {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 6, 5, 255, 255},        // 83
    {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 7, 6, 5, 255},          // 84
    {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 6, 255, 255, 255},        // 85
    {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 7, 6, 255, 255},          // 86
    {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 8, 7, 6, 255},            // 87
    {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 7, 255, 255, 255},          // 88
    {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 8, 7, 255, 255},            // 89
    {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 9, 8, 7, 255},              // 90
    {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255},    // 91
    {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255},      // 92
    {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255},        // 93
    {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255},      // 94
    {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255},        // 95
    {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255},          // 96
    {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255},        // 97
    {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255},          // 98
    {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255},            // 99
    {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255},      // 100
    {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255},        // 101
    {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255},          // 102
    {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255},        // 103
    {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255},          // 104
    {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255},            // 105
    {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255},          // 106
    {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255},            // 107
    {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255},              // 108
    {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 6, 255, 255, 255},        // 109
    {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 7, 6, 255, 255},          // 110
    {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 8, 7, 6, 255},            // 111
    {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 7, 255, 255, 255},          // 112
    {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 8, 7, 255, 255},            // 113
    {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 9, 8, 7, 255},              // 114
    {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 8, 255, 255, 255},            // 115
    {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 9, 8, 255, 255},              // 116
    {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 10, 9, 8, 255},               // 117
    {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255},      // 118
    {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255},        // 119
    {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255},          // 120
    {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255},        // 121
    {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255},          // 122
    {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255},            // 123
    {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255},          // 124
    {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255},            // 125
    {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255},              // 126
    {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255},        // 127
    {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 7, 6, 255, 255},          // 128
    {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 8, 7, 6, 255},            // 129
    {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 7, 255, 255, 255},          // 130
    {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 8, 7, 255, 255},            // 131
    {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 9, 8, 7, 255},              // 132
    {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 8, 255, 255, 255},            // 133
    {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 9, 8, 255, 255},              // 134
    {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 10, 9, 8, 255},               // 135
    {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 7, 255, 255, 255},          // 136
    {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 8, 7, 255, 255},            // 137
    {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 9, 8, 7, 255},              // 138
    {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 8, 255, 255, 255},            // 139
    {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 9, 8, 255, 255},              // 140
    {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 10, 9, 8, 255},               // 141
    {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 9, 255, 255, 255},              // 142
    {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 10, 9, 255, 255},               // 143
    {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255},                // 144

    /* ---- rows 145..208: three 4-byte lanes ---- */
    {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 0, 0, 0, 0},  // 145
    {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 0, 0, 0, 0},    // 146
    {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 0, 0, 0, 0},      // 147
    {0, 255, 255, 255, 1, 255, 255, 255, 5, 4, 3, 2, 0, 0, 0, 0},        // 148
    {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0},    // 149
    {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0},      // 150
    {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0},        // 151
    {0, 255, 255, 255, 2, 1, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0},          // 152
    {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 0, 0, 0, 0},      // 153
    {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 0, 0, 0, 0},        // 154
    {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 0, 0, 0, 0},          // 155
    {0, 255, 255, 255, 3, 2, 1, 255, 7, 6, 5, 4, 0, 0, 0, 0},            // 156
    {0, 255, 255, 255, 4, 3, 2, 1, 5, 255, 255, 255, 0, 0, 0, 0},        // 157
    {0, 255, 255, 255, 4, 3, 2, 1, 6, 5, 255, 255, 0, 0, 0, 0},          // 158
    {0, 255, 255, 255, 4, 3, 2, 1, 7, 6, 5, 255, 0, 0, 0, 0},            // 159
    {0, 255, 255, 255, 4, 3, 2, 1, 8, 7, 6, 5, 0, 0, 0, 0},              // 160
    {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0},    // 161
    {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0},      // 162
    {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0},        // 163
    {1, 0, 255, 255, 2, 255, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0},          // 164
    {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0},      // 165
    {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0},        // 166
    {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0},          // 167
    {1, 0, 255, 255, 3, 2, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0},            // 168
    {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 0, 0, 0, 0},        // 169
    {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 0, 0, 0, 0},          // 170
    {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 0, 0, 0, 0},            // 171
    {1, 0, 255, 255, 4, 3, 2, 255, 8, 7, 6, 5, 0, 0, 0, 0},              // 172
    {1, 0, 255, 255, 5, 4, 3, 2, 6, 255, 255, 255, 0, 0, 0, 0},          // 173
    {1, 0, 255, 255, 5, 4, 3, 2, 7, 6, 255, 255, 0, 0, 0, 0},            // 174
    {1, 0, 255, 255, 5, 4, 3, 2, 8, 7, 6, 255, 0, 0, 0, 0},              // 175
    {1, 0, 255, 255, 5, 4, 3, 2, 9, 8, 7, 6, 0, 0, 0, 0},                // 176
    {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0},      // 177
    {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0},        // 178
    {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0},          // 179
    {2, 1, 0, 255, 3, 255, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0},            // 180
    {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0},        // 181
    {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0},          // 182
    {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0},            // 183
    {2, 1, 0, 255, 4, 3, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0},              // 184
    {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 0, 0, 0, 0},          // 185
    {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 0, 0, 0, 0},            // 186
    {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 0, 0, 0, 0},              // 187
    {2, 1, 0, 255, 5, 4, 3, 255, 9, 8, 7, 6, 0, 0, 0, 0},                // 188
    {2, 1, 0, 255, 6, 5, 4, 3, 7, 255, 255, 255, 0, 0, 0, 0},            // 189
    {2, 1, 0, 255, 6, 5, 4, 3, 8, 7, 255, 255, 0, 0, 0, 0},              // 190
    {2, 1, 0, 255, 6, 5, 4, 3, 9, 8, 7, 255, 0, 0, 0, 0},                // 191
    {2, 1, 0, 255, 6, 5, 4, 3, 10, 9, 8, 7, 0, 0, 0, 0},                 // 192
    {3, 2, 1, 0, 4, 255, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0},        // 193
    {3, 2, 1, 0, 4, 255, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0},          // 194
    {3, 2, 1, 0, 4, 255, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0},            // 195
    {3, 2, 1, 0, 4, 255, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0},              // 196
    {3, 2, 1, 0, 5, 4, 255, 255, 6, 255, 255, 255, 0, 0, 0, 0},          // 197
    {3, 2, 1, 0, 5, 4, 255, 255, 7, 6, 255, 255, 0, 0, 0, 0},            // 198
    {3, 2, 1, 0, 5, 4, 255, 255, 8, 7, 6, 255, 0, 0, 0, 0},              // 199
    {3, 2, 1, 0, 5, 4, 255, 255, 9, 8, 7, 6, 0, 0, 0, 0},                // 200
    {3, 2, 1, 0, 6, 5, 4, 255, 7, 255, 255, 255, 0, 0, 0, 0},            // 201
    {3, 2, 1, 0, 6, 5, 4, 255, 8, 7, 255, 255, 0, 0, 0, 0},              // 202
    {3, 2, 1, 0, 6, 5, 4, 255, 9, 8, 7, 255, 0, 0, 0, 0},                // 203
    {3, 2, 1, 0, 6, 5, 4, 255, 10, 9, 8, 7, 0, 0, 0, 0},                 // 204
    {3, 2, 1, 0, 7, 6, 5, 4, 8, 255, 255, 255, 0, 0, 0, 0},              // 205
    {3, 2, 1, 0, 7, 6, 5, 4, 9, 8, 255, 255, 0, 0, 0, 0},                // 206
    {3, 2, 1, 0, 7, 6, 5, 4, 10, 9, 8, 255, 0, 0, 0, 0},                 // 207
    {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 0, 0, 0, 0}                   // 208
};
|
|
/* number of two bytes : 64 */
|
|
/* number of two + three bytes : 145 */
|
|
/* number of two + three + four bytes : 209 */
|
|
const uint8_t utf8bigindex[4096][2] =
|
|
{ {209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{146, 4},
|
|
{209, 12},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{147, 5},
|
|
{209, 12},
|
|
{150, 5},
|
|
{162, 5},
|
|
{65, 5},
|
|
{209, 12},
|
|
{153, 5},
|
|
{165, 5},
|
|
{67, 5},
|
|
{177, 5},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{148, 6},
|
|
{209, 12},
|
|
{151, 6},
|
|
{163, 6},
|
|
{66, 6},
|
|
{209, 12},
|
|
{154, 6},
|
|
{166, 6},
|
|
{68, 6},
|
|
{178, 6},
|
|
{74, 6},
|
|
{92, 6},
|
|
{64, 4},
|
|
{209, 12},
|
|
{157, 6},
|
|
{169, 6},
|
|
{70, 6},
|
|
{181, 6},
|
|
{76, 6},
|
|
{94, 6},
|
|
{65, 5},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{67, 5},
|
|
{118, 6},
|
|
{73, 5},
|
|
{91, 5},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{209, 12},
|
|
{155, 7},
|
|
{167, 7},
|
|
{69, 7},
|
|
{179, 7},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{170, 7},
|
|
{71, 7},
|
|
{182, 7},
|
|
{77, 7},
|
|
{95, 7},
|
|
{65, 5},
|
|
{194, 7},
|
|
{83, 7},
|
|
{101, 7},
|
|
{67, 5},
|
|
{119, 7},
|
|
{73, 5},
|
|
{91, 5},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{185, 7},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{103, 7},
|
|
{68, 6},
|
|
{121, 7},
|
|
{74, 6},
|
|
{92, 6},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{76, 6},
|
|
{94, 6},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{209, 12},
|
|
{156, 8},
|
|
{168, 8},
|
|
{146, 4},
|
|
{180, 8},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{159, 8},
|
|
{171, 8},
|
|
{72, 8},
|
|
{183, 8},
|
|
{78, 8},
|
|
{96, 8},
|
|
{65, 5},
|
|
{195, 8},
|
|
{84, 8},
|
|
{102, 8},
|
|
{67, 5},
|
|
{120, 8},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{174, 8},
|
|
{148, 6},
|
|
{186, 8},
|
|
{80, 8},
|
|
{98, 8},
|
|
{66, 6},
|
|
{198, 8},
|
|
{86, 8},
|
|
{104, 8},
|
|
{68, 6},
|
|
{122, 8},
|
|
{74, 6},
|
|
{92, 6},
|
|
{3, 8},
|
|
{209, 12},
|
|
{157, 6},
|
|
{110, 8},
|
|
{70, 6},
|
|
{128, 8},
|
|
{76, 6},
|
|
{94, 6},
|
|
{5, 8},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{9, 8},
|
|
{118, 6},
|
|
{17, 8},
|
|
{33, 8},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{189, 8},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{201, 8},
|
|
{88, 8},
|
|
{106, 8},
|
|
{69, 7},
|
|
{124, 8},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{112, 8},
|
|
{71, 7},
|
|
{130, 8},
|
|
{77, 7},
|
|
{95, 7},
|
|
{6, 8},
|
|
{194, 7},
|
|
{83, 7},
|
|
{101, 7},
|
|
{10, 8},
|
|
{119, 7},
|
|
{18, 8},
|
|
{34, 8},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{136, 8},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{103, 7},
|
|
{12, 8},
|
|
{121, 7},
|
|
{20, 8},
|
|
{36, 8},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{24, 8},
|
|
{40, 8},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{48, 8},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{146, 4},
|
|
{209, 12},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{160, 9},
|
|
{172, 9},
|
|
{147, 5},
|
|
{184, 9},
|
|
{150, 5},
|
|
{162, 5},
|
|
{65, 5},
|
|
{196, 9},
|
|
{153, 5},
|
|
{165, 5},
|
|
{67, 5},
|
|
{177, 5},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{175, 9},
|
|
{148, 6},
|
|
{187, 9},
|
|
{81, 9},
|
|
{99, 9},
|
|
{66, 6},
|
|
{199, 9},
|
|
{87, 9},
|
|
{105, 9},
|
|
{68, 6},
|
|
{123, 9},
|
|
{74, 6},
|
|
{92, 6},
|
|
{64, 4},
|
|
{209, 12},
|
|
{157, 6},
|
|
{111, 9},
|
|
{70, 6},
|
|
{129, 9},
|
|
{76, 6},
|
|
{94, 6},
|
|
{65, 5},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{67, 5},
|
|
{118, 6},
|
|
{73, 5},
|
|
{91, 5},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{190, 9},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{202, 9},
|
|
{89, 9},
|
|
{107, 9},
|
|
{69, 7},
|
|
{125, 9},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{113, 9},
|
|
{71, 7},
|
|
{131, 9},
|
|
{77, 7},
|
|
{95, 7},
|
|
{7, 9},
|
|
{194, 7},
|
|
{83, 7},
|
|
{101, 7},
|
|
{11, 9},
|
|
{119, 7},
|
|
{19, 9},
|
|
{35, 9},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{137, 9},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{103, 7},
|
|
{13, 9},
|
|
{121, 7},
|
|
{21, 9},
|
|
{37, 9},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{25, 9},
|
|
{41, 9},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{49, 9},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{205, 9},
|
|
{156, 8},
|
|
{168, 8},
|
|
{146, 4},
|
|
{180, 8},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{159, 8},
|
|
{115, 9},
|
|
{72, 8},
|
|
{133, 9},
|
|
{78, 8},
|
|
{96, 8},
|
|
{65, 5},
|
|
{195, 8},
|
|
{84, 8},
|
|
{102, 8},
|
|
{67, 5},
|
|
{120, 8},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{174, 8},
|
|
{148, 6},
|
|
{139, 9},
|
|
{80, 8},
|
|
{98, 8},
|
|
{66, 6},
|
|
{198, 8},
|
|
{86, 8},
|
|
{104, 8},
|
|
{14, 9},
|
|
{122, 8},
|
|
{22, 9},
|
|
{38, 9},
|
|
{3, 8},
|
|
{209, 12},
|
|
{157, 6},
|
|
{110, 8},
|
|
{70, 6},
|
|
{128, 8},
|
|
{26, 9},
|
|
{42, 9},
|
|
{5, 8},
|
|
{193, 6},
|
|
{82, 6},
|
|
{50, 9},
|
|
{9, 8},
|
|
{118, 6},
|
|
{17, 8},
|
|
{33, 8},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{189, 8},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{201, 8},
|
|
{88, 8},
|
|
{106, 8},
|
|
{69, 7},
|
|
{124, 8},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{112, 8},
|
|
{71, 7},
|
|
{130, 8},
|
|
{28, 9},
|
|
{44, 9},
|
|
{6, 8},
|
|
{194, 7},
|
|
{83, 7},
|
|
{52, 9},
|
|
{10, 8},
|
|
{119, 7},
|
|
{18, 8},
|
|
{34, 8},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{136, 8},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{56, 9},
|
|
{12, 8},
|
|
{121, 7},
|
|
{20, 8},
|
|
{36, 8},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{24, 8},
|
|
{40, 8},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{48, 8},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{146, 4},
|
|
{209, 12},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{147, 5},
|
|
{209, 12},
|
|
{150, 5},
|
|
{162, 5},
|
|
{65, 5},
|
|
{209, 12},
|
|
{153, 5},
|
|
{165, 5},
|
|
{67, 5},
|
|
{177, 5},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{176, 10},
|
|
{148, 6},
|
|
{188, 10},
|
|
{151, 6},
|
|
{163, 6},
|
|
{66, 6},
|
|
{200, 10},
|
|
{154, 6},
|
|
{166, 6},
|
|
{68, 6},
|
|
{178, 6},
|
|
{74, 6},
|
|
{92, 6},
|
|
{64, 4},
|
|
{209, 12},
|
|
{157, 6},
|
|
{169, 6},
|
|
{70, 6},
|
|
{181, 6},
|
|
{76, 6},
|
|
{94, 6},
|
|
{65, 5},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{67, 5},
|
|
{118, 6},
|
|
{73, 5},
|
|
{91, 5},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{191, 10},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{203, 10},
|
|
{90, 10},
|
|
{108, 10},
|
|
{69, 7},
|
|
{126, 10},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{114, 10},
|
|
{71, 7},
|
|
{132, 10},
|
|
{77, 7},
|
|
{95, 7},
|
|
{65, 5},
|
|
{194, 7},
|
|
{83, 7},
|
|
{101, 7},
|
|
{67, 5},
|
|
{119, 7},
|
|
{73, 5},
|
|
{91, 5},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{138, 10},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{103, 7},
|
|
{68, 6},
|
|
{121, 7},
|
|
{74, 6},
|
|
{92, 6},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{76, 6},
|
|
{94, 6},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{206, 10},
|
|
{156, 8},
|
|
{168, 8},
|
|
{146, 4},
|
|
{180, 8},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{159, 8},
|
|
{116, 10},
|
|
{72, 8},
|
|
{134, 10},
|
|
{78, 8},
|
|
{96, 8},
|
|
{65, 5},
|
|
{195, 8},
|
|
{84, 8},
|
|
{102, 8},
|
|
{67, 5},
|
|
{120, 8},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{174, 8},
|
|
{148, 6},
|
|
{140, 10},
|
|
{80, 8},
|
|
{98, 8},
|
|
{66, 6},
|
|
{198, 8},
|
|
{86, 8},
|
|
{104, 8},
|
|
{15, 10},
|
|
{122, 8},
|
|
{23, 10},
|
|
{39, 10},
|
|
{3, 8},
|
|
{209, 12},
|
|
{157, 6},
|
|
{110, 8},
|
|
{70, 6},
|
|
{128, 8},
|
|
{27, 10},
|
|
{43, 10},
|
|
{5, 8},
|
|
{193, 6},
|
|
{82, 6},
|
|
{51, 10},
|
|
{9, 8},
|
|
{118, 6},
|
|
{17, 8},
|
|
{33, 8},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{189, 8},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{201, 8},
|
|
{88, 8},
|
|
{106, 8},
|
|
{69, 7},
|
|
{124, 8},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{112, 8},
|
|
{71, 7},
|
|
{130, 8},
|
|
{29, 10},
|
|
{45, 10},
|
|
{6, 8},
|
|
{194, 7},
|
|
{83, 7},
|
|
{53, 10},
|
|
{10, 8},
|
|
{119, 7},
|
|
{18, 8},
|
|
{34, 8},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{136, 8},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{57, 10},
|
|
{12, 8},
|
|
{121, 7},
|
|
{20, 8},
|
|
{36, 8},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{24, 8},
|
|
{40, 8},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{48, 8},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{146, 4},
|
|
{209, 12},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{160, 9},
|
|
{172, 9},
|
|
{147, 5},
|
|
{184, 9},
|
|
{150, 5},
|
|
{162, 5},
|
|
{65, 5},
|
|
{196, 9},
|
|
{153, 5},
|
|
{165, 5},
|
|
{67, 5},
|
|
{177, 5},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{175, 9},
|
|
{148, 6},
|
|
{142, 10},
|
|
{81, 9},
|
|
{99, 9},
|
|
{66, 6},
|
|
{199, 9},
|
|
{87, 9},
|
|
{105, 9},
|
|
{68, 6},
|
|
{123, 9},
|
|
{74, 6},
|
|
{92, 6},
|
|
{64, 4},
|
|
{209, 12},
|
|
{157, 6},
|
|
{111, 9},
|
|
{70, 6},
|
|
{129, 9},
|
|
{76, 6},
|
|
{94, 6},
|
|
{65, 5},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{67, 5},
|
|
{118, 6},
|
|
{73, 5},
|
|
{91, 5},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{190, 9},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{202, 9},
|
|
{89, 9},
|
|
{107, 9},
|
|
{69, 7},
|
|
{125, 9},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{113, 9},
|
|
{71, 7},
|
|
{131, 9},
|
|
{30, 10},
|
|
{46, 10},
|
|
{7, 9},
|
|
{194, 7},
|
|
{83, 7},
|
|
{54, 10},
|
|
{11, 9},
|
|
{119, 7},
|
|
{19, 9},
|
|
{35, 9},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{137, 9},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{58, 10},
|
|
{13, 9},
|
|
{121, 7},
|
|
{21, 9},
|
|
{37, 9},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{25, 9},
|
|
{41, 9},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{49, 9},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{205, 9},
|
|
{156, 8},
|
|
{168, 8},
|
|
{146, 4},
|
|
{180, 8},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{159, 8},
|
|
{115, 9},
|
|
{72, 8},
|
|
{133, 9},
|
|
{78, 8},
|
|
{96, 8},
|
|
{65, 5},
|
|
{195, 8},
|
|
{84, 8},
|
|
{102, 8},
|
|
{67, 5},
|
|
{120, 8},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{174, 8},
|
|
{148, 6},
|
|
{139, 9},
|
|
{80, 8},
|
|
{98, 8},
|
|
{66, 6},
|
|
{198, 8},
|
|
{86, 8},
|
|
{60, 10},
|
|
{14, 9},
|
|
{122, 8},
|
|
{22, 9},
|
|
{38, 9},
|
|
{3, 8},
|
|
{209, 12},
|
|
{157, 6},
|
|
{110, 8},
|
|
{70, 6},
|
|
{128, 8},
|
|
{26, 9},
|
|
{42, 9},
|
|
{5, 8},
|
|
{193, 6},
|
|
{82, 6},
|
|
{50, 9},
|
|
{9, 8},
|
|
{118, 6},
|
|
{17, 8},
|
|
{33, 8},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{189, 8},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{201, 8},
|
|
{88, 8},
|
|
{106, 8},
|
|
{69, 7},
|
|
{124, 8},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{112, 8},
|
|
{71, 7},
|
|
{130, 8},
|
|
{28, 9},
|
|
{44, 9},
|
|
{6, 8},
|
|
{194, 7},
|
|
{83, 7},
|
|
{52, 9},
|
|
{10, 8},
|
|
{119, 7},
|
|
{18, 8},
|
|
{34, 8},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{136, 8},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{56, 9},
|
|
{12, 8},
|
|
{121, 7},
|
|
{20, 8},
|
|
{36, 8},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{24, 8},
|
|
{40, 8},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{48, 8},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{146, 4},
|
|
{209, 12},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{147, 5},
|
|
{209, 12},
|
|
{150, 5},
|
|
{162, 5},
|
|
{65, 5},
|
|
{209, 12},
|
|
{153, 5},
|
|
{165, 5},
|
|
{67, 5},
|
|
{177, 5},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{148, 6},
|
|
{209, 12},
|
|
{151, 6},
|
|
{163, 6},
|
|
{66, 6},
|
|
{209, 12},
|
|
{154, 6},
|
|
{166, 6},
|
|
{68, 6},
|
|
{178, 6},
|
|
{74, 6},
|
|
{92, 6},
|
|
{64, 4},
|
|
{209, 12},
|
|
{157, 6},
|
|
{169, 6},
|
|
{70, 6},
|
|
{181, 6},
|
|
{76, 6},
|
|
{94, 6},
|
|
{65, 5},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{67, 5},
|
|
{118, 6},
|
|
{73, 5},
|
|
{91, 5},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{192, 11},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{204, 11},
|
|
{155, 7},
|
|
{167, 7},
|
|
{69, 7},
|
|
{179, 7},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{170, 7},
|
|
{71, 7},
|
|
{182, 7},
|
|
{77, 7},
|
|
{95, 7},
|
|
{65, 5},
|
|
{194, 7},
|
|
{83, 7},
|
|
{101, 7},
|
|
{67, 5},
|
|
{119, 7},
|
|
{73, 5},
|
|
{91, 5},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{185, 7},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{103, 7},
|
|
{68, 6},
|
|
{121, 7},
|
|
{74, 6},
|
|
{92, 6},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{76, 6},
|
|
{94, 6},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{207, 11},
|
|
{156, 8},
|
|
{168, 8},
|
|
{146, 4},
|
|
{180, 8},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{159, 8},
|
|
{117, 11},
|
|
{72, 8},
|
|
{135, 11},
|
|
{78, 8},
|
|
{96, 8},
|
|
{65, 5},
|
|
{195, 8},
|
|
{84, 8},
|
|
{102, 8},
|
|
{67, 5},
|
|
{120, 8},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{174, 8},
|
|
{148, 6},
|
|
{141, 11},
|
|
{80, 8},
|
|
{98, 8},
|
|
{66, 6},
|
|
{198, 8},
|
|
{86, 8},
|
|
{104, 8},
|
|
{68, 6},
|
|
{122, 8},
|
|
{74, 6},
|
|
{92, 6},
|
|
{3, 8},
|
|
{209, 12},
|
|
{157, 6},
|
|
{110, 8},
|
|
{70, 6},
|
|
{128, 8},
|
|
{76, 6},
|
|
{94, 6},
|
|
{5, 8},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{9, 8},
|
|
{118, 6},
|
|
{17, 8},
|
|
{33, 8},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{189, 8},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{201, 8},
|
|
{88, 8},
|
|
{106, 8},
|
|
{69, 7},
|
|
{124, 8},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{112, 8},
|
|
{71, 7},
|
|
{130, 8},
|
|
{77, 7},
|
|
{95, 7},
|
|
{6, 8},
|
|
{194, 7},
|
|
{83, 7},
|
|
{101, 7},
|
|
{10, 8},
|
|
{119, 7},
|
|
{18, 8},
|
|
{34, 8},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{136, 8},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{103, 7},
|
|
{12, 8},
|
|
{121, 7},
|
|
{20, 8},
|
|
{36, 8},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{24, 8},
|
|
{40, 8},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{48, 8},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{146, 4},
|
|
{209, 12},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{160, 9},
|
|
{172, 9},
|
|
{147, 5},
|
|
{184, 9},
|
|
{150, 5},
|
|
{162, 5},
|
|
{65, 5},
|
|
{196, 9},
|
|
{153, 5},
|
|
{165, 5},
|
|
{67, 5},
|
|
{177, 5},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{175, 9},
|
|
{148, 6},
|
|
{143, 11},
|
|
{81, 9},
|
|
{99, 9},
|
|
{66, 6},
|
|
{199, 9},
|
|
{87, 9},
|
|
{105, 9},
|
|
{68, 6},
|
|
{123, 9},
|
|
{74, 6},
|
|
{92, 6},
|
|
{64, 4},
|
|
{209, 12},
|
|
{157, 6},
|
|
{111, 9},
|
|
{70, 6},
|
|
{129, 9},
|
|
{76, 6},
|
|
{94, 6},
|
|
{65, 5},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{67, 5},
|
|
{118, 6},
|
|
{73, 5},
|
|
{91, 5},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{190, 9},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{202, 9},
|
|
{89, 9},
|
|
{107, 9},
|
|
{69, 7},
|
|
{125, 9},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{113, 9},
|
|
{71, 7},
|
|
{131, 9},
|
|
{31, 11},
|
|
{47, 11},
|
|
{7, 9},
|
|
{194, 7},
|
|
{83, 7},
|
|
{55, 11},
|
|
{11, 9},
|
|
{119, 7},
|
|
{19, 9},
|
|
{35, 9},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{137, 9},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{59, 11},
|
|
{13, 9},
|
|
{121, 7},
|
|
{21, 9},
|
|
{37, 9},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{25, 9},
|
|
{41, 9},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{49, 9},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{205, 9},
|
|
{156, 8},
|
|
{168, 8},
|
|
{146, 4},
|
|
{180, 8},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{159, 8},
|
|
{115, 9},
|
|
{72, 8},
|
|
{133, 9},
|
|
{78, 8},
|
|
{96, 8},
|
|
{65, 5},
|
|
{195, 8},
|
|
{84, 8},
|
|
{102, 8},
|
|
{67, 5},
|
|
{120, 8},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{174, 8},
|
|
{148, 6},
|
|
{139, 9},
|
|
{80, 8},
|
|
{98, 8},
|
|
{66, 6},
|
|
{198, 8},
|
|
{86, 8},
|
|
{61, 11},
|
|
{14, 9},
|
|
{122, 8},
|
|
{22, 9},
|
|
{38, 9},
|
|
{3, 8},
|
|
{209, 12},
|
|
{157, 6},
|
|
{110, 8},
|
|
{70, 6},
|
|
{128, 8},
|
|
{26, 9},
|
|
{42, 9},
|
|
{5, 8},
|
|
{193, 6},
|
|
{82, 6},
|
|
{50, 9},
|
|
{9, 8},
|
|
{118, 6},
|
|
{17, 8},
|
|
{33, 8},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{189, 8},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{201, 8},
|
|
{88, 8},
|
|
{106, 8},
|
|
{69, 7},
|
|
{124, 8},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{112, 8},
|
|
{71, 7},
|
|
{130, 8},
|
|
{28, 9},
|
|
{44, 9},
|
|
{6, 8},
|
|
{194, 7},
|
|
{83, 7},
|
|
{52, 9},
|
|
{10, 8},
|
|
{119, 7},
|
|
{18, 8},
|
|
{34, 8},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{136, 8},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{56, 9},
|
|
{12, 8},
|
|
{121, 7},
|
|
{20, 8},
|
|
{36, 8},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{24, 8},
|
|
{40, 8},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{48, 8},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{146, 4},
|
|
{209, 12},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{147, 5},
|
|
{209, 12},
|
|
{150, 5},
|
|
{162, 5},
|
|
{65, 5},
|
|
{209, 12},
|
|
{153, 5},
|
|
{165, 5},
|
|
{67, 5},
|
|
{177, 5},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{176, 10},
|
|
{148, 6},
|
|
{188, 10},
|
|
{151, 6},
|
|
{163, 6},
|
|
{66, 6},
|
|
{200, 10},
|
|
{154, 6},
|
|
{166, 6},
|
|
{68, 6},
|
|
{178, 6},
|
|
{74, 6},
|
|
{92, 6},
|
|
{64, 4},
|
|
{209, 12},
|
|
{157, 6},
|
|
{169, 6},
|
|
{70, 6},
|
|
{181, 6},
|
|
{76, 6},
|
|
{94, 6},
|
|
{65, 5},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{67, 5},
|
|
{118, 6},
|
|
{73, 5},
|
|
{91, 5},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{191, 10},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{203, 10},
|
|
{90, 10},
|
|
{108, 10},
|
|
{69, 7},
|
|
{126, 10},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{114, 10},
|
|
{71, 7},
|
|
{132, 10},
|
|
{77, 7},
|
|
{95, 7},
|
|
{65, 5},
|
|
{194, 7},
|
|
{83, 7},
|
|
{101, 7},
|
|
{67, 5},
|
|
{119, 7},
|
|
{73, 5},
|
|
{91, 5},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{138, 10},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{103, 7},
|
|
{68, 6},
|
|
{121, 7},
|
|
{74, 6},
|
|
{92, 6},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{76, 6},
|
|
{94, 6},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{206, 10},
|
|
{156, 8},
|
|
{168, 8},
|
|
{146, 4},
|
|
{180, 8},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{159, 8},
|
|
{116, 10},
|
|
{72, 8},
|
|
{134, 10},
|
|
{78, 8},
|
|
{96, 8},
|
|
{65, 5},
|
|
{195, 8},
|
|
{84, 8},
|
|
{102, 8},
|
|
{67, 5},
|
|
{120, 8},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{174, 8},
|
|
{148, 6},
|
|
{140, 10},
|
|
{80, 8},
|
|
{98, 8},
|
|
{66, 6},
|
|
{198, 8},
|
|
{86, 8},
|
|
{62, 11},
|
|
{15, 10},
|
|
{122, 8},
|
|
{23, 10},
|
|
{39, 10},
|
|
{3, 8},
|
|
{209, 12},
|
|
{157, 6},
|
|
{110, 8},
|
|
{70, 6},
|
|
{128, 8},
|
|
{27, 10},
|
|
{43, 10},
|
|
{5, 8},
|
|
{193, 6},
|
|
{82, 6},
|
|
{51, 10},
|
|
{9, 8},
|
|
{118, 6},
|
|
{17, 8},
|
|
{33, 8},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{189, 8},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{201, 8},
|
|
{88, 8},
|
|
{106, 8},
|
|
{69, 7},
|
|
{124, 8},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{112, 8},
|
|
{71, 7},
|
|
{130, 8},
|
|
{29, 10},
|
|
{45, 10},
|
|
{6, 8},
|
|
{194, 7},
|
|
{83, 7},
|
|
{53, 10},
|
|
{10, 8},
|
|
{119, 7},
|
|
{18, 8},
|
|
{34, 8},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{136, 8},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{57, 10},
|
|
{12, 8},
|
|
{121, 7},
|
|
{20, 8},
|
|
{36, 8},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{24, 8},
|
|
{40, 8},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{48, 8},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{146, 4},
|
|
{209, 12},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{160, 9},
|
|
{172, 9},
|
|
{147, 5},
|
|
{184, 9},
|
|
{150, 5},
|
|
{162, 5},
|
|
{65, 5},
|
|
{196, 9},
|
|
{153, 5},
|
|
{165, 5},
|
|
{67, 5},
|
|
{177, 5},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{175, 9},
|
|
{148, 6},
|
|
{142, 10},
|
|
{81, 9},
|
|
{99, 9},
|
|
{66, 6},
|
|
{199, 9},
|
|
{87, 9},
|
|
{105, 9},
|
|
{68, 6},
|
|
{123, 9},
|
|
{74, 6},
|
|
{92, 6},
|
|
{64, 4},
|
|
{209, 12},
|
|
{157, 6},
|
|
{111, 9},
|
|
{70, 6},
|
|
{129, 9},
|
|
{76, 6},
|
|
{94, 6},
|
|
{65, 5},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{67, 5},
|
|
{118, 6},
|
|
{73, 5},
|
|
{91, 5},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{190, 9},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{202, 9},
|
|
{89, 9},
|
|
{107, 9},
|
|
{69, 7},
|
|
{125, 9},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{113, 9},
|
|
{71, 7},
|
|
{131, 9},
|
|
{30, 10},
|
|
{46, 10},
|
|
{7, 9},
|
|
{194, 7},
|
|
{83, 7},
|
|
{54, 10},
|
|
{11, 9},
|
|
{119, 7},
|
|
{19, 9},
|
|
{35, 9},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{137, 9},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{58, 10},
|
|
{13, 9},
|
|
{121, 7},
|
|
{21, 9},
|
|
{37, 9},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{25, 9},
|
|
{41, 9},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{49, 9},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{205, 9},
|
|
{156, 8},
|
|
{168, 8},
|
|
{146, 4},
|
|
{180, 8},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{159, 8},
|
|
{115, 9},
|
|
{72, 8},
|
|
{133, 9},
|
|
{78, 8},
|
|
{96, 8},
|
|
{65, 5},
|
|
{195, 8},
|
|
{84, 8},
|
|
{102, 8},
|
|
{67, 5},
|
|
{120, 8},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{174, 8},
|
|
{148, 6},
|
|
{139, 9},
|
|
{80, 8},
|
|
{98, 8},
|
|
{66, 6},
|
|
{198, 8},
|
|
{86, 8},
|
|
{60, 10},
|
|
{14, 9},
|
|
{122, 8},
|
|
{22, 9},
|
|
{38, 9},
|
|
{3, 8},
|
|
{209, 12},
|
|
{157, 6},
|
|
{110, 8},
|
|
{70, 6},
|
|
{128, 8},
|
|
{26, 9},
|
|
{42, 9},
|
|
{5, 8},
|
|
{193, 6},
|
|
{82, 6},
|
|
{50, 9},
|
|
{9, 8},
|
|
{118, 6},
|
|
{17, 8},
|
|
{33, 8},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{189, 8},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{201, 8},
|
|
{88, 8},
|
|
{106, 8},
|
|
{69, 7},
|
|
{124, 8},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{112, 8},
|
|
{71, 7},
|
|
{130, 8},
|
|
{28, 9},
|
|
{44, 9},
|
|
{6, 8},
|
|
{194, 7},
|
|
{83, 7},
|
|
{52, 9},
|
|
{10, 8},
|
|
{119, 7},
|
|
{18, 8},
|
|
{34, 8},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{136, 8},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{56, 9},
|
|
{12, 8},
|
|
{121, 7},
|
|
{20, 8},
|
|
{36, 8},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{24, 8},
|
|
{40, 8},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{48, 8},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{146, 4},
|
|
{209, 12},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{147, 5},
|
|
{209, 12},
|
|
{150, 5},
|
|
{162, 5},
|
|
{65, 5},
|
|
{209, 12},
|
|
{153, 5},
|
|
{165, 5},
|
|
{67, 5},
|
|
{177, 5},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{148, 6},
|
|
{209, 12},
|
|
{151, 6},
|
|
{163, 6},
|
|
{66, 6},
|
|
{209, 12},
|
|
{154, 6},
|
|
{166, 6},
|
|
{68, 6},
|
|
{178, 6},
|
|
{74, 6},
|
|
{92, 6},
|
|
{64, 4},
|
|
{209, 12},
|
|
{157, 6},
|
|
{169, 6},
|
|
{70, 6},
|
|
{181, 6},
|
|
{76, 6},
|
|
{94, 6},
|
|
{65, 5},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{67, 5},
|
|
{118, 6},
|
|
{73, 5},
|
|
{91, 5},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{209, 12},
|
|
{155, 7},
|
|
{167, 7},
|
|
{69, 7},
|
|
{179, 7},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{170, 7},
|
|
{71, 7},
|
|
{182, 7},
|
|
{77, 7},
|
|
{95, 7},
|
|
{65, 5},
|
|
{194, 7},
|
|
{83, 7},
|
|
{101, 7},
|
|
{67, 5},
|
|
{119, 7},
|
|
{73, 5},
|
|
{91, 5},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{185, 7},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{103, 7},
|
|
{68, 6},
|
|
{121, 7},
|
|
{74, 6},
|
|
{92, 6},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{76, 6},
|
|
{94, 6},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{208, 12},
|
|
{156, 8},
|
|
{168, 8},
|
|
{146, 4},
|
|
{180, 8},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{159, 8},
|
|
{171, 8},
|
|
{72, 8},
|
|
{183, 8},
|
|
{78, 8},
|
|
{96, 8},
|
|
{65, 5},
|
|
{195, 8},
|
|
{84, 8},
|
|
{102, 8},
|
|
{67, 5},
|
|
{120, 8},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{174, 8},
|
|
{148, 6},
|
|
{186, 8},
|
|
{80, 8},
|
|
{98, 8},
|
|
{66, 6},
|
|
{198, 8},
|
|
{86, 8},
|
|
{104, 8},
|
|
{68, 6},
|
|
{122, 8},
|
|
{74, 6},
|
|
{92, 6},
|
|
{3, 8},
|
|
{209, 12},
|
|
{157, 6},
|
|
{110, 8},
|
|
{70, 6},
|
|
{128, 8},
|
|
{76, 6},
|
|
{94, 6},
|
|
{5, 8},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{9, 8},
|
|
{118, 6},
|
|
{17, 8},
|
|
{33, 8},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{189, 8},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{201, 8},
|
|
{88, 8},
|
|
{106, 8},
|
|
{69, 7},
|
|
{124, 8},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{112, 8},
|
|
{71, 7},
|
|
{130, 8},
|
|
{77, 7},
|
|
{95, 7},
|
|
{6, 8},
|
|
{194, 7},
|
|
{83, 7},
|
|
{101, 7},
|
|
{10, 8},
|
|
{119, 7},
|
|
{18, 8},
|
|
{34, 8},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{136, 8},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{103, 7},
|
|
{12, 8},
|
|
{121, 7},
|
|
{20, 8},
|
|
{36, 8},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{24, 8},
|
|
{40, 8},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{48, 8},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{146, 4},
|
|
{209, 12},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{160, 9},
|
|
{172, 9},
|
|
{147, 5},
|
|
{184, 9},
|
|
{150, 5},
|
|
{162, 5},
|
|
{65, 5},
|
|
{196, 9},
|
|
{153, 5},
|
|
{165, 5},
|
|
{67, 5},
|
|
{177, 5},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{175, 9},
|
|
{148, 6},
|
|
{144, 12},
|
|
{81, 9},
|
|
{99, 9},
|
|
{66, 6},
|
|
{199, 9},
|
|
{87, 9},
|
|
{105, 9},
|
|
{68, 6},
|
|
{123, 9},
|
|
{74, 6},
|
|
{92, 6},
|
|
{64, 4},
|
|
{209, 12},
|
|
{157, 6},
|
|
{111, 9},
|
|
{70, 6},
|
|
{129, 9},
|
|
{76, 6},
|
|
{94, 6},
|
|
{65, 5},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{67, 5},
|
|
{118, 6},
|
|
{73, 5},
|
|
{91, 5},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{190, 9},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{202, 9},
|
|
{89, 9},
|
|
{107, 9},
|
|
{69, 7},
|
|
{125, 9},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{113, 9},
|
|
{71, 7},
|
|
{131, 9},
|
|
{77, 7},
|
|
{95, 7},
|
|
{7, 9},
|
|
{194, 7},
|
|
{83, 7},
|
|
{101, 7},
|
|
{11, 9},
|
|
{119, 7},
|
|
{19, 9},
|
|
{35, 9},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{137, 9},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{103, 7},
|
|
{13, 9},
|
|
{121, 7},
|
|
{21, 9},
|
|
{37, 9},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{25, 9},
|
|
{41, 9},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{49, 9},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{205, 9},
|
|
{156, 8},
|
|
{168, 8},
|
|
{146, 4},
|
|
{180, 8},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{159, 8},
|
|
{115, 9},
|
|
{72, 8},
|
|
{133, 9},
|
|
{78, 8},
|
|
{96, 8},
|
|
{65, 5},
|
|
{195, 8},
|
|
{84, 8},
|
|
{102, 8},
|
|
{67, 5},
|
|
{120, 8},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{174, 8},
|
|
{148, 6},
|
|
{139, 9},
|
|
{80, 8},
|
|
{98, 8},
|
|
{66, 6},
|
|
{198, 8},
|
|
{86, 8},
|
|
{104, 8},
|
|
{14, 9},
|
|
{122, 8},
|
|
{22, 9},
|
|
{38, 9},
|
|
{3, 8},
|
|
{209, 12},
|
|
{157, 6},
|
|
{110, 8},
|
|
{70, 6},
|
|
{128, 8},
|
|
{26, 9},
|
|
{42, 9},
|
|
{5, 8},
|
|
{193, 6},
|
|
{82, 6},
|
|
{50, 9},
|
|
{9, 8},
|
|
{118, 6},
|
|
{17, 8},
|
|
{33, 8},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{189, 8},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{201, 8},
|
|
{88, 8},
|
|
{106, 8},
|
|
{69, 7},
|
|
{124, 8},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{112, 8},
|
|
{71, 7},
|
|
{130, 8},
|
|
{28, 9},
|
|
{44, 9},
|
|
{6, 8},
|
|
{194, 7},
|
|
{83, 7},
|
|
{52, 9},
|
|
{10, 8},
|
|
{119, 7},
|
|
{18, 8},
|
|
{34, 8},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{136, 8},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{56, 9},
|
|
{12, 8},
|
|
{121, 7},
|
|
{20, 8},
|
|
{36, 8},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{24, 8},
|
|
{40, 8},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{48, 8},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{146, 4},
|
|
{209, 12},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{147, 5},
|
|
{209, 12},
|
|
{150, 5},
|
|
{162, 5},
|
|
{65, 5},
|
|
{209, 12},
|
|
{153, 5},
|
|
{165, 5},
|
|
{67, 5},
|
|
{177, 5},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{176, 10},
|
|
{148, 6},
|
|
{188, 10},
|
|
{151, 6},
|
|
{163, 6},
|
|
{66, 6},
|
|
{200, 10},
|
|
{154, 6},
|
|
{166, 6},
|
|
{68, 6},
|
|
{178, 6},
|
|
{74, 6},
|
|
{92, 6},
|
|
{64, 4},
|
|
{209, 12},
|
|
{157, 6},
|
|
{169, 6},
|
|
{70, 6},
|
|
{181, 6},
|
|
{76, 6},
|
|
{94, 6},
|
|
{65, 5},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{67, 5},
|
|
{118, 6},
|
|
{73, 5},
|
|
{91, 5},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{191, 10},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{203, 10},
|
|
{90, 10},
|
|
{108, 10},
|
|
{69, 7},
|
|
{126, 10},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{114, 10},
|
|
{71, 7},
|
|
{132, 10},
|
|
{77, 7},
|
|
{95, 7},
|
|
{65, 5},
|
|
{194, 7},
|
|
{83, 7},
|
|
{101, 7},
|
|
{67, 5},
|
|
{119, 7},
|
|
{73, 5},
|
|
{91, 5},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{138, 10},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{103, 7},
|
|
{68, 6},
|
|
{121, 7},
|
|
{74, 6},
|
|
{92, 6},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{76, 6},
|
|
{94, 6},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{206, 10},
|
|
{156, 8},
|
|
{168, 8},
|
|
{146, 4},
|
|
{180, 8},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{159, 8},
|
|
{116, 10},
|
|
{72, 8},
|
|
{134, 10},
|
|
{78, 8},
|
|
{96, 8},
|
|
{65, 5},
|
|
{195, 8},
|
|
{84, 8},
|
|
{102, 8},
|
|
{67, 5},
|
|
{120, 8},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{174, 8},
|
|
{148, 6},
|
|
{140, 10},
|
|
{80, 8},
|
|
{98, 8},
|
|
{66, 6},
|
|
{198, 8},
|
|
{86, 8},
|
|
{63, 12},
|
|
{15, 10},
|
|
{122, 8},
|
|
{23, 10},
|
|
{39, 10},
|
|
{3, 8},
|
|
{209, 12},
|
|
{157, 6},
|
|
{110, 8},
|
|
{70, 6},
|
|
{128, 8},
|
|
{27, 10},
|
|
{43, 10},
|
|
{5, 8},
|
|
{193, 6},
|
|
{82, 6},
|
|
{51, 10},
|
|
{9, 8},
|
|
{118, 6},
|
|
{17, 8},
|
|
{33, 8},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{189, 8},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{201, 8},
|
|
{88, 8},
|
|
{106, 8},
|
|
{69, 7},
|
|
{124, 8},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{112, 8},
|
|
{71, 7},
|
|
{130, 8},
|
|
{29, 10},
|
|
{45, 10},
|
|
{6, 8},
|
|
{194, 7},
|
|
{83, 7},
|
|
{53, 10},
|
|
{10, 8},
|
|
{119, 7},
|
|
{18, 8},
|
|
{34, 8},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{136, 8},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{57, 10},
|
|
{12, 8},
|
|
{121, 7},
|
|
{20, 8},
|
|
{36, 8},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{24, 8},
|
|
{40, 8},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{48, 8},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{146, 4},
|
|
{209, 12},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{160, 9},
|
|
{172, 9},
|
|
{147, 5},
|
|
{184, 9},
|
|
{150, 5},
|
|
{162, 5},
|
|
{65, 5},
|
|
{196, 9},
|
|
{153, 5},
|
|
{165, 5},
|
|
{67, 5},
|
|
{177, 5},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{175, 9},
|
|
{148, 6},
|
|
{142, 10},
|
|
{81, 9},
|
|
{99, 9},
|
|
{66, 6},
|
|
{199, 9},
|
|
{87, 9},
|
|
{105, 9},
|
|
{68, 6},
|
|
{123, 9},
|
|
{74, 6},
|
|
{92, 6},
|
|
{64, 4},
|
|
{209, 12},
|
|
{157, 6},
|
|
{111, 9},
|
|
{70, 6},
|
|
{129, 9},
|
|
{76, 6},
|
|
{94, 6},
|
|
{65, 5},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{67, 5},
|
|
{118, 6},
|
|
{73, 5},
|
|
{91, 5},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{190, 9},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{202, 9},
|
|
{89, 9},
|
|
{107, 9},
|
|
{69, 7},
|
|
{125, 9},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{113, 9},
|
|
{71, 7},
|
|
{131, 9},
|
|
{30, 10},
|
|
{46, 10},
|
|
{7, 9},
|
|
{194, 7},
|
|
{83, 7},
|
|
{54, 10},
|
|
{11, 9},
|
|
{119, 7},
|
|
{19, 9},
|
|
{35, 9},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{137, 9},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{58, 10},
|
|
{13, 9},
|
|
{121, 7},
|
|
{21, 9},
|
|
{37, 9},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{25, 9},
|
|
{41, 9},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{49, 9},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{205, 9},
|
|
{156, 8},
|
|
{168, 8},
|
|
{146, 4},
|
|
{180, 8},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{159, 8},
|
|
{115, 9},
|
|
{72, 8},
|
|
{133, 9},
|
|
{78, 8},
|
|
{96, 8},
|
|
{65, 5},
|
|
{195, 8},
|
|
{84, 8},
|
|
{102, 8},
|
|
{67, 5},
|
|
{120, 8},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{174, 8},
|
|
{148, 6},
|
|
{139, 9},
|
|
{80, 8},
|
|
{98, 8},
|
|
{66, 6},
|
|
{198, 8},
|
|
{86, 8},
|
|
{60, 10},
|
|
{14, 9},
|
|
{122, 8},
|
|
{22, 9},
|
|
{38, 9},
|
|
{3, 8},
|
|
{209, 12},
|
|
{157, 6},
|
|
{110, 8},
|
|
{70, 6},
|
|
{128, 8},
|
|
{26, 9},
|
|
{42, 9},
|
|
{5, 8},
|
|
{193, 6},
|
|
{82, 6},
|
|
{50, 9},
|
|
{9, 8},
|
|
{118, 6},
|
|
{17, 8},
|
|
{33, 8},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{189, 8},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{201, 8},
|
|
{88, 8},
|
|
{106, 8},
|
|
{69, 7},
|
|
{124, 8},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{112, 8},
|
|
{71, 7},
|
|
{130, 8},
|
|
{28, 9},
|
|
{44, 9},
|
|
{6, 8},
|
|
{194, 7},
|
|
{83, 7},
|
|
{52, 9},
|
|
{10, 8},
|
|
{119, 7},
|
|
{18, 8},
|
|
{34, 8},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{136, 8},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{56, 9},
|
|
{12, 8},
|
|
{121, 7},
|
|
{20, 8},
|
|
{36, 8},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{24, 8},
|
|
{40, 8},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{48, 8},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{146, 4},
|
|
{209, 12},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{147, 5},
|
|
{209, 12},
|
|
{150, 5},
|
|
{162, 5},
|
|
{65, 5},
|
|
{209, 12},
|
|
{153, 5},
|
|
{165, 5},
|
|
{67, 5},
|
|
{177, 5},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{148, 6},
|
|
{209, 12},
|
|
{151, 6},
|
|
{163, 6},
|
|
{66, 6},
|
|
{209, 12},
|
|
{154, 6},
|
|
{166, 6},
|
|
{68, 6},
|
|
{178, 6},
|
|
{74, 6},
|
|
{92, 6},
|
|
{64, 4},
|
|
{209, 12},
|
|
{157, 6},
|
|
{169, 6},
|
|
{70, 6},
|
|
{181, 6},
|
|
{76, 6},
|
|
{94, 6},
|
|
{65, 5},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{67, 5},
|
|
{118, 6},
|
|
{73, 5},
|
|
{91, 5},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{192, 11},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{204, 11},
|
|
{155, 7},
|
|
{167, 7},
|
|
{69, 7},
|
|
{179, 7},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{170, 7},
|
|
{71, 7},
|
|
{182, 7},
|
|
{77, 7},
|
|
{95, 7},
|
|
{65, 5},
|
|
{194, 7},
|
|
{83, 7},
|
|
{101, 7},
|
|
{67, 5},
|
|
{119, 7},
|
|
{73, 5},
|
|
{91, 5},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{185, 7},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{103, 7},
|
|
{68, 6},
|
|
{121, 7},
|
|
{74, 6},
|
|
{92, 6},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{76, 6},
|
|
{94, 6},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{207, 11},
|
|
{156, 8},
|
|
{168, 8},
|
|
{146, 4},
|
|
{180, 8},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{159, 8},
|
|
{117, 11},
|
|
{72, 8},
|
|
{135, 11},
|
|
{78, 8},
|
|
{96, 8},
|
|
{65, 5},
|
|
{195, 8},
|
|
{84, 8},
|
|
{102, 8},
|
|
{67, 5},
|
|
{120, 8},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{174, 8},
|
|
{148, 6},
|
|
{141, 11},
|
|
{80, 8},
|
|
{98, 8},
|
|
{66, 6},
|
|
{198, 8},
|
|
{86, 8},
|
|
{104, 8},
|
|
{68, 6},
|
|
{122, 8},
|
|
{74, 6},
|
|
{92, 6},
|
|
{3, 8},
|
|
{209, 12},
|
|
{157, 6},
|
|
{110, 8},
|
|
{70, 6},
|
|
{128, 8},
|
|
{76, 6},
|
|
{94, 6},
|
|
{5, 8},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{9, 8},
|
|
{118, 6},
|
|
{17, 8},
|
|
{33, 8},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{189, 8},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{201, 8},
|
|
{88, 8},
|
|
{106, 8},
|
|
{69, 7},
|
|
{124, 8},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{112, 8},
|
|
{71, 7},
|
|
{130, 8},
|
|
{77, 7},
|
|
{95, 7},
|
|
{6, 8},
|
|
{194, 7},
|
|
{83, 7},
|
|
{101, 7},
|
|
{10, 8},
|
|
{119, 7},
|
|
{18, 8},
|
|
{34, 8},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{136, 8},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{103, 7},
|
|
{12, 8},
|
|
{121, 7},
|
|
{20, 8},
|
|
{36, 8},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{24, 8},
|
|
{40, 8},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{48, 8},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{146, 4},
|
|
{209, 12},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{160, 9},
|
|
{172, 9},
|
|
{147, 5},
|
|
{184, 9},
|
|
{150, 5},
|
|
{162, 5},
|
|
{65, 5},
|
|
{196, 9},
|
|
{153, 5},
|
|
{165, 5},
|
|
{67, 5},
|
|
{177, 5},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{175, 9},
|
|
{148, 6},
|
|
{143, 11},
|
|
{81, 9},
|
|
{99, 9},
|
|
{66, 6},
|
|
{199, 9},
|
|
{87, 9},
|
|
{105, 9},
|
|
{68, 6},
|
|
{123, 9},
|
|
{74, 6},
|
|
{92, 6},
|
|
{64, 4},
|
|
{209, 12},
|
|
{157, 6},
|
|
{111, 9},
|
|
{70, 6},
|
|
{129, 9},
|
|
{76, 6},
|
|
{94, 6},
|
|
{65, 5},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{67, 5},
|
|
{118, 6},
|
|
{73, 5},
|
|
{91, 5},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{190, 9},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{202, 9},
|
|
{89, 9},
|
|
{107, 9},
|
|
{69, 7},
|
|
{125, 9},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{113, 9},
|
|
{71, 7},
|
|
{131, 9},
|
|
{31, 11},
|
|
{47, 11},
|
|
{7, 9},
|
|
{194, 7},
|
|
{83, 7},
|
|
{55, 11},
|
|
{11, 9},
|
|
{119, 7},
|
|
{19, 9},
|
|
{35, 9},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{137, 9},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{59, 11},
|
|
{13, 9},
|
|
{121, 7},
|
|
{21, 9},
|
|
{37, 9},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{25, 9},
|
|
{41, 9},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{49, 9},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{205, 9},
|
|
{156, 8},
|
|
{168, 8},
|
|
{146, 4},
|
|
{180, 8},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{159, 8},
|
|
{115, 9},
|
|
{72, 8},
|
|
{133, 9},
|
|
{78, 8},
|
|
{96, 8},
|
|
{65, 5},
|
|
{195, 8},
|
|
{84, 8},
|
|
{102, 8},
|
|
{67, 5},
|
|
{120, 8},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{174, 8},
|
|
{148, 6},
|
|
{139, 9},
|
|
{80, 8},
|
|
{98, 8},
|
|
{66, 6},
|
|
{198, 8},
|
|
{86, 8},
|
|
{61, 11},
|
|
{14, 9},
|
|
{122, 8},
|
|
{22, 9},
|
|
{38, 9},
|
|
{3, 8},
|
|
{209, 12},
|
|
{157, 6},
|
|
{110, 8},
|
|
{70, 6},
|
|
{128, 8},
|
|
{26, 9},
|
|
{42, 9},
|
|
{5, 8},
|
|
{193, 6},
|
|
{82, 6},
|
|
{50, 9},
|
|
{9, 8},
|
|
{118, 6},
|
|
{17, 8},
|
|
{33, 8},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{189, 8},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{201, 8},
|
|
{88, 8},
|
|
{106, 8},
|
|
{69, 7},
|
|
{124, 8},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{112, 8},
|
|
{71, 7},
|
|
{130, 8},
|
|
{28, 9},
|
|
{44, 9},
|
|
{6, 8},
|
|
{194, 7},
|
|
{83, 7},
|
|
{52, 9},
|
|
{10, 8},
|
|
{119, 7},
|
|
{18, 8},
|
|
{34, 8},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{136, 8},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{56, 9},
|
|
{12, 8},
|
|
{121, 7},
|
|
{20, 8},
|
|
{36, 8},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{24, 8},
|
|
{40, 8},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{48, 8},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{146, 4},
|
|
{209, 12},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{147, 5},
|
|
{209, 12},
|
|
{150, 5},
|
|
{162, 5},
|
|
{65, 5},
|
|
{209, 12},
|
|
{153, 5},
|
|
{165, 5},
|
|
{67, 5},
|
|
{177, 5},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{176, 10},
|
|
{148, 6},
|
|
{188, 10},
|
|
{151, 6},
|
|
{163, 6},
|
|
{66, 6},
|
|
{200, 10},
|
|
{154, 6},
|
|
{166, 6},
|
|
{68, 6},
|
|
{178, 6},
|
|
{74, 6},
|
|
{92, 6},
|
|
{64, 4},
|
|
{209, 12},
|
|
{157, 6},
|
|
{169, 6},
|
|
{70, 6},
|
|
{181, 6},
|
|
{76, 6},
|
|
{94, 6},
|
|
{65, 5},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{67, 5},
|
|
{118, 6},
|
|
{73, 5},
|
|
{91, 5},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{191, 10},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{203, 10},
|
|
{90, 10},
|
|
{108, 10},
|
|
{69, 7},
|
|
{126, 10},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{114, 10},
|
|
{71, 7},
|
|
{132, 10},
|
|
{77, 7},
|
|
{95, 7},
|
|
{65, 5},
|
|
{194, 7},
|
|
{83, 7},
|
|
{101, 7},
|
|
{67, 5},
|
|
{119, 7},
|
|
{73, 5},
|
|
{91, 5},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{138, 10},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{103, 7},
|
|
{68, 6},
|
|
{121, 7},
|
|
{74, 6},
|
|
{92, 6},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{76, 6},
|
|
{94, 6},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{206, 10},
|
|
{156, 8},
|
|
{168, 8},
|
|
{146, 4},
|
|
{180, 8},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{159, 8},
|
|
{116, 10},
|
|
{72, 8},
|
|
{134, 10},
|
|
{78, 8},
|
|
{96, 8},
|
|
{65, 5},
|
|
{195, 8},
|
|
{84, 8},
|
|
{102, 8},
|
|
{67, 5},
|
|
{120, 8},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{174, 8},
|
|
{148, 6},
|
|
{140, 10},
|
|
{80, 8},
|
|
{98, 8},
|
|
{66, 6},
|
|
{198, 8},
|
|
{86, 8},
|
|
{62, 11},
|
|
{15, 10},
|
|
{122, 8},
|
|
{23, 10},
|
|
{39, 10},
|
|
{3, 8},
|
|
{209, 12},
|
|
{157, 6},
|
|
{110, 8},
|
|
{70, 6},
|
|
{128, 8},
|
|
{27, 10},
|
|
{43, 10},
|
|
{5, 8},
|
|
{193, 6},
|
|
{82, 6},
|
|
{51, 10},
|
|
{9, 8},
|
|
{118, 6},
|
|
{17, 8},
|
|
{33, 8},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{189, 8},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{201, 8},
|
|
{88, 8},
|
|
{106, 8},
|
|
{69, 7},
|
|
{124, 8},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{112, 8},
|
|
{71, 7},
|
|
{130, 8},
|
|
{29, 10},
|
|
{45, 10},
|
|
{6, 8},
|
|
{194, 7},
|
|
{83, 7},
|
|
{53, 10},
|
|
{10, 8},
|
|
{119, 7},
|
|
{18, 8},
|
|
{34, 8},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{136, 8},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{57, 10},
|
|
{12, 8},
|
|
{121, 7},
|
|
{20, 8},
|
|
{36, 8},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{24, 8},
|
|
{40, 8},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{48, 8},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{146, 4},
|
|
{209, 12},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{160, 9},
|
|
{172, 9},
|
|
{147, 5},
|
|
{184, 9},
|
|
{150, 5},
|
|
{162, 5},
|
|
{65, 5},
|
|
{196, 9},
|
|
{153, 5},
|
|
{165, 5},
|
|
{67, 5},
|
|
{177, 5},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{175, 9},
|
|
{148, 6},
|
|
{142, 10},
|
|
{81, 9},
|
|
{99, 9},
|
|
{66, 6},
|
|
{199, 9},
|
|
{87, 9},
|
|
{105, 9},
|
|
{68, 6},
|
|
{123, 9},
|
|
{74, 6},
|
|
{92, 6},
|
|
{64, 4},
|
|
{209, 12},
|
|
{157, 6},
|
|
{111, 9},
|
|
{70, 6},
|
|
{129, 9},
|
|
{76, 6},
|
|
{94, 6},
|
|
{65, 5},
|
|
{193, 6},
|
|
{82, 6},
|
|
{100, 6},
|
|
{67, 5},
|
|
{118, 6},
|
|
{73, 5},
|
|
{91, 5},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{190, 9},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{202, 9},
|
|
{89, 9},
|
|
{107, 9},
|
|
{69, 7},
|
|
{125, 9},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{113, 9},
|
|
{71, 7},
|
|
{131, 9},
|
|
{30, 10},
|
|
{46, 10},
|
|
{7, 9},
|
|
{194, 7},
|
|
{83, 7},
|
|
{54, 10},
|
|
{11, 9},
|
|
{119, 7},
|
|
{19, 9},
|
|
{35, 9},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{137, 9},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{58, 10},
|
|
{13, 9},
|
|
{121, 7},
|
|
{21, 9},
|
|
{37, 9},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{25, 9},
|
|
{41, 9},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{49, 9},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{145, 3},
|
|
{205, 9},
|
|
{156, 8},
|
|
{168, 8},
|
|
{146, 4},
|
|
{180, 8},
|
|
{149, 4},
|
|
{161, 4},
|
|
{64, 4},
|
|
{209, 12},
|
|
{159, 8},
|
|
{115, 9},
|
|
{72, 8},
|
|
{133, 9},
|
|
{78, 8},
|
|
{96, 8},
|
|
{65, 5},
|
|
{195, 8},
|
|
{84, 8},
|
|
{102, 8},
|
|
{67, 5},
|
|
{120, 8},
|
|
{73, 5},
|
|
{91, 5},
|
|
{64, 4},
|
|
{209, 12},
|
|
{209, 12},
|
|
{174, 8},
|
|
{148, 6},
|
|
{139, 9},
|
|
{80, 8},
|
|
{98, 8},
|
|
{66, 6},
|
|
{198, 8},
|
|
{86, 8},
|
|
{60, 10},
|
|
{14, 9},
|
|
{122, 8},
|
|
{22, 9},
|
|
{38, 9},
|
|
{3, 8},
|
|
{209, 12},
|
|
{157, 6},
|
|
{110, 8},
|
|
{70, 6},
|
|
{128, 8},
|
|
{26, 9},
|
|
{42, 9},
|
|
{5, 8},
|
|
{193, 6},
|
|
{82, 6},
|
|
{50, 9},
|
|
{9, 8},
|
|
{118, 6},
|
|
{17, 8},
|
|
{33, 8},
|
|
{0, 6},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{209, 12},
|
|
{189, 8},
|
|
{152, 7},
|
|
{164, 7},
|
|
{145, 3},
|
|
{201, 8},
|
|
{88, 8},
|
|
{106, 8},
|
|
{69, 7},
|
|
{124, 8},
|
|
{75, 7},
|
|
{93, 7},
|
|
{64, 4},
|
|
{209, 12},
|
|
{158, 7},
|
|
{112, 8},
|
|
{71, 7},
|
|
{130, 8},
|
|
{28, 9},
|
|
{44, 9},
|
|
{6, 8},
|
|
{194, 7},
|
|
{83, 7},
|
|
{52, 9},
|
|
{10, 8},
|
|
{119, 7},
|
|
{18, 8},
|
|
{34, 8},
|
|
{1, 7},
|
|
{209, 12},
|
|
{209, 12},
|
|
{173, 7},
|
|
{148, 6},
|
|
{136, 8},
|
|
{79, 7},
|
|
{97, 7},
|
|
{66, 6},
|
|
{197, 7},
|
|
{85, 7},
|
|
{56, 9},
|
|
{12, 8},
|
|
{121, 7},
|
|
{20, 8},
|
|
{36, 8},
|
|
{2, 7},
|
|
{209, 12},
|
|
{157, 6},
|
|
{109, 7},
|
|
{70, 6},
|
|
{127, 7},
|
|
{24, 8},
|
|
{40, 8},
|
|
{4, 7},
|
|
{193, 6},
|
|
{82, 6},
|
|
{48, 8},
|
|
{8, 7},
|
|
{118, 6},
|
|
{16, 7},
|
|
{32, 7},
|
|
{0, 6}};
|
|
} // utf8_to_utf16 namespace
|
|
} // tables namespace
|
|
} // unnamed namespace
|
|
} // namespace simdutf
|
|
|
|
#endif // SIMDUTF_UTF8_TO_UTF16_TABLES_H
|
|
/* end file src/tables/utf8_to_utf16_tables.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=tables/utf16_to_utf8_tables.h
|
|
/* begin file src/tables/utf16_to_utf8_tables.h */
|
|
// file generated by scripts/sse_convert_utf16_to_utf8.py
|
|
#ifndef SIMDUTF_UTF16_TO_UTF8_TABLES_H
|
|
#define SIMDUTF_UTF16_TO_UTF8_TABLES_H
|
|
|
|
namespace simdutf {
|
|
namespace {
|
|
namespace tables {
|
|
namespace utf16_to_utf8 {
|
|
|
|
// 1 byte for length, 16 bytes for mask
|
|
const uint8_t pack_1_2_utf8_bytes[256][17] = {
|
|
{16,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14},
|
|
{15,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80},
|
|
{15,1,0,3,2,5,4,7,6,8,11,10,13,12,15,14,0x80},
|
|
{14,0,3,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80},
|
|
{15,1,0,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80},
|
|
{14,0,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80},
|
|
{14,1,0,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80},
|
|
{13,0,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
|
|
{15,1,0,3,2,5,4,7,6,9,8,10,13,12,15,14,0x80},
|
|
{14,0,3,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80},
|
|
{14,1,0,3,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80},
|
|
{13,0,3,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80},
|
|
{14,1,0,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80},
|
|
{13,0,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
|
|
{13,1,0,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80},
|
|
{12,0,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
|
|
{15,1,0,3,2,4,7,6,9,8,11,10,13,12,15,14,0x80},
|
|
{14,0,3,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80},
|
|
{14,1,0,3,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80},
|
|
{13,0,3,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
|
|
{14,1,0,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80},
|
|
{13,0,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80},
|
|
{13,1,0,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
|
|
{12,0,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
|
|
{14,1,0,3,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80},
|
|
{13,0,3,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
|
|
{13,1,0,3,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80},
|
|
{12,0,3,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
|
|
{13,1,0,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
|
|
{12,0,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
|
|
{12,1,0,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
|
|
{11,0,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
|
|
{15,1,0,3,2,5,4,7,6,9,8,11,10,12,15,14,0x80},
|
|
{14,0,3,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80},
|
|
{14,1,0,3,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80},
|
|
{13,0,3,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80},
|
|
{14,1,0,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80},
|
|
{13,0,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
|
|
{13,1,0,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80},
|
|
{12,0,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
|
|
{14,1,0,3,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80},
|
|
{13,0,3,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80},
|
|
{13,1,0,3,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80},
|
|
{12,0,3,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80},
|
|
{13,1,0,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80},
|
|
{12,0,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
|
|
{12,1,0,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80},
|
|
{11,0,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
|
|
{14,1,0,3,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80},
|
|
{13,0,3,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
|
|
{13,1,0,3,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80},
|
|
{12,0,3,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
|
|
{13,1,0,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
|
|
{12,0,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
|
|
{12,1,0,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
|
|
{11,0,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
|
|
{13,1,0,3,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80},
|
|
{12,0,3,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
|
|
{12,1,0,3,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80},
|
|
{11,0,3,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
|
|
{12,1,0,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
|
|
{11,0,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
|
|
{11,1,0,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
|
|
{10,0,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{15,1,0,3,2,5,4,6,9,8,11,10,13,12,15,14,0x80},
|
|
{14,0,3,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80},
|
|
{14,1,0,3,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80},
|
|
{13,0,3,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
|
|
{14,1,0,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80},
|
|
{13,0,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80},
|
|
{13,1,0,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
|
|
{12,0,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
|
|
{14,1,0,3,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80},
|
|
{13,0,3,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
|
|
{13,1,0,3,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80},
|
|
{12,0,3,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
|
|
{13,1,0,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
|
|
{12,0,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
|
|
{12,1,0,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
|
|
{11,0,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
|
|
{14,1,0,3,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80},
|
|
{13,0,3,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80},
|
|
{13,1,0,3,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
|
|
{12,0,3,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
|
|
{13,1,0,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80},
|
|
{12,0,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
|
|
{12,1,0,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
|
|
{11,0,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
|
|
{13,1,0,3,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
|
|
{12,0,3,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
|
|
{12,1,0,3,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
|
|
{11,0,3,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
|
|
{12,1,0,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
|
|
{11,0,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
|
|
{11,1,0,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
|
|
{10,0,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{14,1,0,3,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80},
|
|
{13,0,3,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
|
|
{13,1,0,3,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80},
|
|
{12,0,3,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
|
|
{13,1,0,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
|
|
{12,0,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
|
|
{12,1,0,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
|
|
{11,0,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
|
|
{13,1,0,3,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80},
|
|
{12,0,3,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
|
|
{12,1,0,3,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80},
|
|
{11,0,3,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
|
|
{12,1,0,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
|
|
{11,0,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
|
|
{11,1,0,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
|
|
{10,0,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{13,1,0,3,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
|
|
{12,0,3,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
|
|
{12,1,0,3,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
|
|
{11,0,3,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
|
|
{12,1,0,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
|
|
{11,0,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
|
|
{11,1,0,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
|
|
{10,0,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{12,1,0,3,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
|
|
{11,0,3,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
|
|
{11,1,0,3,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
|
|
{10,0,3,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{11,1,0,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
|
|
{10,0,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{10,1,0,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,0,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{15,1,0,3,2,5,4,7,6,9,8,11,10,13,12,14,0x80},
|
|
{14,0,3,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80},
|
|
{14,1,0,3,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80},
|
|
{13,0,3,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80},
|
|
{14,1,0,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80},
|
|
{13,0,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
|
|
{13,1,0,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80},
|
|
{12,0,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
|
|
{14,1,0,3,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80},
|
|
{13,0,3,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80},
|
|
{13,1,0,3,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80},
|
|
{12,0,3,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80},
|
|
{13,1,0,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80},
|
|
{12,0,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
|
|
{12,1,0,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80},
|
|
{11,0,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{14,1,0,3,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80},
|
|
{13,0,3,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
|
|
{13,1,0,3,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80},
|
|
{12,0,3,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
|
|
{13,1,0,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
|
|
{12,0,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
|
|
{12,1,0,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
|
|
{11,0,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{13,1,0,3,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80},
|
|
{12,0,3,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
|
|
{12,1,0,3,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80},
|
|
{11,0,3,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{12,1,0,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
|
|
{11,0,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{11,1,0,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{10,0,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{14,1,0,3,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80},
|
|
{13,0,3,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80},
|
|
{13,1,0,3,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80},
|
|
{12,0,3,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80},
|
|
{13,1,0,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80},
|
|
{12,0,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
|
|
{12,1,0,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80},
|
|
{11,0,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{13,1,0,3,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80},
|
|
{12,0,3,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80},
|
|
{12,1,0,3,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80},
|
|
{11,0,3,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{12,1,0,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80},
|
|
{11,0,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{11,1,0,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{10,0,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{13,1,0,3,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80},
|
|
{12,0,3,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
|
|
{12,1,0,3,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80},
|
|
{11,0,3,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{12,1,0,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
|
|
{11,0,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{11,1,0,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{10,0,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{12,1,0,3,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80},
|
|
{11,0,3,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{11,1,0,3,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{10,0,3,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{11,1,0,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{10,0,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{10,1,0,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,0,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{14,1,0,3,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80},
|
|
{13,0,3,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
|
|
{13,1,0,3,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80},
|
|
{12,0,3,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
|
|
{13,1,0,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
|
|
{12,0,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
|
|
{12,1,0,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
|
|
{11,0,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{13,1,0,3,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80},
|
|
{12,0,3,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
|
|
{12,1,0,3,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80},
|
|
{11,0,3,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{12,1,0,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
|
|
{11,0,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{11,1,0,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{10,0,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{13,1,0,3,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
|
|
{12,0,3,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
|
|
{12,1,0,3,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
|
|
{11,0,3,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{12,1,0,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
|
|
{11,0,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{11,1,0,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{10,0,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{12,1,0,3,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
|
|
{11,0,3,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{11,1,0,3,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{10,0,3,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{11,1,0,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{10,0,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{10,1,0,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,0,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{13,1,0,3,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80},
|
|
{12,0,3,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
|
|
{12,1,0,3,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80},
|
|
{11,0,3,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{12,1,0,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
|
|
{11,0,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{11,1,0,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{10,0,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{12,1,0,3,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80},
|
|
{11,0,3,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{11,1,0,3,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{10,0,3,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{11,1,0,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{10,0,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{10,1,0,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,0,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{12,1,0,3,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
|
|
{11,0,3,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{11,1,0,3,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{10,0,3,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{11,1,0,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{10,0,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{10,1,0,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,0,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{11,1,0,3,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
|
|
{10,0,3,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{10,1,0,3,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,0,3,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{10,1,0,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,0,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,1,0,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,0,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}
|
|
};
|
|
|
|
// 1 byte for length, 16 bytes for mask
|
|
const uint8_t pack_1_2_3_utf8_bytes[256][17] = {
|
|
{12,2,3,1,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80},
|
|
{9,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{11,3,1,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80},
|
|
{10,0,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,2,3,1,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,3,1,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,0,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{11,2,3,1,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80},
|
|
{8,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{10,3,1,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,0,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{10,2,3,1,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,3,1,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,0,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,2,3,1,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,3,1,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,0,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,2,3,1,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{3,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,3,1,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,0,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,2,3,1,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,3,1,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,0,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,2,3,1,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,3,1,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,0,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{11,2,3,1,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80},
|
|
{8,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{10,3,1,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,0,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,2,3,1,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,3,1,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,0,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{10,2,3,1,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,3,1,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,0,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,2,3,1,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,3,1,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,0,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{10,2,3,1,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,3,1,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,0,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,2,3,1,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,3,1,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,0,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,2,3,1,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,3,1,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,0,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,2,3,1,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,3,1,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,0,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,2,3,1,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,3,1,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,0,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,2,3,1,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{3,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,3,1,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,0,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,2,3,1,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,3,1,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,0,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,2,3,1,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,3,1,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,0,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,2,3,1,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{3,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,3,1,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,0,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{3,2,3,1,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{0,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{2,3,1,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{1,0,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,2,3,1,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{2,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,3,1,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{3,0,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,2,3,1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{3,3,1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{2,0,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,2,3,1,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,3,1,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,0,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,2,3,1,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{2,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,3,1,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{3,0,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,2,3,1,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,3,1,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,0,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,2,3,1,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{3,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,3,1,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,0,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,2,3,1,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,3,1,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,0,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,2,3,1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{3,3,1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{2,0,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,2,3,1,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{3,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,3,1,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,0,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,2,3,1,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{2,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,3,1,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{3,0,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{11,2,3,1,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80},
|
|
{8,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{10,3,1,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,0,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,2,3,1,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,3,1,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,0,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{10,2,3,1,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,3,1,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,0,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,2,3,1,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,3,1,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,0,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,2,3,1,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,3,1,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,0,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,2,3,1,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{2,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,3,1,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{3,0,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,2,3,1,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,3,1,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,0,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,2,3,1,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{3,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,3,1,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,0,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{10,2,3,1,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,3,1,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,0,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,2,3,1,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,3,1,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,0,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,2,3,1,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,3,1,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,0,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,2,3,1,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,3,1,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,0,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,2,3,1,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,3,1,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,0,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,2,3,1,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{3,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,3,1,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,0,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,2,3,1,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,3,1,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,0,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,2,3,1,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,3,1,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,0,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{10,2,3,1,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,3,1,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,0,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,2,3,1,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,3,1,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,0,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,2,3,1,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,3,1,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,0,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,2,3,1,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,3,1,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,0,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,2,3,1,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,3,1,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,0,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,2,3,1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{3,3,1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{2,0,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,2,3,1,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{3,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,3,1,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,0,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,2,3,1,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{2,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,3,1,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{3,0,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{9,2,3,1,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,3,1,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,0,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,2,3,1,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{3,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,3,1,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,0,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,2,3,1,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,3,1,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,0,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,2,3,1,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,3,1,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,0,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{8,2,3,1,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,3,1,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,0,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,2,3,1,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{2,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,3,1,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{3,0,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{7,2,3,1,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,3,1,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,0,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{6,2,3,1,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{3,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{5,3,1,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
|
|
{4,0,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}
|
|
};
|
|
|
|
} // utf16_to_utf8 namespace
|
|
} // tables namespace
|
|
} // unnamed namespace
|
|
} // namespace simdutf
|
|
|
|
#endif // SIMDUTF_UTF16_TO_UTF8_TABLES_H
|
|
/* end file src/tables/utf16_to_utf8_tables.h */
|
|
// End of tables.
|
|
|
|
// The scalar routines should be included once.
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/ascii.h
|
|
/* begin file src/scalar/ascii.h */
|
|
#ifndef SIMDUTF_ASCII_H
|
|
#define SIMDUTF_ASCII_H
|
|
|
|
namespace simdutf {
|
|
namespace scalar {
|
|
namespace {
|
|
namespace ascii {
|
|
#if SIMDUTF_IMPLEMENTATION_FALLBACK
|
|
// Only used by the fallback kernel.
|
|
// Returns true when none of the `len` bytes starting at `buf` has its most
// significant bit set, i.e. when the whole buffer is 7-bit ASCII.
inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  uint64_t offset = 0;
  // Bulk path: while strictly more than 16 bytes remain, load two 64-bit words
  // and test all sixteen high bits with a single mask.
  while (offset + 16 < len) {
    uint64_t lo;
    uint64_t hi;
    std::memcpy(&lo, bytes + offset, sizeof(uint64_t));
    std::memcpy(&hi, bytes + offset + sizeof(uint64_t), sizeof(uint64_t));
    if (((lo | hi) & 0x8080808080808080) != 0) { return false; }
    offset += 16;
  }
  // Whatever is left (at most 16 bytes) is inspected one byte at a time.
  while (offset < len) {
    if ((bytes[offset] & 0x80) != 0) { return false; }
    offset++;
  }
  return true;
}
|
|
#endif
|
|
|
|
// ASCII validation that also reports where it failed.
// On the first byte >= 0x80 the result is (error_code::TOO_LARGE, index of that byte);
// otherwise the result is (error_code::SUCCESS, len).
inline simdutf_warn_unused result validate_with_errors(const char *buf, size_t len) noexcept {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  size_t offset = 0;
  // Skip 16-byte blocks that are entirely ASCII.
  while (offset + 16 < len) {
    uint64_t lo;
    uint64_t hi;
    std::memcpy(&lo, bytes + offset, sizeof(uint64_t));
    std::memcpy(&hi, bytes + offset + sizeof(uint64_t), sizeof(uint64_t));
    // A set high bit means the offending byte is inside this block; leave
    // `offset` at the block start and let the byte scan below pinpoint it.
    if (((lo | hi) & 0x8080808080808080) != 0) { break; }
    offset += 16;
  }
  // Byte-by-byte scan: handles the tail and locates an error found above.
  for (; offset < len; offset++) {
    if (bytes[offset] > 0x7F) { return result(error_code::TOO_LARGE, offset); }
  }
  return result(error_code::SUCCESS, offset);
}
|
|
|
|
} // ascii namespace
|
|
} // unnamed namespace
|
|
} // namespace scalar
|
|
} // namespace simdutf
|
|
|
|
#endif
|
|
/* end file src/scalar/ascii.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8.h
|
|
/* begin file src/scalar/utf8.h */
|
|
#ifndef SIMDUTF_UTF8_H
|
|
#define SIMDUTF_UTF8_H
|
|
|
|
namespace simdutf {
|
|
namespace scalar {
|
|
namespace {
|
|
namespace utf8 {
|
|
#if SIMDUTF_IMPLEMENTATION_FALLBACK
|
|
// only used by the fallback kernel.
|
|
// credit: based on code from Google Fuchsia (Apache Licensed)
|
|
// Scalar UTF-8 validator. Returns true when the `len` bytes at `buf` decode as
// a sequence of 1- to 4-byte UTF-8 characters with no truncated or overlong
// encodings, no surrogate code points and nothing above U+10FFFF.
inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  uint64_t cursor = 0;
  while (cursor < len) {
    // Fast path: when 16 bytes are readable, test them for ASCII in one go.
    if (cursor + 16 <= len) {
      uint64_t lo;
      uint64_t hi;
      std::memcpy(&lo, bytes + cursor, sizeof(uint64_t));
      std::memcpy(&hi, bytes + cursor + sizeof(uint64_t), sizeof(uint64_t));
      if (((lo | hi) & 0x8080808080808080) == 0) {
        cursor += 16;
        continue;
      }
    }

    // Walk over ASCII bytes until the first byte with its high bit set.
    unsigned char lead = bytes[cursor];
    while (lead < 0x80) {
      ++cursor;
      if (cursor == len) { return true; }
      lead = bytes[cursor];
    }

    // The lead byte determines how many bytes the character occupies.
    uint64_t seq_len;
    if ((lead & 0xE0) == 0xC0) {
      seq_len = 2;
    } else if ((lead & 0xF0) == 0xE0) {
      seq_len = 3;
    } else if ((lead & 0xF8) == 0xF0) {
      seq_len = 4;
    } else {
      // stray continuation byte or a lead byte that is never valid
      return false;
    }

    // The sequence must fit in the input and every trailing byte must look
    // like 10xxxxxx.
    if (cursor + seq_len > len) { return false; }
    for (uint64_t k = 1; k < seq_len; k++) {
      if ((bytes[cursor + k] & 0xC0) != 0x80) { return false; }
    }

    // Decode the code point and check it against the range allowed for a
    // sequence of this length.
    uint32_t code_point = 0;
    if (seq_len == 2) {
      code_point = uint32_t((lead & 0x1F) << 6 | (bytes[cursor + 1] & 0x3F));
      if (code_point < 0x80 || code_point > 0x7ff) { return false; }
    } else if (seq_len == 3) {
      code_point = uint32_t((lead & 0x0F) << 12 |
                            (bytes[cursor + 1] & 0x3F) << 6 |
                            (bytes[cursor + 2] & 0x3F));
      const bool is_surrogate = code_point > 0xd7ff && code_point < 0xe000;
      if (code_point < 0x800 || code_point > 0xffff || is_surrogate) { return false; }
    } else {
      code_point = uint32_t((lead & 0x07) << 18 |
                            (bytes[cursor + 1] & 0x3F) << 12 |
                            (bytes[cursor + 2] & 0x3F) << 6 |
                            (bytes[cursor + 3] & 0x3F));
      if (code_point <= 0xffff || code_point > 0x10ffff) { return false; }
    }
    cursor += seq_len;
  }
  return true;
}
|
|
#endif
|
|
|
|
// Scalar UTF-8 validator that reports the kind and location of the first error.
// On success the result is (error_code::SUCCESS, len). On failure the count is
// the index of the first byte of the offending sequence and the code is one of:
//   TOO_SHORT   - sequence cut off by end of input, or a trailing byte is not 10xxxxxx
//   OVERLONG    - code point below the minimum for the sequence length
//   SURROGATE   - 3-byte sequence decoding to U+D800..U+DFFF
//   TOO_LARGE   - 4-byte sequence decoding above U+10FFFF
//   TOO_LONG    - a continuation byte where a lead byte was expected
//   HEADER_BITS - any other byte that cannot start a sequence
inline simdutf_warn_unused result validate_with_errors(const char *buf, size_t len) noexcept {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  size_t cursor = 0;
  while (cursor < len) {
    // Fast path: when 16 bytes are readable, test them for ASCII in one go.
    if (cursor + 16 <= len) {
      uint64_t lo;
      uint64_t hi;
      std::memcpy(&lo, bytes + cursor, sizeof(uint64_t));
      std::memcpy(&hi, bytes + cursor + sizeof(uint64_t), sizeof(uint64_t));
      if (((lo | hi) & 0x8080808080808080) == 0) {
        cursor += 16;
        continue;
      }
    }

    // Walk over ASCII bytes until the first byte with its high bit set.
    unsigned char lead = bytes[cursor];
    while (lead < 0x80) {
      ++cursor;
      if (cursor == len) { return result(error_code::SUCCESS, len); }
      lead = bytes[cursor];
    }

    // The lead byte determines how many bytes the character occupies.
    size_t seq_len;
    if ((lead & 0xE0) == 0xC0) {
      seq_len = 2;
    } else if ((lead & 0xF0) == 0xE0) {
      seq_len = 3;
    } else if ((lead & 0xF8) == 0xF0) {
      seq_len = 4;
    } else if ((lead & 0xC0) == 0x80) {
      // a continuation byte with no lead byte in front of it
      return result(error_code::TOO_LONG, cursor);
    } else {
      return result(error_code::HEADER_BITS, cursor);
    }

    // The sequence must fit in the input and every trailing byte must look
    // like 10xxxxxx; both failures are reported as TOO_SHORT.
    if (cursor + seq_len > len) { return result(error_code::TOO_SHORT, cursor); }
    for (size_t k = 1; k < seq_len; k++) {
      if ((bytes[cursor + k] & 0xC0) != 0x80) { return result(error_code::TOO_SHORT, cursor); }
    }

    // Decode the code point and check it against the range allowed for a
    // sequence of this length.
    uint32_t code_point = 0;
    if (seq_len == 2) {
      code_point = uint32_t((lead & 0x1F) << 6 | (bytes[cursor + 1] & 0x3F));
      if (code_point < 0x80 || code_point > 0x7ff) { return result(error_code::OVERLONG, cursor); }
    } else if (seq_len == 3) {
      code_point = uint32_t((lead & 0x0F) << 12 |
                            (bytes[cursor + 1] & 0x3F) << 6 |
                            (bytes[cursor + 2] & 0x3F));
      if (code_point < 0x800 || code_point > 0xffff) { return result(error_code::OVERLONG, cursor); }
      if (code_point > 0xd7ff && code_point < 0xe000) { return result(error_code::SURROGATE, cursor); }
    } else {
      code_point = uint32_t((lead & 0x07) << 18 |
                            (bytes[cursor + 1] & 0x3F) << 12 |
                            (bytes[cursor + 2] & 0x3F) << 6 |
                            (bytes[cursor + 3] & 0x3F));
      if (code_point <= 0xffff) { return result(error_code::OVERLONG, cursor); }
      if (code_point > 0x10ffff) { return result(error_code::TOO_LARGE, cursor); }
    }
    cursor += seq_len;
  }
  return result(error_code::SUCCESS, len);
}
|
|
|
|
// Finds the previous leading byte and validates with errors from there
|
|
// Used to pinpoint the location of an error when an invalid chunk is detected
|
|
inline simdutf_warn_unused result rewind_and_validate_with_errors(const char *start, const char *buf, size_t len) noexcept {
|
|
// First check that we start with a leading byte
|
|
if ((*start & 0b11000000) == 0b10000000) {
|
|
return result(error_code::TOO_LONG, 0);
|
|
}
|
|
size_t extra_len{0};
|
|
// A leading byte cannot be further than 4 bytes away
|
|
for(int i = 0; i < 5; i++) {
|
|
unsigned char byte = *buf;
|
|
if ((byte & 0b11000000) != 0b10000000) {
|
|
break;
|
|
} else {
|
|
buf--;
|
|
extra_len++;
|
|
}
|
|
}
|
|
|
|
result res = validate_with_errors(buf, len + extra_len);
|
|
res.count -= extra_len;
|
|
return res;
|
|
}
|
|
|
|
// Counts the bytes in [buf, buf + len) that are not UTF-8 continuation bytes.
// Read as signed 8-bit values, continuation bytes (0x80..0xBF) are exactly the
// values <= -65, so every byte greater than -65 is counted.
inline size_t count_code_points(const char* buf, size_t len) {
  const int8_t *cursor = reinterpret_cast<const int8_t *>(buf);
  const int8_t *const stop = cursor + len;
  size_t total = 0;
  for (; cursor != stop; ++cursor) {
    total += (*cursor > -65) ? 1 : 0;
  }
  return total;
}
|
|
|
|
// Computes a UTF-16 code-unit count from UTF-8 bytes without validating them:
// every non-continuation byte contributes one unit, and every byte >= 0xF0
// (240) contributes one more.
inline size_t utf16_length_from_utf8(const char* buf, size_t len) {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  size_t units = 0;
  for (size_t k = 0; k != len; ++k) {
    const uint8_t b = bytes[k];
    // Not of the form 10xxxxxx; same set of bytes as "> -65" when read as int8_t.
    const bool starts_character = (b & 0xC0) != 0x80;
    const bool is_f0_or_above = b >= 0xF0;
    units += size_t(starts_character) + size_t(is_f0_or_above);
  }
  return units;
}
|
|
|
|
} // utf8 namespace
|
|
} // unnamed namespace
|
|
} // namespace scalar
|
|
} // namespace simdutf
|
|
|
|
#endif
|
|
/* end file src/scalar/utf8.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16.h
|
|
/* begin file src/scalar/utf16.h */
|
|
#ifndef SIMDUTF_UTF16_H
|
|
#define SIMDUTF_UTF16_H
|
|
|
|
namespace simdutf {
|
|
namespace scalar {
|
|
namespace {
|
|
namespace utf16 {
|
|
|
|
// Returns `word` with its two bytes exchanged.
inline simdutf_warn_unused uint16_t swap_bytes(const uint16_t word) {
  const uint16_t upper_moved_down = uint16_t(word >> 8);
  const uint16_t lower_moved_up = uint16_t(word << 8);
  return uint16_t(upper_moved_down | lower_moved_up);
}
|
|
|
|
template <endianness big_endian>
|
|
inline simdutf_warn_unused bool validate(const char16_t *buf, size_t len) noexcept {
|
|
const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
|
|
uint64_t pos = 0;
|
|
while (pos < len) {
|
|
uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
|
|
if((word &0xF800) == 0xD800) {
|
|
if(pos + 1 >= len) { return false; }
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
if(diff > 0x3FF) { return false; }
|
|
uint16_t next_word = !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
if(diff2 > 0x3FF) { return false; }
|
|
pos += 2;
|
|
} else {
|
|
pos++;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
template <endianness big_endian>
|
|
inline simdutf_warn_unused result validate_with_errors(const char16_t *buf, size_t len) noexcept {
|
|
const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
|
|
size_t pos = 0;
|
|
while (pos < len) {
|
|
uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
|
|
if((word & 0xF800) == 0xD800) {
|
|
if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); }
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); }
|
|
uint16_t next_word = !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); }
|
|
pos += 2;
|
|
} else {
|
|
pos++;
|
|
}
|
|
}
|
|
return result(error_code::SUCCESS, pos);
|
|
}
|
|
|
|
template <endianness big_endian>
|
|
inline size_t count_code_points(const char16_t* buf, size_t len) {
|
|
// We are not BOM aware.
|
|
const uint16_t * p = reinterpret_cast<const uint16_t *>(buf);
|
|
size_t counter{0};
|
|
for(size_t i = 0; i < len; i++) {
|
|
uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
|
|
counter += ((word & 0xFC00) != 0xDC00);
|
|
}
|
|
return counter;
|
|
}
|
|
|
|
template <endianness big_endian>
|
|
inline size_t utf8_length_from_utf16(const char16_t* buf, size_t len) {
|
|
// We are not BOM aware.
|
|
const uint16_t * p = reinterpret_cast<const uint16_t *>(buf);
|
|
size_t counter{0};
|
|
for(size_t i = 0; i < len; i++) {
|
|
uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
|
|
/** ASCII **/
|
|
if(word <= 0x7F) { counter++; }
|
|
/** two-byte **/
|
|
else if (word <= 0x7FF) { counter += 2; }
|
|
/** three-byte **/
|
|
else if((word <= 0xD7FF) || (word >= 0xE000)) { counter += 3; }
|
|
/** surrogates -- 4 bytes **/
|
|
else { counter += 2; }
|
|
}
|
|
return counter;
|
|
}
|
|
|
|
template <endianness big_endian>
|
|
inline size_t utf32_length_from_utf16(const char16_t* buf, size_t len) {
|
|
// We are not BOM aware.
|
|
const uint16_t * p = reinterpret_cast<const uint16_t *>(buf);
|
|
size_t counter{0};
|
|
for(size_t i = 0; i < len; i++) {
|
|
uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
|
|
counter += ((word & 0xFC00) != 0xDC00);
|
|
}
|
|
return counter;
|
|
}
|
|
|
|
// Writes to `out` the `size` code units read from `in`, each with its two
// bytes exchanged.
simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* out) {
  const uint16_t *src = reinterpret_cast<const uint16_t *>(in);
  uint16_t *dst = reinterpret_cast<uint16_t *>(out);
  for (size_t k = 0; k < size; k++) {
    const uint16_t unit = src[k];
    dst[k] = uint16_t((unit >> 8) | (unit << 8));
  }
}
|
|
|
|
} // utf16 namespace
|
|
} // unnamed namespace
|
|
} // namespace scalar
|
|
} // namespace simdutf
|
|
|
|
#endif
|
|
/* end file src/scalar/utf16.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32.h
|
|
/* begin file src/scalar/utf32.h */
|
|
#ifndef SIMDUTF_UTF32_H
|
|
#define SIMDUTF_UTF32_H
|
|
|
|
namespace simdutf {
|
|
namespace scalar {
|
|
namespace {
|
|
namespace utf32 {
|
|
|
|
// Returns true when each of the `len` UTF-32 values at `buf` is at most
// 0x10FFFF and lies outside the surrogate range 0xD800..0xDFFF.
inline simdutf_warn_unused bool validate(const char32_t *buf, size_t len) noexcept {
  const uint32_t *words = reinterpret_cast<const uint32_t *>(buf);
  uint64_t idx = 0;
  while (idx < len) {
    const uint32_t cp = words[idx++];
    const bool too_large = cp > 0x10FFFF;
    // 0xD800..0xDFFF is an aligned block of 0x800 values, so a mask test suffices.
    const bool is_surrogate = (cp & 0xFFFFF800) == 0xD800;
    if (too_large || is_surrogate) { return false; }
  }
  return true;
}
|
|
|
|
// Validates `len` UTF-32 values at `buf` and reports the first offending index:
// (error_code::TOO_LARGE, i) for a value above 0x10FFFF,
// (error_code::SURROGATE, i) for a value in 0xD800..0xDFFF,
// and (error_code::SUCCESS, len) when every value passes.
inline simdutf_warn_unused result validate_with_errors(const char32_t *buf, size_t len) noexcept {
  const uint32_t *words = reinterpret_cast<const uint32_t *>(buf);
  size_t idx = 0;
  while (idx < len) {
    const uint32_t cp = words[idx];
    if (cp > 0x10FFFF) {
      return result(error_code::TOO_LARGE, idx);
    }
    if ((cp & 0xFFFFF800) == 0xD800) {
      return result(error_code::SURROGATE, idx);
    }
    ++idx;
  }
  return result(error_code::SUCCESS, idx);
}
|
|
|
|
// Computes how many UTF-8 bytes the given UTF-32 values would occupy:
// 1 byte up to 0x7F, 2 up to 0x7FF, 3 up to 0xFFFF and 4 above that.
// No BOM handling and no validation is performed.
inline size_t utf8_length_from_utf32(const char32_t* buf, size_t len) {
  const uint32_t *words = reinterpret_cast<const uint32_t *>(buf);
  size_t total = 0;
  for (size_t k = 0; k < len; k++) {
    const uint32_t cp = words[k];
    // One byte always, plus one more for every size threshold that is exceeded.
    total += 1 + size_t(cp > 0x7F) + size_t(cp > 0x7FF) + size_t(cp > 0xFFFF);
  }
  return total;
}
|
|
|
|
// Computes how many UTF-16 code units the given UTF-32 values would occupy:
// one unit for values up to 0xFFFF, two (a surrogate pair) for anything larger.
// No BOM handling and no validation is performed.
inline size_t utf16_length_from_utf32(const char32_t* buf, size_t len) {
  const uint32_t *words = reinterpret_cast<const uint32_t *>(buf);
  size_t total = 0;
  for (size_t k = 0; k < len; k++) {
    total += (words[k] > 0xFFFF) ? 2 : 1;
  }
  return total;
}
|
|
|
|
} // utf32 namespace
|
|
} // unnamed namespace
|
|
} // namespace scalar
|
|
} // namespace simdutf
|
|
|
|
#endif
|
|
/* end file src/scalar/utf32.h */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32_to_utf8/valid_utf32_to_utf8.h
|
|
/* begin file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */
|
|
#ifndef SIMDUTF_VALID_UTF32_TO_UTF8_H
|
|
#define SIMDUTF_VALID_UTF32_TO_UTF8_H
|
|
|
|
namespace simdutf {
|
|
namespace scalar {
|
|
namespace {
|
|
namespace utf32_to_utf8 {
|
|
|
|
#if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
|
|
// only used by the fallback and POWER kernel
|
|
// Encodes `len` UTF-32 values from `buf` as UTF-8 into `utf8_output` and
// returns the number of bytes written. The input is not checked: every value
// is encoded according to its magnitude alone.
inline size_t convert_valid(const char32_t* buf, size_t len, char* utf8_output) {
  const uint32_t *words = reinterpret_cast<const uint32_t *>(buf);
  char *out = utf8_output;
  size_t idx = 0;
  while (idx < len) {
    // Fast path: when two values (8 bytes) are readable and both are ASCII,
    // emit them together.
    if (idx + 2 <= len) {
      uint64_t pair;
      ::memcpy(&pair, words + idx, sizeof(uint64_t));
      if ((pair & 0xFFFFFF80FFFFFF80) == 0) {
        out[0] = char(buf[idx]);
        out[1] = char(buf[idx + 1]);
        out += 2;
        idx += 2;
        continue;
      }
    }
    const uint32_t cp = words[idx];
    if (cp < 0x80) {
      // one byte: 0xxxxxxx
      *out++ = char(cp);
    } else if (cp < 0x800) {
      // two bytes: 110xxxxx 10xxxxxx
      *out++ = char((cp >> 6) | 0xC0);
      *out++ = char((cp & 0x3F) | 0x80);
    } else if (cp < 0x10000) {
      // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
      *out++ = char((cp >> 12) | 0xE0);
      *out++ = char(((cp >> 6) & 0x3F) | 0x80);
      *out++ = char((cp & 0x3F) | 0x80);
    } else {
      // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      *out++ = char((cp >> 18) | 0xF0);
      *out++ = char(((cp >> 12) & 0x3F) | 0x80);
      *out++ = char(((cp >> 6) & 0x3F) | 0x80);
      *out++ = char((cp & 0x3F) | 0x80);
    }
    ++idx;
  }
  return out - utf8_output;
}
|
|
#endif // SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
|
|
|
|
} // utf32_to_utf8 namespace
|
|
} // unnamed namespace
|
|
} // namespace scalar
|
|
} // namespace simdutf
|
|
|
|
#endif
|
|
/* end file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32_to_utf8/utf32_to_utf8.h
|
|
/* begin file src/scalar/utf32_to_utf8/utf32_to_utf8.h */
|
|
#ifndef SIMDUTF_UTF32_TO_UTF8_H
|
|
#define SIMDUTF_UTF32_TO_UTF8_H
|
|
|
|
namespace simdutf {
|
|
namespace scalar {
|
|
namespace {
|
|
namespace utf32_to_utf8 {
|
|
|
|
// Encodes `len` UTF-32 values from `buf` as UTF-8 into `utf8_output` and
// returns the number of bytes written. Returns 0 as soon as a value in the
// surrogate range 0xD800..0xDFFF or above 0x10FFFF is met; bytes already
// written for earlier values stay in the output buffer.
inline size_t convert(const char32_t* buf, size_t len, char* utf8_output) {
  const uint32_t *words = reinterpret_cast<const uint32_t *>(buf);
  char *out = utf8_output;
  size_t idx = 0;
  while (idx < len) {
    // Fast path: when two values (8 bytes) are readable and both are ASCII,
    // emit them together.
    if (idx + 2 <= len) {
      uint64_t pair;
      ::memcpy(&pair, words + idx, sizeof(uint64_t));
      if ((pair & 0xFFFFFF80FFFFFF80) == 0) {
        out[0] = char(buf[idx]);
        out[1] = char(buf[idx + 1]);
        out += 2;
        idx += 2;
        continue;
      }
    }
    const uint32_t cp = words[idx];
    if (cp < 0x80) {
      // one byte: 0xxxxxxx
      *out++ = char(cp);
    } else if (cp < 0x800) {
      // two bytes: 110xxxxx 10xxxxxx
      *out++ = char((cp >> 6) | 0xC0);
      *out++ = char((cp & 0x3F) | 0x80);
    } else if (cp < 0x10000) {
      // three bytes: 1110xxxx 10xxxxxx 10xxxxxx -- surrogates are rejected
      if (cp >= 0xD800 && cp <= 0xDFFF) { return 0; }
      *out++ = char((cp >> 12) | 0xE0);
      *out++ = char(((cp >> 6) & 0x3F) | 0x80);
      *out++ = char((cp & 0x3F) | 0x80);
    } else {
      // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx -- must not exceed 0x10FFFF
      if (cp > 0x10FFFF) { return 0; }
      *out++ = char((cp >> 18) | 0xF0);
      *out++ = char(((cp >> 12) & 0x3F) | 0x80);
      *out++ = char(((cp >> 6) & 0x3F) | 0x80);
      *out++ = char((cp & 0x3F) | 0x80);
    }
    ++idx;
  }
  return out - utf8_output;
}
|
|
|
|
// Encodes `len` UTF-32 values from `buf` as UTF-8 into `utf8_output`.
// On success the result is (error_code::SUCCESS, number of bytes written).
// On the first invalid value the result carries the index of that value in
// `buf`: error_code::SURROGATE for 0xD800..0xDFFF, error_code::TOO_LARGE for
// anything above 0x10FFFF. Bytes already written stay in the output buffer.
inline result convert_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
  const uint32_t *words = reinterpret_cast<const uint32_t *>(buf);
  char *out = utf8_output;
  size_t idx = 0;
  while (idx < len) {
    // Fast path: when two values (8 bytes) are readable and both are ASCII,
    // emit them together.
    if (idx + 2 <= len) {
      uint64_t pair;
      ::memcpy(&pair, words + idx, sizeof(uint64_t));
      if ((pair & 0xFFFFFF80FFFFFF80) == 0) {
        out[0] = char(buf[idx]);
        out[1] = char(buf[idx + 1]);
        out += 2;
        idx += 2;
        continue;
      }
    }
    const uint32_t cp = words[idx];
    if (cp < 0x80) {
      // one byte: 0xxxxxxx
      *out++ = char(cp);
    } else if (cp < 0x800) {
      // two bytes: 110xxxxx 10xxxxxx
      *out++ = char((cp >> 6) | 0xC0);
      *out++ = char((cp & 0x3F) | 0x80);
    } else if (cp < 0x10000) {
      // three bytes: 1110xxxx 10xxxxxx 10xxxxxx -- surrogates are rejected
      if (cp >= 0xD800 && cp <= 0xDFFF) { return result(error_code::SURROGATE, idx); }
      *out++ = char((cp >> 12) | 0xE0);
      *out++ = char(((cp >> 6) & 0x3F) | 0x80);
      *out++ = char((cp & 0x3F) | 0x80);
    } else {
      // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx -- must not exceed 0x10FFFF
      if (cp > 0x10FFFF) { return result(error_code::TOO_LARGE, idx); }
      *out++ = char((cp >> 18) | 0xF0);
      *out++ = char(((cp >> 12) & 0x3F) | 0x80);
      *out++ = char(((cp >> 6) & 0x3F) | 0x80);
      *out++ = char((cp & 0x3F) | 0x80);
    }
    ++idx;
  }
  return result(error_code::SUCCESS, out - utf8_output);
}
|
|
|
|
} // utf32_to_utf8 namespace
|
|
} // unnamed namespace
|
|
} // namespace scalar
|
|
} // namespace simdutf
|
|
|
|
#endif
|
|
/* end file src/scalar/utf32_to_utf8/utf32_to_utf8.h */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32_to_utf16/valid_utf32_to_utf16.h
|
|
/* begin file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */
|
|
#ifndef SIMDUTF_VALID_UTF32_TO_UTF16_H
|
|
#define SIMDUTF_VALID_UTF32_TO_UTF16_H
|
|
|
|
namespace simdutf {
|
|
namespace scalar {
|
|
namespace {
|
|
namespace utf32_to_utf16 {
|
|
|
|
template <endianness big_endian>
|
|
inline size_t convert_valid(const char32_t* buf, size_t len, char16_t* utf16_output) {
|
|
const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
|
|
size_t pos = 0;
|
|
char16_t* start{utf16_output};
|
|
while (pos < len) {
|
|
uint32_t word = data[pos];
|
|
if((word & 0xFFFF0000)==0) {
|
|
// will not generate a surrogate pair
|
|
*utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
|
|
pos++;
|
|
} else {
|
|
// will generate a surrogate pair
|
|
word -= 0x10000;
|
|
uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
|
|
uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
|
|
if (!match_system(big_endian)) {
|
|
high_surrogate = utf16::swap_bytes(high_surrogate);
|
|
low_surrogate = utf16::swap_bytes(low_surrogate);
|
|
}
|
|
*utf16_output++ = char16_t(high_surrogate);
|
|
*utf16_output++ = char16_t(low_surrogate);
|
|
pos++;
|
|
}
|
|
}
|
|
return utf16_output - start;
|
|
}
|
|
|
|
} // utf32_to_utf16 namespace
|
|
} // unnamed namespace
|
|
} // namespace scalar
|
|
} // namespace simdutf
|
|
|
|
#endif
|
|
/* end file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32_to_utf16/utf32_to_utf16.h
|
|
/* begin file src/scalar/utf32_to_utf16/utf32_to_utf16.h */
|
|
#ifndef SIMDUTF_UTF32_TO_UTF16_H
|
|
#define SIMDUTF_UTF32_TO_UTF16_H
|
|
|
|
namespace simdutf {
|
|
namespace scalar {
|
|
namespace {
|
|
namespace utf32_to_utf16 {
|
|
|
|
template <endianness big_endian>
|
|
inline size_t convert(const char32_t* buf, size_t len, char16_t* utf16_output) {
|
|
const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
|
|
size_t pos = 0;
|
|
char16_t* start{utf16_output};
|
|
while (pos < len) {
|
|
uint32_t word = data[pos];
|
|
if((word & 0xFFFF0000)==0) {
|
|
if (word >= 0xD800 && word <= 0xDFFF) { return 0; }
|
|
// will not generate a surrogate pair
|
|
*utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
|
|
} else {
|
|
// will generate a surrogate pair
|
|
if (word > 0x10FFFF) { return 0; }
|
|
word -= 0x10000;
|
|
uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
|
|
uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
|
|
if (!match_system(big_endian)) {
|
|
high_surrogate = utf16::swap_bytes(high_surrogate);
|
|
low_surrogate = utf16::swap_bytes(low_surrogate);
|
|
}
|
|
*utf16_output++ = char16_t(high_surrogate);
|
|
*utf16_output++ = char16_t(low_surrogate);
|
|
}
|
|
pos++;
|
|
}
|
|
return utf16_output - start;
|
|
}
|
|
|
|
template <endianness big_endian>
|
|
inline result convert_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) {
|
|
const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
|
|
size_t pos = 0;
|
|
char16_t* start{utf16_output};
|
|
while (pos < len) {
|
|
uint32_t word = data[pos];
|
|
if((word & 0xFFFF0000)==0) {
|
|
if (word >= 0xD800 && word <= 0xDFFF) { return result(error_code::SURROGATE, pos); }
|
|
// will not generate a surrogate pair
|
|
*utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
|
|
} else {
|
|
// will generate a surrogate pair
|
|
if (word > 0x10FFFF) { return result(error_code::TOO_LARGE, pos); }
|
|
word -= 0x10000;
|
|
uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
|
|
uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
|
|
if (!match_system(big_endian)) {
|
|
high_surrogate = utf16::swap_bytes(high_surrogate);
|
|
low_surrogate = utf16::swap_bytes(low_surrogate);
|
|
}
|
|
*utf16_output++ = char16_t(high_surrogate);
|
|
*utf16_output++ = char16_t(low_surrogate);
|
|
}
|
|
pos++;
|
|
}
|
|
return result(error_code::SUCCESS, utf16_output - start);
|
|
}
|
|
|
|
} // utf32_to_utf16 namespace
|
|
} // unnamed namespace
|
|
} // namespace scalar
|
|
} // namespace simdutf
|
|
|
|
#endif
|
|
/* end file src/scalar/utf32_to_utf16/utf32_to_utf16.h */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf8/valid_utf16_to_utf8.h
|
|
/* begin file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */
|
|
#ifndef SIMDUTF_VALID_UTF16_TO_UTF8_H
|
|
#define SIMDUTF_VALID_UTF16_TO_UTF8_H
|
|
|
|
namespace simdutf {
|
|
namespace scalar {
|
|
namespace {
|
|
namespace utf16_to_utf8 {
|
|
|
|
template <endianness big_endian>
|
|
inline size_t convert_valid(const char16_t* buf, size_t len, char* utf8_output) {
|
|
const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
|
|
size_t pos = 0;
|
|
char* start{utf8_output};
|
|
while (pos < len) {
|
|
// try to convert the next block of 4 ASCII characters
|
|
if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
|
|
uint64_t v;
|
|
::memcpy(&v, data + pos, sizeof(uint64_t));
|
|
if (!match_system(big_endian)) v = (v >> 8) | (v << (64 - 8));
|
|
if ((v & 0xFF80FF80FF80FF80) == 0) {
|
|
size_t final_pos = pos + 4;
|
|
while(pos < final_pos) {
|
|
*utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
|
|
pos++;
|
|
}
|
|
continue;
|
|
}
|
|
}
|
|
|
|
uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
|
|
if((word & 0xFF80)==0) {
|
|
// will generate one UTF-8 bytes
|
|
*utf8_output++ = char(word);
|
|
pos++;
|
|
} else if((word & 0xF800)==0) {
|
|
// will generate two UTF-8 bytes
|
|
// we have 0b110XXXXX 0b10XXXXXX
|
|
*utf8_output++ = char((word>>6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
pos++;
|
|
} else if((word &0xF800 ) != 0xD800) {
|
|
// will generate three UTF-8 bytes
|
|
// we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
|
|
*utf8_output++ = char((word>>12) | 0b11100000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
pos++;
|
|
} else {
|
|
// must be a surrogate pair
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
if(pos + 1 >= len) { return 0; } // minimal bound checking
|
|
uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
uint32_t value = (diff << 10) + diff2 + 0x10000;
|
|
// will generate four UTF-8 bytes
|
|
// we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
|
|
*utf8_output++ = char((value>>18) | 0b11110000);
|
|
*utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((value & 0b111111) | 0b10000000);
|
|
pos += 2;
|
|
}
|
|
}
|
|
return utf8_output - start;
|
|
}
|
|
|
|
} // utf16_to_utf8 namespace
|
|
} // unnamed namespace
|
|
} // namespace scalar
|
|
} // namespace simdutf
|
|
|
|
#endif
|
|
/* end file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf8/utf16_to_utf8.h
|
|
/* begin file src/scalar/utf16_to_utf8/utf16_to_utf8.h */
|
|
#ifndef SIMDUTF_UTF16_TO_UTF8_H
|
|
#define SIMDUTF_UTF16_TO_UTF8_H
|
|
|
|
namespace simdutf {
|
|
namespace scalar {
|
|
namespace {
|
|
namespace utf16_to_utf8 {
|
|
|
|
template <endianness big_endian>
|
|
inline size_t convert(const char16_t* buf, size_t len, char* utf8_output) {
|
|
const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
|
|
size_t pos = 0;
|
|
char* start{utf8_output};
|
|
while (pos < len) {
|
|
// try to convert the next block of 8 ASCII characters
|
|
if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
|
|
uint64_t v;
|
|
::memcpy(&v, data + pos, sizeof(uint64_t));
|
|
if (!match_system(big_endian)) v = (v >> 8) | (v << (64 - 8));
|
|
if ((v & 0xFF80FF80FF80FF80) == 0) {
|
|
size_t final_pos = pos + 4;
|
|
while(pos < final_pos) {
|
|
*utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
|
|
pos++;
|
|
}
|
|
continue;
|
|
}
|
|
}
|
|
uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
|
|
if((word & 0xFF80)==0) {
|
|
// will generate one UTF-8 bytes
|
|
*utf8_output++ = char(word);
|
|
pos++;
|
|
} else if((word & 0xF800)==0) {
|
|
// will generate two UTF-8 bytes
|
|
// we have 0b110XXXXX 0b10XXXXXX
|
|
*utf8_output++ = char((word>>6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
pos++;
|
|
} else if((word &0xF800 ) != 0xD800) {
|
|
// will generate three UTF-8 bytes
|
|
// we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
|
|
*utf8_output++ = char((word>>12) | 0b11100000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
pos++;
|
|
} else {
|
|
// must be a surrogate pair
|
|
if(pos + 1 >= len) { return 0; }
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
if(diff > 0x3FF) { return 0; }
|
|
uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
if(diff2 > 0x3FF) { return 0; }
|
|
uint32_t value = (diff << 10) + diff2 + 0x10000;
|
|
// will generate four UTF-8 bytes
|
|
// we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
|
|
*utf8_output++ = char((value>>18) | 0b11110000);
|
|
*utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((value & 0b111111) | 0b10000000);
|
|
pos += 2;
|
|
}
|
|
}
|
|
return utf8_output - start;
|
|
}
|
|
|
|
template <endianness big_endian>
|
|
inline result convert_with_errors(const char16_t* buf, size_t len, char* utf8_output) {
|
|
const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
|
|
size_t pos = 0;
|
|
char* start{utf8_output};
|
|
while (pos < len) {
|
|
// try to convert the next block of 8 ASCII characters
|
|
if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
|
|
uint64_t v;
|
|
::memcpy(&v, data + pos, sizeof(uint64_t));
|
|
if (!match_system(big_endian)) v = (v >> 8) | (v << (64 - 8));
|
|
if ((v & 0xFF80FF80FF80FF80) == 0) {
|
|
size_t final_pos = pos + 4;
|
|
while(pos < final_pos) {
|
|
*utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
|
|
pos++;
|
|
}
|
|
continue;
|
|
}
|
|
}
|
|
uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
|
|
if((word & 0xFF80)==0) {
|
|
// will generate one UTF-8 bytes
|
|
*utf8_output++ = char(word);
|
|
pos++;
|
|
} else if((word & 0xF800)==0) {
|
|
// will generate two UTF-8 bytes
|
|
// we have 0b110XXXXX 0b10XXXXXX
|
|
*utf8_output++ = char((word>>6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
pos++;
|
|
} else if((word &0xF800 ) != 0xD800) {
|
|
// will generate three UTF-8 bytes
|
|
// we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
|
|
*utf8_output++ = char((word>>12) | 0b11100000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
pos++;
|
|
} else {
|
|
// must be a surrogate pair
|
|
if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); }
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); }
|
|
uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); }
|
|
uint32_t value = (diff << 10) + diff2 + 0x10000;
|
|
// will generate four UTF-8 bytes
|
|
// we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
|
|
*utf8_output++ = char((value>>18) | 0b11110000);
|
|
*utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((value & 0b111111) | 0b10000000);
|
|
pos += 2;
|
|
}
|
|
}
|
|
return result(error_code::SUCCESS, utf8_output - start);
|
|
}
|
|
|
|
} // utf16_to_utf8 namespace
|
|
} // unnamed namespace
|
|
} // namespace scalar
|
|
} // namespace simdutf
|
|
|
|
#endif
|
|
/* end file src/scalar/utf16_to_utf8/utf16_to_utf8.h */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf32/valid_utf16_to_utf32.h
|
|
/* begin file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */
|
|
#ifndef SIMDUTF_VALID_UTF16_TO_UTF32_H
|
|
#define SIMDUTF_VALID_UTF16_TO_UTF32_H
|
|
|
|
namespace simdutf {
|
|
namespace scalar {
|
|
namespace {
|
|
namespace utf16_to_utf32 {
|
|
|
|
template <endianness big_endian>
|
|
inline size_t convert_valid(const char16_t* buf, size_t len, char32_t* utf32_output) {
|
|
const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
|
|
size_t pos = 0;
|
|
char32_t* start{utf32_output};
|
|
while (pos < len) {
|
|
uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
|
|
if((word &0xF800 ) != 0xD800) {
|
|
// No surrogate pair, extend 16-bit word to 32-bit word
|
|
*utf32_output++ = char32_t(word);
|
|
pos++;
|
|
} else {
|
|
// must be a surrogate pair
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
if(pos + 1 >= len) { return 0; } // minimal bound checking
|
|
uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
uint32_t value = (diff << 10) + diff2 + 0x10000;
|
|
*utf32_output++ = char32_t(value);
|
|
pos += 2;
|
|
}
|
|
}
|
|
return utf32_output - start;
|
|
}
|
|
|
|
} // utf16_to_utf32 namespace
|
|
} // unnamed namespace
|
|
} // namespace scalar
|
|
} // namespace simdutf
|
|
|
|
#endif
|
|
/* end file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf32/utf16_to_utf32.h
|
|
/* begin file src/scalar/utf16_to_utf32/utf16_to_utf32.h */
|
|
#ifndef SIMDUTF_UTF16_TO_UTF32_H
|
|
#define SIMDUTF_UTF16_TO_UTF32_H
|
|
|
|
namespace simdutf {
|
|
namespace scalar {
|
|
namespace {
|
|
namespace utf16_to_utf32 {
|
|
|
|
template <endianness big_endian>
|
|
inline size_t convert(const char16_t* buf, size_t len, char32_t* utf32_output) {
|
|
const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
|
|
size_t pos = 0;
|
|
char32_t* start{utf32_output};
|
|
while (pos < len) {
|
|
uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
|
|
if((word &0xF800 ) != 0xD800) {
|
|
// No surrogate pair, extend 16-bit word to 32-bit word
|
|
*utf32_output++ = char32_t(word);
|
|
pos++;
|
|
} else {
|
|
// must be a surrogate pair
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
if(diff > 0x3FF) { return 0; }
|
|
if(pos + 1 >= len) { return 0; } // minimal bound checking
|
|
uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
if(diff2 > 0x3FF) { return 0; }
|
|
uint32_t value = (diff << 10) + diff2 + 0x10000;
|
|
*utf32_output++ = char32_t(value);
|
|
pos += 2;
|
|
}
|
|
}
|
|
return utf32_output - start;
|
|
}
|
|
|
|
template <endianness big_endian>
|
|
inline result convert_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) {
|
|
const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
|
|
size_t pos = 0;
|
|
char32_t* start{utf32_output};
|
|
while (pos < len) {
|
|
uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
|
|
if((word &0xF800 ) != 0xD800) {
|
|
// No surrogate pair, extend 16-bit word to 32-bit word
|
|
*utf32_output++ = char32_t(word);
|
|
pos++;
|
|
} else {
|
|
// must be a surrogate pair
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); }
|
|
if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); } // minimal bound checking
|
|
uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); }
|
|
uint32_t value = (diff << 10) + diff2 + 0x10000;
|
|
*utf32_output++ = char32_t(value);
|
|
pos += 2;
|
|
}
|
|
}
|
|
return result(error_code::SUCCESS, utf32_output - start);
|
|
}
|
|
|
|
} // utf16_to_utf32 namespace
|
|
} // unnamed namespace
|
|
} // namespace scalar
|
|
} // namespace simdutf
|
|
|
|
#endif
|
|
/* end file src/scalar/utf16_to_utf32/utf16_to_utf32.h */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf16/valid_utf8_to_utf16.h
|
|
/* begin file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */
|
|
#ifndef SIMDUTF_VALID_UTF8_TO_UTF16_H
|
|
#define SIMDUTF_VALID_UTF8_TO_UTF16_H
|
|
|
|
namespace simdutf {
|
|
namespace scalar {
|
|
namespace {
|
|
namespace utf8_to_utf16 {
|
|
|
|
template <endianness big_endian>
|
|
inline size_t convert_valid(const char* buf, size_t len, char16_t* utf16_output) {
|
|
const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
|
|
size_t pos = 0;
|
|
char16_t* start{utf16_output};
|
|
while (pos < len) {
|
|
// try to convert the next block of 8 ASCII bytes
|
|
if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
|
|
uint64_t v;
|
|
::memcpy(&v, data + pos, sizeof(uint64_t));
|
|
if ((v & 0x8080808080808080) == 0) {
|
|
size_t final_pos = pos + 8;
|
|
while(pos < final_pos) {
|
|
*utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
|
|
pos++;
|
|
}
|
|
continue;
|
|
}
|
|
}
|
|
uint8_t leading_byte = data[pos]; // leading byte
|
|
if (leading_byte < 0b10000000) {
|
|
// converting one ASCII byte !!!
|
|
*utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)) : char16_t(leading_byte);
|
|
pos++;
|
|
} else if ((leading_byte & 0b11100000) == 0b11000000) {
|
|
// We have a two-byte UTF-8, it should become
|
|
// a single UTF-16 word.
|
|
if(pos + 1 >= len) { break; } // minimal bound checking
|
|
uint16_t code_point = uint16_t(((leading_byte &0b00011111) << 6) | (data[pos + 1] &0b00111111));
|
|
if (!match_system(big_endian)) {
|
|
code_point = utf16::swap_bytes(uint16_t(code_point));
|
|
}
|
|
*utf16_output++ = char16_t(code_point);
|
|
pos += 2;
|
|
} else if ((leading_byte & 0b11110000) == 0b11100000) {
|
|
// We have a three-byte UTF-8, it should become
|
|
// a single UTF-16 word.
|
|
if(pos + 2 >= len) { break; } // minimal bound checking
|
|
uint16_t code_point = uint16_t(((leading_byte &0b00001111) << 12) | ((data[pos + 1] &0b00111111) << 6) | (data[pos + 2] &0b00111111));
|
|
if (!match_system(big_endian)) {
|
|
code_point = utf16::swap_bytes(uint16_t(code_point));
|
|
}
|
|
*utf16_output++ = char16_t(code_point);
|
|
pos += 3;
|
|
} else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
|
|
// we have a 4-byte UTF-8 word.
|
|
if(pos + 3 >= len) { break; } // minimal bound checking
|
|
uint32_t code_point = ((leading_byte & 0b00000111) << 18 )| ((data[pos + 1] &0b00111111) << 12)
|
|
| ((data[pos + 2] &0b00111111) << 6) | (data[pos + 3] &0b00111111);
|
|
code_point -= 0x10000;
|
|
uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
|
|
uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
|
|
if (!match_system(big_endian)) {
|
|
high_surrogate = utf16::swap_bytes(high_surrogate);
|
|
low_surrogate = utf16::swap_bytes(low_surrogate);
|
|
}
|
|
*utf16_output++ = char16_t(high_surrogate);
|
|
*utf16_output++ = char16_t(low_surrogate);
|
|
pos += 4;
|
|
} else {
|
|
// we may have a continuation but we do not do error checking
|
|
return 0;
|
|
}
|
|
}
|
|
return utf16_output - start;
|
|
}
|
|
|
|
|
|
} // namespace utf8_to_utf16
|
|
} // unnamed namespace
|
|
} // namespace scalar
|
|
} // namespace simdutf
|
|
|
|
#endif
|
|
/* end file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf16/utf8_to_utf16.h
|
|
/* begin file src/scalar/utf8_to_utf16/utf8_to_utf16.h */
|
|
#ifndef SIMDUTF_UTF8_TO_UTF16_H
|
|
#define SIMDUTF_UTF8_TO_UTF16_H
|
|
|
|
namespace simdutf {
|
|
namespace scalar {
|
|
namespace {
|
|
namespace utf8_to_utf16 {
|
|
|
|
template <endianness big_endian>
|
|
inline size_t convert(const char* buf, size_t len, char16_t* utf16_output) {
|
|
const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
|
|
size_t pos = 0;
|
|
char16_t* start{utf16_output};
|
|
while (pos < len) {
|
|
// try to convert the next block of 16 ASCII bytes
|
|
if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
|
|
uint64_t v1;
|
|
::memcpy(&v1, data + pos, sizeof(uint64_t));
|
|
uint64_t v2;
|
|
::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
|
|
uint64_t v{v1 | v2};
|
|
if ((v & 0x8080808080808080) == 0) {
|
|
size_t final_pos = pos + 16;
|
|
while(pos < final_pos) {
|
|
*utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
|
|
pos++;
|
|
}
|
|
continue;
|
|
}
|
|
}
|
|
|
|
uint8_t leading_byte = data[pos]; // leading byte
|
|
if (leading_byte < 0b10000000) {
|
|
// converting one ASCII byte !!!
|
|
*utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)): char16_t(leading_byte);
|
|
pos++;
|
|
} else if ((leading_byte & 0b11100000) == 0b11000000) {
|
|
// We have a two-byte UTF-8, it should become
|
|
// a single UTF-16 word.
|
|
if(pos + 1 >= len) { return 0; } // minimal bound checking
|
|
if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
|
|
// range check
|
|
uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
|
|
if (code_point < 0x80 || 0x7ff < code_point) { return 0; }
|
|
if (!match_system(big_endian)) {
|
|
code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
|
|
}
|
|
*utf16_output++ = char16_t(code_point);
|
|
pos += 2;
|
|
} else if ((leading_byte & 0b11110000) == 0b11100000) {
|
|
// We have a three-byte UTF-8, it should become
|
|
// a single UTF-16 word.
|
|
if(pos + 2 >= len) { return 0; } // minimal bound checking
|
|
|
|
if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
|
|
if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; }
|
|
// range check
|
|
uint32_t code_point = (leading_byte & 0b00001111) << 12 |
|
|
(data[pos + 1] & 0b00111111) << 6 |
|
|
(data[pos + 2] & 0b00111111);
|
|
if (code_point < 0x800 || 0xffff < code_point ||
|
|
(0xd7ff < code_point && code_point < 0xe000)) {
|
|
return 0;
|
|
}
|
|
if (!match_system(big_endian)) {
|
|
code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
|
|
}
|
|
*utf16_output++ = char16_t(code_point);
|
|
pos += 3;
|
|
} else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
|
|
// we have a 4-byte UTF-8 word.
|
|
if(pos + 3 >= len) { return 0; } // minimal bound checking
|
|
if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
|
|
if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; }
|
|
if ((data[pos + 3] & 0b11000000) != 0b10000000) { return 0; }
|
|
|
|
// range check
|
|
uint32_t code_point =
|
|
(leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
|
|
(data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
|
|
if (code_point <= 0xffff || 0x10ffff < code_point) { return 0; }
|
|
code_point -= 0x10000;
|
|
uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
|
|
uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
|
|
if (!match_system(big_endian)) {
|
|
high_surrogate = utf16::swap_bytes(high_surrogate);
|
|
low_surrogate = utf16::swap_bytes(low_surrogate);
|
|
}
|
|
*utf16_output++ = char16_t(high_surrogate);
|
|
*utf16_output++ = char16_t(low_surrogate);
|
|
pos += 4;
|
|
} else {
|
|
return 0;
|
|
}
|
|
}
|
|
return utf16_output - start;
|
|
}
|
|
|
|
template <endianness big_endian>
|
|
inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_output) {
|
|
const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
|
|
size_t pos = 0;
|
|
char16_t* start{utf16_output};
|
|
while (pos < len) {
|
|
// try to convert the next block of 16 ASCII bytes
|
|
if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
|
|
uint64_t v1;
|
|
::memcpy(&v1, data + pos, sizeof(uint64_t));
|
|
uint64_t v2;
|
|
::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
|
|
uint64_t v{v1 | v2};
|
|
if ((v & 0x8080808080808080) == 0) {
|
|
size_t final_pos = pos + 16;
|
|
while(pos < final_pos) {
|
|
*utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
|
|
pos++;
|
|
}
|
|
continue;
|
|
}
|
|
}
|
|
uint8_t leading_byte = data[pos]; // leading byte
|
|
if (leading_byte < 0b10000000) {
|
|
// converting one ASCII byte !!!
|
|
*utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)): char16_t(leading_byte);
|
|
pos++;
|
|
} else if ((leading_byte & 0b11100000) == 0b11000000) {
|
|
// We have a two-byte UTF-8, it should become
|
|
// a single UTF-16 word.
|
|
if(pos + 1 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
|
|
if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
|
|
// range check
|
|
uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
|
|
if (code_point < 0x80 || 0x7ff < code_point) { return result(error_code::OVERLONG, pos); }
|
|
if (!match_system(big_endian)) {
|
|
code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
|
|
}
|
|
*utf16_output++ = char16_t(code_point);
|
|
pos += 2;
|
|
} else if ((leading_byte & 0b11110000) == 0b11100000) {
|
|
// We have a three-byte UTF-8, it should become
|
|
// a single UTF-16 word.
|
|
if(pos + 2 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
|
|
|
|
if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
|
|
if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
|
|
// range check
|
|
uint32_t code_point = (leading_byte & 0b00001111) << 12 |
|
|
(data[pos + 1] & 0b00111111) << 6 |
|
|
(data[pos + 2] & 0b00111111);
|
|
if ((code_point < 0x800) || (0xffff < code_point)) { return result(error_code::OVERLONG, pos);}
|
|
if (0xd7ff < code_point && code_point < 0xe000) { return result(error_code::SURROGATE, pos); }
|
|
if (!match_system(big_endian)) {
|
|
code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
|
|
}
|
|
*utf16_output++ = char16_t(code_point);
|
|
pos += 3;
|
|
} else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
|
|
// we have a 4-byte UTF-8 word.
|
|
if(pos + 3 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
|
|
if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
|
|
if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
|
|
if ((data[pos + 3] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
|
|
|
|
// range check
|
|
uint32_t code_point =
|
|
(leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
|
|
(data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
|
|
if (code_point <= 0xffff) { return result(error_code::OVERLONG, pos); }
|
|
if (0x10ffff < code_point) { return result(error_code::TOO_LARGE, pos); }
|
|
code_point -= 0x10000;
|
|
uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
|
|
uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
|
|
if (!match_system(big_endian)) {
|
|
high_surrogate = utf16::swap_bytes(high_surrogate);
|
|
low_surrogate = utf16::swap_bytes(low_surrogate);
|
|
}
|
|
*utf16_output++ = char16_t(high_surrogate);
|
|
*utf16_output++ = char16_t(low_surrogate);
|
|
pos += 4;
|
|
} else {
|
|
// we either have too many continuation bytes or an invalid leading byte
|
|
if ((leading_byte & 0b11000000) == 0b10000000) { return result(error_code::TOO_LONG, pos); }
|
|
else { return result(error_code::HEADER_BITS, pos); }
|
|
}
|
|
}
|
|
return result(error_code::SUCCESS, utf16_output - start);
|
|
}
|
|
|
|
/**
|
|
* When rewind_and_convert_with_errors is called, we are pointing at 'buf' and we have
|
|
* up to len input bytes left, and we encountered some error. It is possible that
|
|
* the error is at 'buf' exactly, but it could also be in the previous bytes (up to 3 bytes back).
|
|
*
|
|
* prior_bytes indicates how many bytes, prior to 'buf' may belong to the current memory section
|
|
* and can be safely accessed. We prior_bytes to access safely up to three bytes before 'buf'.
|
|
*
|
|
* The caller is responsible to ensure that len > 0.
|
|
*
|
|
* If the error is believed to have occured prior to 'buf', the count value contain in the result
|
|
* will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3.
|
|
*/
|
|
template <endianness endian>
|
|
inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char16_t* utf16_output) {
|
|
size_t extra_len{0};
|
|
// We potentially need to go back in time and find a leading byte.
|
|
// In theory '3' would be sufficient, but sometimes the error can go back quite far.
|
|
size_t how_far_back = prior_bytes;
|
|
// size_t how_far_back = 3; // 3 bytes in the past + current position
|
|
// if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; }
|
|
bool found_leading_bytes{false};
|
|
// important: it is i <= how_far_back and not 'i < how_far_back'.
|
|
for(size_t i = 0; i <= how_far_back; i++) {
|
|
unsigned char byte = buf[0-i];
|
|
found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
|
|
if(found_leading_bytes) {
|
|
buf -= i;
|
|
extra_len = i;
|
|
break;
|
|
}
|
|
}
|
|
//
|
|
// It is possible for this function to return a negative count in its result.
|
|
// C++ Standard Section 18.1 defines size_t is in <cstddef> which is described in C Standard as <stddef.h>.
|
|
// C Standard Section 4.1.5 defines size_t as an unsigned integral type of the result of the sizeof operator
|
|
//
|
|
// An unsigned type will simply wrap round arithmetically (well defined).
|
|
//
|
|
if(!found_leading_bytes) {
|
|
// If how_far_back == 3, we may have four consecutive continuation bytes!!!
|
|
// [....] [continuation] [continuation] [continuation] | [buf is continuation]
|
|
// Or we possibly have a stream that does not start with a leading byte.
|
|
return result(error_code::TOO_LONG, 0-how_far_back);
|
|
}
|
|
result res = convert_with_errors<endian>(buf, len + extra_len, utf16_output);
|
|
if (res.error) {
|
|
res.count -= extra_len;
|
|
}
|
|
return res;
|
|
}
|
|
|
|
} // utf8_to_utf16 namespace
|
|
} // unnamed namespace
|
|
} // namespace scalar
|
|
} // namespace simdutf
|
|
|
|
#endif
|
|
/* end file src/scalar/utf8_to_utf16/utf8_to_utf16.h */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf32/valid_utf8_to_utf32.h
|
|
/* begin file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */
|
|
#ifndef SIMDUTF_VALID_UTF8_TO_UTF32_H
|
|
#define SIMDUTF_VALID_UTF8_TO_UTF32_H
|
|
|
|
namespace simdutf {
|
|
namespace scalar {
|
|
namespace {
|
|
namespace utf8_to_utf32 {
|
|
|
|
/**
 * Transcodes UTF-8 input into UTF-32 without validating it.
 *
 * A multi-byte sequence cut off by the end of the input stops the conversion
 * and the values written so far are counted. A byte that is not a recognised
 * leading byte makes the function return 0.
 *
 * Returns the number of char32_t values written to utf32_output.
 */
inline size_t convert_valid(const char* buf, size_t len, char32_t* utf32_output) {
  const uint8_t* bytes = reinterpret_cast<const uint8_t*>(buf);
  char32_t* const out_begin = utf32_output;
  size_t idx = 0;
  while (idx < len) {
    // Fast path: eight bytes at once when all of them are ASCII.
    if (idx + 8 <= len) {
      uint64_t chunk;
      ::memcpy(&chunk, bytes + idx, sizeof(uint64_t));
      if ((chunk & 0x8080808080808080) == 0) {
        for (const size_t stop = idx + 8; idx < stop; idx++) {
          *utf32_output++ = char32_t(buf[idx]);
        }
        continue;
      }
    }

    const uint8_t lead = bytes[idx];
    if (lead < 0b10000000) {
      // Single ASCII byte.
      *utf32_output++ = char32_t(lead);
      idx++;
      continue;
    }
    if ((lead & 0b11100000) == 0b11000000) {
      // Two-byte sequence.
      if (idx + 1 >= len) { break; } // truncated input
      *utf32_output++ = char32_t(((lead & 0b00011111) << 6) | (bytes[idx + 1] & 0b00111111));
      idx += 2;
      continue;
    }
    if ((lead & 0b11110000) == 0b11100000) {
      // Three-byte sequence.
      if (idx + 2 >= len) { break; } // truncated input
      *utf32_output++ = char32_t(((lead & 0b00001111) << 12) |
                                 ((bytes[idx + 1] & 0b00111111) << 6) |
                                 (bytes[idx + 2] & 0b00111111));
      idx += 3;
      continue;
    }
    if ((lead & 0b11111000) == 0b11110000) {
      // Four-byte sequence.
      if (idx + 3 >= len) { break; } // truncated input
      const uint32_t cp = ((lead & 0b00000111) << 18) | ((bytes[idx + 1] & 0b00111111) << 12) |
                          ((bytes[idx + 2] & 0b00111111) << 6) | (bytes[idx + 3] & 0b00111111);
      *utf32_output++ = char32_t(cp);
      idx += 4;
      continue;
    }
    // Possibly a stray continuation byte; no error reporting in this variant.
    return 0;
  }
  return utf32_output - out_begin;
}
|
|
|
|
|
|
} // namespace utf8_to_utf32
|
|
} // unnamed namespace
|
|
} // namespace scalar
|
|
} // namespace simdutf
|
|
|
|
#endif
|
|
/* end file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf32/utf8_to_utf32.h
|
|
/* begin file src/scalar/utf8_to_utf32/utf8_to_utf32.h */
|
|
#ifndef SIMDUTF_UTF8_TO_UTF32_H
|
|
#define SIMDUTF_UTF8_TO_UTF32_H
|
|
|
|
namespace simdutf {
|
|
namespace scalar {
|
|
namespace {
|
|
namespace utf8_to_utf32 {
|
|
|
|
inline size_t convert(const char* buf, size_t len, char32_t* utf32_output) {
|
|
const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
|
|
size_t pos = 0;
|
|
char32_t* start{utf32_output};
|
|
while (pos < len) {
|
|
// try to convert the next block of 16 ASCII bytes
|
|
if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
|
|
uint64_t v1;
|
|
::memcpy(&v1, data + pos, sizeof(uint64_t));
|
|
uint64_t v2;
|
|
::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
|
|
uint64_t v{v1 | v2};
|
|
if ((v & 0x8080808080808080) == 0) {
|
|
size_t final_pos = pos + 16;
|
|
while(pos < final_pos) {
|
|
*utf32_output++ = char32_t(buf[pos]);
|
|
pos++;
|
|
}
|
|
continue;
|
|
}
|
|
}
|
|
uint8_t leading_byte = data[pos]; // leading byte
|
|
if (leading_byte < 0b10000000) {
|
|
// converting one ASCII byte !!!
|
|
*utf32_output++ = char32_t(leading_byte);
|
|
pos++;
|
|
} else if ((leading_byte & 0b11100000) == 0b11000000) {
|
|
// We have a two-byte UTF-8
|
|
if(pos + 1 >= len) { return 0; } // minimal bound checking
|
|
if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
|
|
// range check
|
|
uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
|
|
if (code_point < 0x80 || 0x7ff < code_point) { return 0; }
|
|
*utf32_output++ = char32_t(code_point);
|
|
pos += 2;
|
|
} else if ((leading_byte & 0b11110000) == 0b11100000) {
|
|
// We have a three-byte UTF-8
|
|
if(pos + 2 >= len) { return 0; } // minimal bound checking
|
|
|
|
if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
|
|
if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; }
|
|
// range check
|
|
uint32_t code_point = (leading_byte & 0b00001111) << 12 |
|
|
(data[pos + 1] & 0b00111111) << 6 |
|
|
(data[pos + 2] & 0b00111111);
|
|
if (code_point < 0x800 || 0xffff < code_point ||
|
|
(0xd7ff < code_point && code_point < 0xe000)) {
|
|
return 0;
|
|
}
|
|
*utf32_output++ = char32_t(code_point);
|
|
pos += 3;
|
|
} else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
|
|
// we have a 4-byte UTF-8 word.
|
|
if(pos + 3 >= len) { return 0; } // minimal bound checking
|
|
if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
|
|
if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; }
|
|
if ((data[pos + 3] & 0b11000000) != 0b10000000) { return 0; }
|
|
|
|
// range check
|
|
uint32_t code_point =
|
|
(leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
|
|
(data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
|
|
if (code_point <= 0xffff || 0x10ffff < code_point) { return 0; }
|
|
*utf32_output++ = char32_t(code_point);
|
|
pos += 4;
|
|
} else {
|
|
return 0;
|
|
}
|
|
}
|
|
return utf32_output - start;
|
|
}
|
|
|
|
/**
 * Transcodes UTF-8 into UTF-32 while validating the input, reporting the kind
 * and position of the first error.
 *
 * @param buf           UTF-8 input bytes
 * @param len           number of input bytes
 * @param utf32_output  destination; one char32_t is stored per decoded code point
 * @return on success, result(SUCCESS, number of char32_t written). On failure,
 *         result(code, pos) where pos is the index in buf of the leading byte
 *         of the offending sequence and code is one of TOO_SHORT, OVERLONG,
 *         SURROGATE, TOO_LARGE, TOO_LONG or HEADER_BITS.
 */
inline result convert_with_errors(const char* buf, size_t len, char32_t* utf32_output) {
  const uint8_t* const bytes = reinterpret_cast<const uint8_t*>(buf);
  char32_t* const out_begin = utf32_output;

  // True when the byte at index idx has the 10xxxxxx continuation pattern.
  const auto is_continuation = [bytes](size_t idx) {
    return (bytes[idx] & 0b11000000) == 0b10000000;
  };

  size_t pos = 0;
  while (pos < len) {
    // Fast path: when 16 more bytes are readable and none has its high bit
    // set, they are all ASCII and can be widened one by one.
    if (pos + 16 <= len) {
      uint64_t low_half;
      uint64_t high_half;
      ::memcpy(&low_half, bytes + pos, sizeof(uint64_t));
      ::memcpy(&high_half, bytes + pos + sizeof(uint64_t), sizeof(uint64_t));
      if (((low_half | high_half) & 0x8080808080808080) == 0) {
        for (const size_t stop = pos + 16; pos < stop; pos++) {
          *utf32_output++ = char32_t(buf[pos]);
        }
        continue;
      }
    }

    const uint8_t lead = bytes[pos];

    if (lead < 0b10000000) {
      // Single ASCII byte.
      *utf32_output++ = char32_t(lead);
      pos++;
    } else if ((lead & 0b11100000) == 0b11000000) {
      // Two-byte sequence.
      if (pos + 1 >= len) { return result(error_code::TOO_SHORT, pos); } // truncated
      if (!is_continuation(pos + 1)) { return result(error_code::TOO_SHORT, pos); }
      const uint32_t cp = (lead & 0b00011111) << 6 | (bytes[pos + 1] & 0b00111111);
      // Must lie in [0x80, 0x7ff].
      if (cp < 0x80 || 0x7ff < cp) { return result(error_code::OVERLONG, pos); }
      *utf32_output++ = char32_t(cp);
      pos += 2;
    } else if ((lead & 0b11110000) == 0b11100000) {
      // Three-byte sequence.
      if (pos + 2 >= len) { return result(error_code::TOO_SHORT, pos); } // truncated
      if (!is_continuation(pos + 1) || !is_continuation(pos + 2)) {
        return result(error_code::TOO_SHORT, pos);
      }
      const uint32_t cp = (lead & 0b00001111) << 12 |
                          (bytes[pos + 1] & 0b00111111) << 6 |
                          (bytes[pos + 2] & 0b00111111);
      // Range is checked before the surrogate test, so an out-of-range value
      // is always reported as OVERLONG.
      if (cp < 0x800 || 0xffff < cp) { return result(error_code::OVERLONG, pos); }
      if (0xd7ff < cp && cp < 0xe000) { return result(error_code::SURROGATE, pos); }
      *utf32_output++ = char32_t(cp);
      pos += 3;
    } else if ((lead & 0b11111000) == 0b11110000) {
      // Four-byte sequence.
      if (pos + 3 >= len) { return result(error_code::TOO_SHORT, pos); } // truncated
      if (!is_continuation(pos + 1) || !is_continuation(pos + 2) ||
          !is_continuation(pos + 3)) {
        return result(error_code::TOO_SHORT, pos);
      }
      const uint32_t cp = (lead & 0b00000111) << 18 |
                          (bytes[pos + 1] & 0b00111111) << 12 |
                          (bytes[pos + 2] & 0b00111111) << 6 |
                          (bytes[pos + 3] & 0b00111111);
      if (cp <= 0xffff) { return result(error_code::OVERLONG, pos); }
      if (0x10ffff < cp) { return result(error_code::TOO_LARGE, pos); }
      *utf32_output++ = char32_t(cp);
      pos += 4;
    } else if ((lead & 0b11000000) == 0b10000000) {
      // A continuation byte where a leading byte was expected.
      return result(error_code::TOO_LONG, pos);
    } else {
      // The byte is neither ASCII, a valid leading byte, nor a continuation byte.
      return result(error_code::HEADER_BITS, pos);
    }
  }

  return result(error_code::SUCCESS, utf32_output - out_begin);
}
|
|
|
|
/**
|
|
* When rewind_and_convert_with_errors is called, we are pointing at 'buf' and we have
|
|
* up to len input bytes left, and we encountered some error. It is possible that
|
|
* the error is at 'buf' exactly, but it could also be in the previous bytes location (up to 3 bytes back).
|
|
*
|
|
* prior_bytes indicates how many bytes, prior to 'buf' may belong to the current memory section
|
|
* and can be safely accessed. We prior_bytes to access safely up to three bytes before 'buf'.
|
|
*
|
|
* The caller is responsible to ensure that len > 0.
|
|
*
|
|
* If the error is believed to have occured prior to 'buf', the count value contain in the result
|
|
* will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3.
|
|
*/
|
|
inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char32_t* utf32_output) {
|
|
size_t extra_len{0};
|
|
// We potentially need to go back in time and find a leading byte.
|
|
size_t how_far_back = 3; // 3 bytes in the past + current position
|
|
if(how_far_back > prior_bytes) { how_far_back = prior_bytes; }
|
|
bool found_leading_bytes{false};
|
|
// important: it is i <= how_far_back and not 'i < how_far_back'.
|
|
for(size_t i = 0; i <= how_far_back; i++) {
|
|
unsigned char byte = buf[0-i];
|
|
found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
|
|
if(found_leading_bytes) {
|
|
buf -= i;
|
|
extra_len = i;
|
|
break;
|
|
}
|
|
}
|
|
//
|
|
// It is possible for this function to return a negative count in its result.
|
|
// C++ Standard Section 18.1 defines size_t is in <cstddef> which is described in C Standard as <stddef.h>.
|
|
// C Standard Section 4.1.5 defines size_t as an unsigned integral type of the result of the sizeof operator
|
|
//
|
|
// An unsigned type will simply wrap round arithmetically (well defined).
|
|
//
|
|
if(!found_leading_bytes) {
|
|
// If how_far_back == 3, we may have four consecutive continuation bytes!!!
|
|
// [....] [continuation] [continuation] [continuation] | [buf is continuation]
|
|
// Or we possibly have a stream that does not start with a leading byte.
|
|
return result(error_code::TOO_LONG, 0-how_far_back);
|
|
}
|
|
|
|
result res = convert_with_errors(buf, len + extra_len, utf32_output);
|
|
if (res.error) {
|
|
res.count -= extra_len;
|
|
}
|
|
return res;
|
|
}
|
|
|
|
} // utf8_to_utf32 namespace
|
|
} // unnamed namespace
|
|
} // namespace scalar
|
|
} // namespace simdutf
|
|
|
|
#endif
|
|
/* end file src/scalar/utf8_to_utf32/utf8_to_utf32.h */
|
|
//
|
|
|
|
|
|
SIMDUTF_PUSH_DISABLE_WARNINGS
|
|
SIMDUTF_DISABLE_UNDESIRED_WARNINGS
|
|
|
|
|
|
#if SIMDUTF_IMPLEMENTATION_ARM64
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/implementation.cpp
|
|
/* begin file src/arm64/implementation.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/begin.h
|
|
/* begin file src/simdutf/arm64/begin.h */
|
|
// redefining SIMDUTF_IMPLEMENTATION to "arm64"
|
|
// #define SIMDUTF_IMPLEMENTATION arm64
|
|
/* end file src/simdutf/arm64/begin.h */
|
|
namespace simdutf {
|
|
namespace arm64 {
|
|
namespace {
|
|
#ifndef SIMDUTF_ARM64_H
|
|
#error "arm64.h must be included"
|
|
#endif
|
|
using namespace simd;
|
|
|
|
// Returns true when no byte of the 64-byte block has its high bit set:
// all lanes are OR-reduced into one vector and its largest byte is compared
// against 0x80.
simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
  const simd8<uint8_t> combined = input.reduce_or();
  return combined.max_val() < 0b10000000u;
}
|
|
|
|
// Flags the positions that have to hold a continuation byte, given the input
// shifted back by one (prev1), two (prev2) and three (prev3) bytes:
//  - the previous byte is >= 0b11000000 (start of a 2+ byte sequence), or
//  - the byte two back is >= 0b11100000 (start of a 3+ byte sequence), or
//  - the byte three back is >= 0b11110000 (start of a 4 byte sequence).
simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  const simd8<bool> follows_lead_of_2 = prev1 >= uint8_t(0b11000000u);
  const simd8<bool> follows_lead_of_3 = prev2 >= uint8_t(0b11100000u);
  const simd8<bool> follows_lead_of_4 = prev3 >= uint8_t(0b11110000u);
  // The three masks are combined with ^ rather than |: ^ is commutative and
  // the caller uses ^ as well. This is fine because errors only need to be
  // reported for cases with 0-1 lead bytes. Several lead bytes imply two
  // overlapping multibyte characters, and in that case at least one lead byte
  // is part of only one other multibyte character, so the error is detected
  // there.
  return follows_lead_of_2 ^ follows_lead_of_3 ^ follows_lead_of_4;
}
|
|
|
|
// Flags the positions that have to hold the second or third continuation byte
// of a sequence: the byte two back (prev2) is >= 0b11100000, or the byte three
// back (prev3) is >= 0b11110000. The two masks are combined with ^.
simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  const simd8<bool> follows_lead_of_3 = prev2 >= uint8_t(0b11100000u);
  const simd8<bool> follows_lead_of_4 = prev3 >= uint8_t(0b11110000u);
  return follows_lead_of_3 ^ follows_lead_of_4;
}
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_detect_encodings.cpp
|
|
/* begin file src/arm64/arm_detect_encodings.cpp */
|
|
template<class checker>
|
|
// len is known to be a multiple of 2 when this is called
|
|
int arm_detect_encodings(const char * buf, size_t len) {
|
|
const char* start = buf;
|
|
const char* end = buf + len;
|
|
|
|
bool is_utf8 = true;
|
|
bool is_utf16 = true;
|
|
bool is_utf32 = true;
|
|
|
|
int out = 0;
|
|
|
|
const auto v_d8 = simd8<uint8_t>::splat(0xd8);
|
|
const auto v_f8 = simd8<uint8_t>::splat(0xf8);
|
|
|
|
uint32x4_t currentmax = vmovq_n_u32(0x0);
|
|
|
|
checker check{};
|
|
|
|
while(buf + 64 <= end) {
|
|
uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t*>(buf));
|
|
uint16x8_t secondin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + simd16<uint16_t>::SIZE / sizeof(char16_t));
|
|
uint16x8_t thirdin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 2*simd16<uint16_t>::SIZE / sizeof(char16_t));
|
|
uint16x8_t fourthin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 3*simd16<uint16_t>::SIZE / sizeof(char16_t));
|
|
|
|
const auto u0 = simd16<uint16_t>(in);
|
|
const auto u1 = simd16<uint16_t>(secondin);
|
|
const auto u2 = simd16<uint16_t>(thirdin);
|
|
const auto u3 = simd16<uint16_t>(fourthin);
|
|
|
|
const auto v0 = u0.shr<8>();
|
|
const auto v1 = u1.shr<8>();
|
|
const auto v2 = u2.shr<8>();
|
|
const auto v3 = u3.shr<8>();
|
|
|
|
const auto in16 = simd16<uint16_t>::pack(v0, v1);
|
|
const auto nextin16 = simd16<uint16_t>::pack(v2, v3);
|
|
|
|
const uint64_t surrogates_wordmask0 = ((in16 & v_f8) == v_d8).to_bitmask64();
|
|
const uint64_t surrogates_wordmask1 = ((nextin16 & v_f8) == v_d8).to_bitmask64();
|
|
|
|
// Check for surrogates
|
|
if (surrogates_wordmask0 != 0 || surrogates_wordmask1 != 0) {
|
|
// Cannot be UTF8
|
|
is_utf8 = false;
|
|
// Can still be either UTF-16LE or UTF-32 depending on the positions of the surrogates
|
|
// To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word.
|
|
// On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant
|
|
// bytes of a 32-bit word since they always come in pairs in UTF-16LE.
|
|
// Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit words.
|
|
|
|
if (((surrogates_wordmask0 | surrogates_wordmask1) & 0xf0f0f0f0f0f0f0f0) != 0) {
|
|
is_utf32 = false;
|
|
// Code from arm_validate_utf16le.cpp
|
|
// Not efficient, we do not process surrogates_wordmask1
|
|
const char16_t * input = reinterpret_cast<const char16_t*>(buf);
|
|
const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len/2;
|
|
|
|
const auto v_fc = simd8<uint8_t>::splat(0xfc);
|
|
const auto v_dc = simd8<uint8_t>::splat(0xdc);
|
|
|
|
const uint64_t V0 = ~surrogates_wordmask0;
|
|
|
|
const auto vH0 = ((in16 & v_fc) == v_dc);
|
|
const uint64_t H0 = vH0.to_bitmask64();
|
|
|
|
const uint64_t L0 = ~H0 & surrogates_wordmask0;
|
|
|
|
const uint64_t a0 = L0 & (H0 >> 4);
|
|
|
|
const uint64_t b0 = a0 << 4;
|
|
|
|
const uint64_t c0 = V0 | a0 | b0;
|
|
if (c0 == ~0ull) {
|
|
input += 16;
|
|
} else if (c0 == 0xfffffffffffffffull) {
|
|
input += 15;
|
|
} else {
|
|
is_utf16 = false;
|
|
break;
|
|
}
|
|
|
|
while (input + 16 < end16) {
|
|
const auto in0 = simd16<uint16_t>(input);
|
|
const auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
|
|
const auto t0 = in0.shr<8>();
|
|
const auto t1 = in1.shr<8>();
|
|
const simd8<uint8_t> in_16 = simd16<uint16_t>::pack(t0, t1);
|
|
|
|
const uint64_t surrogates_wordmask = ((in_16 & v_f8) == v_d8).to_bitmask64();
|
|
if(surrogates_wordmask == 0) {
|
|
input += 16;
|
|
} else {
|
|
const uint64_t V = ~surrogates_wordmask;
|
|
|
|
const auto vH = ((in_16 & v_fc) == v_dc);
|
|
const uint64_t H = vH.to_bitmask64();
|
|
|
|
const uint64_t L = ~H & surrogates_wordmask;
|
|
|
|
const uint64_t a = L & (H >> 4);
|
|
|
|
const uint64_t b = a << 4;
|
|
|
|
const uint64_t c = V | a | b;
|
|
if (c == ~0ull) {
|
|
input += 16;
|
|
} else if (c == 0xfffffffffffffffull) {
|
|
input += 15;
|
|
} else {
|
|
is_utf16 = false;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
is_utf16 = false;
|
|
// Check for UTF-32
|
|
if (len % 4 == 0) {
|
|
const char32_t * input = reinterpret_cast<const char32_t*>(buf);
|
|
const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len/4;
|
|
|
|
// Must start checking for surrogates
|
|
uint32x4_t currentoffsetmax = vmovq_n_u32(0x0);
|
|
const uint32x4_t offset = vmovq_n_u32(0xffff2000);
|
|
const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff);
|
|
|
|
const uint32x4_t in32 = vreinterpretq_u32_u16(in);
|
|
const uint32x4_t secondin32 = vreinterpretq_u32_u16(secondin);
|
|
const uint32x4_t thirdin32 = vreinterpretq_u32_u16(thirdin);
|
|
const uint32x4_t fourthin32 = vreinterpretq_u32_u16(fourthin);
|
|
|
|
currentmax = vmaxq_u32(in32,currentmax);
|
|
currentmax = vmaxq_u32(secondin32,currentmax);
|
|
currentmax = vmaxq_u32(thirdin32,currentmax);
|
|
currentmax = vmaxq_u32(fourthin32,currentmax);
|
|
|
|
currentoffsetmax = vmaxq_u32(vaddq_u32(in32, offset), currentoffsetmax);
|
|
currentoffsetmax = vmaxq_u32(vaddq_u32(secondin32, offset), currentoffsetmax);
|
|
currentoffsetmax = vmaxq_u32(vaddq_u32(thirdin32, offset), currentoffsetmax);
|
|
currentoffsetmax = vmaxq_u32(vaddq_u32(fourthin32, offset), currentoffsetmax);
|
|
|
|
while (input + 4 < end32) {
|
|
const uint32x4_t in_32 = vld1q_u32(reinterpret_cast<const uint32_t*>(input));
|
|
currentmax = vmaxq_u32(in_32,currentmax);
|
|
currentoffsetmax = vmaxq_u32(vaddq_u32(in_32, offset), currentoffsetmax);
|
|
input += 4;
|
|
}
|
|
|
|
uint32x4_t forbidden_words = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
|
|
if(vmaxvq_u32(forbidden_words) != 0) {
|
|
is_utf32 = false;
|
|
}
|
|
} else {
|
|
is_utf32 = false;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
// If no surrogate, validate under other encodings as well
|
|
|
|
// UTF-32 validation
|
|
currentmax = vmaxq_u32(vreinterpretq_u32_u16(in),currentmax);
|
|
currentmax = vmaxq_u32(vreinterpretq_u32_u16(secondin),currentmax);
|
|
currentmax = vmaxq_u32(vreinterpretq_u32_u16(thirdin),currentmax);
|
|
currentmax = vmaxq_u32(vreinterpretq_u32_u16(fourthin),currentmax);
|
|
|
|
// UTF-8 validation
|
|
// Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h
|
|
simd::simd8x64<uint8_t> in8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(secondin), vreinterpretq_u8_u16(thirdin), vreinterpretq_u8_u16(fourthin));
|
|
check.check_next_input(in8);
|
|
|
|
buf += 64;
|
|
}
|
|
|
|
// Check which encodings are possible
|
|
|
|
if (is_utf8) {
|
|
if (static_cast<size_t>(buf - start) != len) {
|
|
uint8_t block[64]{};
|
|
std::memset(block, 0x20, 64);
|
|
std::memcpy(block, buf, len - (buf - start));
|
|
simd::simd8x64<uint8_t> in(block);
|
|
check.check_next_input(in);
|
|
}
|
|
if (!check.errors()) {
|
|
out |= simdutf::encoding_type::UTF8;
|
|
}
|
|
}
|
|
|
|
if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start))/2)) {
|
|
out |= simdutf::encoding_type::UTF16_LE;
|
|
}
|
|
|
|
if (is_utf32 && (len % 4 == 0)) {
|
|
const uint32x4_t standardmax = vmovq_n_u32(0x10ffff);
|
|
uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
|
|
if (vmaxvq_u32(is_zero) == 0 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start))/4)) {
|
|
out |= simdutf::encoding_type::UTF32_LE;
|
|
}
|
|
}
|
|
|
|
return out;
|
|
}
|
|
/* end file src/arm64/arm_detect_encodings.cpp */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_validate_utf16.cpp
|
|
/* begin file src/arm64/arm_validate_utf16.cpp */
|
|
template <endianness big_endian>
|
|
const char16_t* arm_validate_utf16(const char16_t* input, size_t size) {
|
|
const char16_t* end = input + size;
|
|
const auto v_d8 = simd8<uint8_t>::splat(0xd8);
|
|
const auto v_f8 = simd8<uint8_t>::splat(0xf8);
|
|
const auto v_fc = simd8<uint8_t>::splat(0xfc);
|
|
const auto v_dc = simd8<uint8_t>::splat(0xdc);
|
|
while (input + 16 < end) {
|
|
// 0. Load data: since the validation takes into account only higher
|
|
// byte of each word, we compress the two vectors into one which
|
|
// consists only the higher bytes.
|
|
auto in0 = simd16<uint16_t>(input);
|
|
auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
|
|
if (!match_system(big_endian)) {
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
#else
|
|
const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
|
|
#endif
|
|
in0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in0), swap));
|
|
in1 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in1), swap));
|
|
}
|
|
const auto t0 = in0.shr<8>();
|
|
const auto t1 = in1.shr<8>();
|
|
const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1);
|
|
// 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
|
|
const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64();
|
|
if(surrogates_wordmask == 0) {
|
|
input += 16;
|
|
} else {
|
|
// 2. We have some surrogates that have to be distinguished:
|
|
// - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
|
|
// - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
|
|
//
|
|
// Fact: high surrogate has 11th bit set (3rd bit in the higher word)
|
|
|
|
// V - non-surrogate words
|
|
// V = not surrogates_wordmask
|
|
const uint64_t V = ~surrogates_wordmask;
|
|
|
|
// H - word-mask for high surrogates: the six highest bits are 0b1101'11
|
|
const auto vH = ((in & v_fc) == v_dc);
|
|
const uint64_t H = vH.to_bitmask64();
|
|
|
|
// L - word mask for low surrogates
|
|
// L = not H and surrogates_wordmask
|
|
const uint64_t L = ~H & surrogates_wordmask;
|
|
|
|
const uint64_t a = L & (H >> 4); // A low surrogate must be followed by high one.
|
|
// (A low surrogate placed in the 7th register's word
|
|
// is an exception we handle.)
|
|
const uint64_t b = a << 4; // Just mark that the opposite fact is hold,
|
|
// thanks to that we have only two masks for valid case.
|
|
const uint64_t c = V | a | b; // Combine all the masks into the final one.
|
|
if (c == ~0ull) {
|
|
// The whole input register contains valid UTF-16, i.e.,
|
|
// either single words or proper surrogate pairs.
|
|
input += 16;
|
|
} else if (c == 0xfffffffffffffffull) {
|
|
// The 15 lower words of the input register contains valid UTF-16.
|
|
// The 15th word may be either a low or high surrogate. It the next
|
|
// iteration we 1) check if the low surrogate is followed by a high
|
|
// one, 2) reject sole high surrogate.
|
|
input += 15;
|
|
} else {
|
|
return nullptr;
|
|
}
|
|
}
|
|
}
|
|
return input;
|
|
}
|
|
|
|
|
|
template <endianness big_endian>
|
|
const result arm_validate_utf16_with_errors(const char16_t* input, size_t size) {
|
|
const char16_t* start = input;
|
|
const char16_t* end = input + size;
|
|
|
|
const auto v_d8 = simd8<uint8_t>::splat(0xd8);
|
|
const auto v_f8 = simd8<uint8_t>::splat(0xf8);
|
|
const auto v_fc = simd8<uint8_t>::splat(0xfc);
|
|
const auto v_dc = simd8<uint8_t>::splat(0xdc);
|
|
while (input + 16 < end) {
|
|
// 0. Load data: since the validation takes into account only higher
|
|
// byte of each word, we compress the two vectors into one which
|
|
// consists only the higher bytes.
|
|
auto in0 = simd16<uint16_t>(input);
|
|
auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
|
|
|
|
if (!match_system(big_endian)) {
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
#else
|
|
const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
|
|
#endif
|
|
in0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in0), swap));
|
|
in1 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in1), swap));
|
|
}
|
|
const auto t0 = in0.shr<8>();
|
|
const auto t1 = in1.shr<8>();
|
|
const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1);
|
|
// 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
|
|
const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64();
|
|
if(surrogates_wordmask == 0) {
|
|
input += 16;
|
|
} else {
|
|
// 2. We have some surrogates that have to be distinguished:
|
|
// - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
|
|
// - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
|
|
//
|
|
// Fact: high surrogate has 11th bit set (3rd bit in the higher word)
|
|
|
|
// V - non-surrogate words
|
|
// V = not surrogates_wordmask
|
|
const uint64_t V = ~surrogates_wordmask;
|
|
|
|
// H - word-mask for high surrogates: the six highest bits are 0b1101'11
|
|
const auto vH = ((in & v_fc) == v_dc);
|
|
const uint64_t H = vH.to_bitmask64();
|
|
|
|
// L - word mask for low surrogates
|
|
// L = not H and surrogates_wordmask
|
|
const uint64_t L = ~H & surrogates_wordmask;
|
|
|
|
const uint64_t a = L & (H >> 4); // A low surrogate must be followed by high one.
|
|
// (A low surrogate placed in the 7th register's word
|
|
// is an exception we handle.)
|
|
const uint64_t b = a << 4; // Just mark that the opposite fact is hold,
|
|
// thanks to that we have only two masks for valid case.
|
|
const uint64_t c = V | a | b; // Combine all the masks into the final one.
|
|
if (c == ~0ull) {
|
|
// The whole input register contains valid UTF-16, i.e.,
|
|
// either single words or proper surrogate pairs.
|
|
input += 16;
|
|
} else if (c == 0xfffffffffffffffull) {
|
|
// The 15 lower words of the input register contains valid UTF-16.
|
|
// The 15th word may be either a low or high surrogate. It the next
|
|
// iteration we 1) check if the low surrogate is followed by a high
|
|
// one, 2) reject sole high surrogate.
|
|
input += 15;
|
|
} else {
|
|
return result(error_code::SURROGATE, input - start);
|
|
}
|
|
}
|
|
}
|
|
return result(error_code::SUCCESS, input - start);
|
|
}
|
|
/* end file src/arm64/arm_validate_utf16.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_validate_utf32le.cpp
|
|
/* begin file src/arm64/arm_validate_utf32le.cpp */
|
|
|
|
const char32_t* arm_validate_utf32le(const char32_t* input, size_t size) {
|
|
const char32_t* end = input + size;
|
|
|
|
const uint32x4_t standardmax = vmovq_n_u32(0x10ffff);
|
|
const uint32x4_t offset = vmovq_n_u32(0xffff2000);
|
|
const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff);
|
|
uint32x4_t currentmax = vmovq_n_u32(0x0);
|
|
uint32x4_t currentoffsetmax = vmovq_n_u32(0x0);
|
|
|
|
while (input + 4 < end) {
|
|
const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(input));
|
|
currentmax = vmaxq_u32(in,currentmax);
|
|
currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax);
|
|
input += 4;
|
|
}
|
|
|
|
uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
|
|
if(vmaxvq_u32(is_zero) != 0) {
|
|
return nullptr;
|
|
}
|
|
|
|
is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
|
|
if(vmaxvq_u32(is_zero) != 0) {
|
|
return nullptr;
|
|
}
|
|
|
|
return input;
|
|
}
|
|
|
|
|
|
/**
 * Validates UTF-32 input four code units at a time and reports where
 * validation stopped.
 *
 * Two running per-lane maxima are kept: one over the raw values (to detect
 * anything above 0x10ffff) and one over the values shifted by 0xffff2000,
 * which moves the surrogate range 0xD800..0xDFFF to 0xFFFFF800..0xFFFFFFFF,
 * i.e. above 0xfffff7ff. Both are inspected after every group of four so the
 * error can be located.
 *
 * @param input  UTF-32 code units
 * @param size   number of code units
 * @return result(TOO_LARGE, offset) or result(SURROGATE, offset), where offset
 *         is the start of the group of four in which the problem first showed
 *         up; otherwise result(SUCCESS, offset) with the number of code units
 *         examined here (the loop only runs while more than 4 code units
 *         remain).
 */
const result arm_validate_utf32le_with_errors(const char32_t* input, size_t size) {
  const char32_t* start = input;
  const char32_t* end = input + size;

  const uint32x4_t standardmax = vmovq_n_u32(0x10ffff);
  const uint32x4_t offset = vmovq_n_u32(0xffff2000);
  const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff);
  uint32x4_t currentmax = vmovq_n_u32(0x0);
  uint32x4_t currentoffsetmax = vmovq_n_u32(0x0);

  for (; input + 4 < end; input += 4) {
    const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(input));
    currentmax = vmaxq_u32(in,currentmax);
    currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax);

    // max(x, limit) ^ limit is zero in every lane exactly when x <= limit there.
    const uint32x4_t above_max = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
    if (vmaxvq_u32(above_max) != 0) {
      return result(error_code::TOO_LARGE, input - start);
    }

    const uint32x4_t in_surrogate_range = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
    if (vmaxvq_u32(in_surrogate_range) != 0) {
      return result(error_code::SURROGATE, input - start);
    }
  }

  return result(error_code::SUCCESS, input - start);
}
|
|
/* end file src/arm64/arm_validate_utf32le.cpp */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf8_to_utf16.cpp
|
|
/* begin file src/arm64/arm_convert_utf8_to_utf16.cpp */
|
|
// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the
|
|
// end of the code points. Only the least significant 12 bits of the mask
|
|
// are accessed.
|
|
// It returns how many bytes were consumed (up to 12).
|
|
template <endianness big_endian>
|
|
size_t convert_masked_utf8_to_utf16(const char *input,
|
|
uint64_t utf8_end_of_code_point_mask,
|
|
char16_t *&utf16_output) {
|
|
// we use an approach where we try to process up to 12 input bytes.
|
|
// Why 12 input bytes and not 16? Because we are concerned with the size of
|
|
// the lookup tables. Also 12 is nicely divisible by two and three.
|
|
//
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
#else
|
|
const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
|
|
#endif
|
|
uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t*>(input));
|
|
const uint16_t input_utf8_end_of_code_point_mask =
|
|
utf8_end_of_code_point_mask & 0xfff;
|
|
//
|
|
// Optimization note: our main path below is load-latency dependent. Thus it is maybe
|
|
// beneficial to have fast paths that depend on branch prediction but have less latency.
|
|
// This results in more instructions but, potentially, also higher speeds.
|
|
//
|
|
// We first try a few fast paths.
|
|
if((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
|
|
// We process in chunks of 16 bytes
|
|
uint16x8_t ascii_first = vmovl_u8(vget_low_u8 (in));
|
|
uint16x8_t ascii_second = vmovl_high_u8(in);
|
|
if (!match_system(big_endian)) {
|
|
ascii_first = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(ascii_first), swap));
|
|
ascii_second = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(ascii_second), swap));
|
|
}
|
|
vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output), ascii_first);
|
|
vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output) + 8, ascii_second);
|
|
utf16_output += 16; // We wrote 16 16-bit characters.
|
|
return 16; // We consumed 16 bytes.
|
|
}
|
|
if((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa) {
|
|
// We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
|
|
// There is probably a more efficient sequence, but the following might do.
|
|
uint8x16_t perm = vqtbl1q_u8(in, swap);
|
|
uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
|
|
uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
|
|
uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
|
|
if (!match_system(big_endian)) composed = vqtbl1q_u8(composed, swap);
|
|
vst1q_u8(reinterpret_cast<uint8_t*>(utf16_output), composed);
|
|
utf16_output += 8; // We wrote 16 bytes, 8 code points.
|
|
return 16;
|
|
}
|
|
if(input_utf8_end_of_code_point_mask == 0x924) {
|
|
// We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
|
|
// There is probably a more efficient sequence, but the following might do.
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint8x16_t sh = make_uint8x16_t(2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255);
|
|
#else
|
|
const uint8x16_t sh = {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255};
|
|
#endif
|
|
uint8x16_t perm = vqtbl1q_u8(in, sh);
|
|
uint8x16_t ascii =
|
|
vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
|
|
uint8x16_t middlebyte =
|
|
vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
|
|
uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
|
|
uint32x4_t highbyte =
|
|
vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
|
|
uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
|
|
uint32x4_t composed =
|
|
vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
|
|
uint16x8_t composed_repacked = vmovn_high_u32(vmovn_u32(composed), composed);
|
|
if (!match_system(big_endian)) composed_repacked = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(composed_repacked), swap));
|
|
vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output), composed_repacked);
|
|
utf16_output += 4;
|
|
return 12;
|
|
}
|
|
/// We do not have a fast path available, so we fallback.
|
|
|
|
const uint8_t idx =
|
|
simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
|
|
const uint8_t consumed =
|
|
simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
|
|
|
|
|
|
if (idx < 64) {
|
|
// SIX (6) input code-words
|
|
// this is a relatively easy scenario
|
|
// we process SIX (6) input code-words. The max length in bytes of six code
|
|
// words spanning between 1 and 2 bytes each is 12 bytes.
|
|
uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
|
|
uint8x16_t perm = vqtbl1q_u8(in, sh);
|
|
uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
|
|
uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
|
|
uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
|
|
if (!match_system(big_endian)) composed = vqtbl1q_u8(composed, swap);
|
|
vst1q_u8(reinterpret_cast<uint8_t*>(utf16_output), composed);
|
|
utf16_output += 6; // We wrote 12 bytes, 6 code points.
|
|
} else if (idx < 145) {
|
|
// FOUR (4) input code-words
|
|
uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
|
|
uint8x16_t perm = vqtbl1q_u8(in, sh);
|
|
uint8x16_t ascii =
|
|
vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
|
|
uint8x16_t middlebyte =
|
|
vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
|
|
uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
|
|
uint32x4_t highbyte =
|
|
vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
|
|
uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
|
|
uint32x4_t composed =
|
|
vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
|
|
uint16x8_t composed_repacked = vmovn_high_u32(vmovn_u32(composed), composed);
|
|
if (!match_system(big_endian)) composed_repacked = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(composed_repacked), swap));
|
|
vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output), composed_repacked);
|
|
utf16_output += 4;
|
|
} else if (idx < 209) {
|
|
// TWO (2) input code-words
|
|
//////////////
|
|
// There might be garbage inputs where a leading byte mascarades as a four-byte
|
|
// leading byte (by being followed by 3 continuation byte), but is not greater than
|
|
// 0xf0. This could trigger a buffer overflow if we only counted leading
|
|
// bytes of the form 0xf0 as generating surrogate pairs, without further UTF-8 validation.
|
|
// Thus we must be careful to ensure that only leading bytes at least as large as 0xf0 generate surrogate pairs.
|
|
// We do as at the cost of an extra mask.
|
|
/////////////
|
|
uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
|
|
uint8x16_t perm = vqtbl1q_u8(in, sh);
|
|
uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f)));
|
|
uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00)));
|
|
uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
|
|
uint8x16_t middlehighbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f0000)));
|
|
// correct for spurious high bit
|
|
uint8x16_t correct =
|
|
vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x400000)))), 1));
|
|
middlehighbyte = veorq_u8(correct, middlehighbyte);
|
|
uint8x16_t middlehighbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlehighbyte), 4));
|
|
// We deliberately carry the leading four bits if they are present, we remove
|
|
// them later when computing hightenbits.
|
|
uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0xff000000)));
|
|
uint8x16_t highbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(highbyte), 6));
|
|
// When we need to generate a surrogate pair (leading byte > 0xF0), then
|
|
// the corresponding 32-bit value in 'composed' will be greater than
|
|
// > (0xff00000>>6) or > 0x3c00000. This can be used later to identify the
|
|
// location of the surrogate pairs.
|
|
uint8x16_t composed =
|
|
vorrq_u8(vorrq_u8(ascii, middlebyte_shifted),
|
|
vorrq_u8(highbyte_shifted, middlehighbyte_shifted));
|
|
uint32x4_t composedminus =
|
|
vsubq_u32(vreinterpretq_u32_u8(composed), vmovq_n_u32(0x10000));
|
|
uint32x4_t lowtenbits =
|
|
vandq_u32(composedminus, vmovq_n_u32(0x3ff));
|
|
// Notice the 0x3ff mask:
|
|
uint32x4_t hightenbits = vandq_u32(vshrq_n_u32(composedminus, 10), vmovq_n_u32(0x3ff));
|
|
uint32x4_t lowtenbitsadd =
|
|
vaddq_u32(lowtenbits, vmovq_n_u32(0xDC00));
|
|
uint32x4_t hightenbitsadd =
|
|
vaddq_u32(hightenbits, vmovq_n_u32(0xD800));
|
|
uint32x4_t lowtenbitsaddshifted = vshlq_n_u32(lowtenbitsadd, 16);
|
|
uint32x4_t surrogates =
|
|
vorrq_u32(hightenbitsadd, lowtenbitsaddshifted);
|
|
uint32_t basic_buffer[4];
|
|
uint32_t basic_buffer_swap[4];
|
|
if (!match_system(big_endian)) {
|
|
vst1q_u32(basic_buffer_swap, vreinterpretq_u32_u8(vqtbl1q_u8(composed, swap)));
|
|
surrogates = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(surrogates), swap));
|
|
}
|
|
vst1q_u32(basic_buffer, vreinterpretq_u32_u8(composed));
|
|
uint32_t surrogate_buffer[4];
|
|
vst1q_u32(surrogate_buffer, surrogates);
|
|
for (size_t i = 0; i < 3; i++) {
|
|
if(basic_buffer[i] > 0x3c00000) {
|
|
utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
|
|
utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
|
|
utf16_output += 2;
|
|
} else {
|
|
utf16_output[0] = !match_system(big_endian) ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
|
|
utf16_output++;
|
|
}
|
|
}
|
|
} else {
|
|
// here we know that there is an error but we do not handle errors
|
|
}
|
|
return consumed;
|
|
}
|
|
/* end file src/arm64/arm_convert_utf8_to_utf16.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf8_to_utf32.cpp
|
|
/* begin file src/arm64/arm_convert_utf8_to_utf32.cpp */
|
|
// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
|
|
// end of the code points. Only the least significant 12 bits of the mask
|
|
// are accessed.
|
|
// It returns how many bytes were consumed (up to 12).
|
|
size_t convert_masked_utf8_to_utf32(const char *input,
|
|
uint64_t utf8_end_of_code_point_mask,
|
|
char32_t *&utf32_out) {
|
|
// we use an approach where we try to process up to 12 input bytes.
|
|
// Why 12 input bytes and not 16? Because we are concerned with the size of
|
|
// the lookup tables. Also 12 is nicely divisible by two and three.
|
|
//
|
|
uint32_t*& utf32_output = reinterpret_cast<uint32_t*&>(utf32_out);
|
|
uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t*>(input));
|
|
const uint16_t input_utf8_end_of_code_point_mask =
|
|
utf8_end_of_code_point_mask & 0xFFF;
|
|
//
|
|
// Optimization note: our main path below is load-latency dependent. Thus it is maybe
|
|
// beneficial to have fast paths that depend on branch prediction but have less latency.
|
|
// This results in more instructions but, potentially, also higher speeds.
|
|
//
|
|
// We first try a few fast paths.
|
|
if((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
|
|
// We process in chunks of 16 bytes
|
|
vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8 (in)))));
|
|
vst1q_u32(utf32_output + 4, vmovl_high_u16(vmovl_u8(vget_low_u8 (in))));
|
|
vst1q_u32(utf32_output + 8, vmovl_u16(vget_low_u16(vmovl_high_u8(in))));
|
|
vst1q_u32(utf32_output + 12, vmovl_high_u16(vmovl_high_u8(in)));
|
|
utf32_output += 16; // We wrote 16 16-bit characters.
|
|
return 16; // We consumed 16 bytes.
|
|
}
|
|
if((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa) {
|
|
// We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words.
|
|
// There is probably a more efficient sequence, but the following might do.
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint8x16_t sh = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
#else
|
|
//const uint8x16_t sh = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
|
|
const uint8x16_t sh = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
|
|
#endif
|
|
uint8x16_t perm = vqtbl1q_u8(in, sh);
|
|
uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
|
|
uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
|
|
uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
|
|
vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(vreinterpretq_u16_u8(composed))));
|
|
vst1q_u32(utf32_output+4, vmovl_high_u16(vreinterpretq_u16_u8(composed)));
|
|
utf32_output += 8; // We wrote 32 bytes, 8 code points.
|
|
return 16;
|
|
}
|
|
if(input_utf8_end_of_code_point_mask == 0x924) {
|
|
// We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words.
|
|
// There is probably a more efficient sequence, but the following might do.
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint8x16_t sh = make_uint8x16_t(2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255);
|
|
#else
|
|
const uint8x16_t sh = {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255};
|
|
#endif
|
|
uint8x16_t perm = vqtbl1q_u8(in, sh);
|
|
uint8x16_t ascii =
|
|
vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
|
|
uint8x16_t middlebyte =
|
|
vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
|
|
uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
|
|
uint32x4_t highbyte =
|
|
vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
|
|
uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
|
|
uint32x4_t composed =
|
|
vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
|
|
vst1q_u32(utf32_output, composed);
|
|
utf32_output += 4;
|
|
return 12;
|
|
}
|
|
/// We do not have a fast path available, so we fallback.
|
|
|
|
const uint8_t idx =
|
|
simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
|
|
const uint8_t consumed =
|
|
simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
|
|
|
|
|
|
if (idx < 64) {
|
|
// SIX (6) input code-words
|
|
// this is a relatively easy scenario
|
|
// we process SIX (6) input code-words. The max length in bytes of six code
|
|
// words spanning between 1 and 2 bytes each is 12 bytes.
|
|
uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
|
|
uint8x16_t perm = vqtbl1q_u8(in, sh);
|
|
uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
|
|
uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
|
|
uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
|
|
vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(vreinterpretq_u16_u8(composed))));
|
|
vst1q_u32(utf32_output+4, vmovl_high_u16(vreinterpretq_u16_u8(composed)));
|
|
utf32_output += 6; // We wrote 12 bytes, 6 code points.
|
|
} else if (idx < 145) {
|
|
// FOUR (4) input code-words
|
|
uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
|
|
uint8x16_t perm = vqtbl1q_u8(in, sh);
|
|
uint8x16_t ascii =
|
|
vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
|
|
uint8x16_t middlebyte =
|
|
vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
|
|
uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
|
|
uint32x4_t highbyte =
|
|
vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
|
|
uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
|
|
uint32x4_t composed =
|
|
vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
|
|
vst1q_u32(utf32_output, composed);
|
|
utf32_output += 4;
|
|
} else if (idx < 209) {
|
|
// TWO (2) input code-words
|
|
uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
|
|
uint8x16_t perm = vqtbl1q_u8(in, sh);
|
|
uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f)));
|
|
uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00)));
|
|
uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
|
|
uint8x16_t middlehighbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f0000)));
|
|
// correct for spurious high bit
|
|
uint8x16_t correct =
|
|
vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x400000)))), 1));
|
|
middlehighbyte = veorq_u8(correct, middlehighbyte);
|
|
uint8x16_t middlehighbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlehighbyte), 4));
|
|
uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x07000000)));
|
|
uint8x16_t highbyte_shifted =vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(highbyte), 6));
|
|
uint8x16_t composed =
|
|
vorrq_u8(vorrq_u8(ascii, middlebyte_shifted),
|
|
vorrq_u8(highbyte_shifted, middlehighbyte_shifted));
|
|
vst1q_u32(utf32_output, vreinterpretq_u32_u8(composed));
|
|
utf32_output += 3;
|
|
} else {
|
|
// here we know that there is an error but we do not handle errors
|
|
}
|
|
return consumed;
|
|
}
|
|
/* end file src/arm64/arm_convert_utf8_to_utf32.cpp */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf16_to_utf8.cpp
|
|
/* begin file src/arm64/arm_convert_utf16_to_utf8.cpp */
|
|
/*
|
|
The vectorized algorithm works on single SSE register i.e., it
|
|
loads eight 16-bit words.
|
|
|
|
We consider three cases:
|
|
1. an input register contains no surrogates and each value
|
|
is in range 0x0000 .. 0x07ff.
|
|
2. an input register contains no surrogates and values are
|
|
in range 0x0000 .. 0xffff.
|
|
3. an input register contains surrogates --- i.e. codepoints
|
|
can have 16 or 32 bits.
|
|
|
|
Ad 1.
|
|
|
|
When values are less than 0x0800, it means that a 16-bit word
|
|
can be converted into: 1) single UTF8 byte (when it's an ASCII
|
|
char) or 2) two UTF8 bytes.
|
|
|
|
For this case we do only some shuffle to obtain these 2-byte
|
|
codes and finally compress the whole SSE register with a single
|
|
shuffle.
|
|
|
|
We need 256-entry lookup table to get a compression pattern
|
|
and the number of output bytes in the compressed vector register.
|
|
Each entry occupies 17 bytes.
|
|
|
|
Ad 2.
|
|
|
|
When values fit in 16-bit words, but are above 0x07ff, then
|
|
a single word may produce one, two or three UTF8 bytes.
|
|
|
|
We prepare data for all these three cases in two registers.
|
|
The first register contains lower two UTF8 bytes (used in all
|
|
cases), while the second one contains just the third byte for
|
|
the three-UTF8-bytes case.
|
|
|
|
Finally these two registers are interleaved forming eight-element
|
|
array of 32-bit values. The array spans two SSE registers.
|
|
The bytes from the registers are compressed using two shuffles.
|
|
|
|
We need 256-entry lookup table to get a compression pattern
|
|
and the number of output bytes in the compressed vector register.
|
|
Each entry occupies 17 bytes.
|
|
|
|
|
|
To summarize:
|
|
- We need two 256-entry tables that have 8704 bytes in total.
|
|
*/
|
|
/*
|
|
Returns a pair: the first unprocessed byte from buf and utf8_output
|
|
A scalar routing should carry on the conversion of the tail.
|
|
*/
|
|
template <endianness big_endian>
|
|
std::pair<const char16_t*, char*> arm_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_out) {
|
|
uint8_t * utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
|
|
const char16_t* end = buf + len;
|
|
|
|
const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
|
|
const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
|
|
const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
|
|
|
|
while (buf + 16 <= end) {
|
|
uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
|
|
if (!match_system(big_endian)) {
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
#else
|
|
const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
|
|
#endif
|
|
in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
|
|
}
|
|
if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
|
|
// It is common enough that we have sequences of 16 consecutive ASCII characters.
|
|
uint16x8_t nextin = vld1q_u16(reinterpret_cast<const uint16_t *>(buf) + 8);
|
|
if (!match_system(big_endian)) {
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
#else
|
|
const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
|
|
#endif
|
|
nextin = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(nextin), swap));
|
|
}
|
|
if(vmaxvq_u16(nextin) > 0x7F) {
|
|
// 1. pack the bytes
|
|
// obviously suboptimal.
|
|
uint8x8_t utf8_packed = vmovn_u16(in);
|
|
// 2. store (8 bytes)
|
|
vst1_u8(utf8_output, utf8_packed);
|
|
// 3. adjust pointers
|
|
buf += 8;
|
|
utf8_output += 8;
|
|
in = nextin;
|
|
} else {
|
|
// 1. pack the bytes
|
|
// obviously suboptimal.
|
|
uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
|
|
// 2. store (16 bytes)
|
|
vst1q_u8(utf8_output, utf8_packed);
|
|
// 3. adjust pointers
|
|
buf += 16;
|
|
utf8_output += 16;
|
|
continue; // we are done for this round!
|
|
}
|
|
}
|
|
|
|
if (vmaxvq_u16(in) <= 0x7FF) {
|
|
// 1. prepare 2-byte values
|
|
// input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
|
|
// expected output : [110a|aaaa|10bb|bbbb] x 8
|
|
const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
|
|
const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
|
|
|
|
// t0 = [000a|aaaa|bbbb|bb00]
|
|
const uint16x8_t t0 = vshlq_n_u16(in, 2);
|
|
// t1 = [000a|aaaa|0000|0000]
|
|
const uint16x8_t t1 = vandq_u16(t0, v_1f00);
|
|
// t2 = [0000|0000|00bb|bbbb]
|
|
const uint16x8_t t2 = vandq_u16(in, v_003f);
|
|
// t3 = [000a|aaaa|00bb|bbbb]
|
|
const uint16x8_t t3 = vorrq_u16(t1, t2);
|
|
// t4 = [110a|aaaa|10bb|bbbb]
|
|
const uint16x8_t t4 = vorrq_u16(t3, v_c080);
|
|
// 2. merge ASCII and 2-byte codewords
|
|
const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
|
|
const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
|
|
const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
|
|
// 3. prepare bitmask for 8-bit lookup
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
|
|
0x0010, 0x0040,
|
|
0x0002, 0x0008,
|
|
0x0020, 0x0080);
|
|
#else
|
|
const uint16x8_t mask = { 0x0001, 0x0004,
|
|
0x0010, 0x0040,
|
|
0x0002, 0x0008,
|
|
0x0020, 0x0080 };
|
|
#endif
|
|
uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
|
|
// 4. pack the bytes
|
|
const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
|
|
const uint8x16_t shuffle = vld1q_u8(row + 1);
|
|
const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
|
|
|
|
// 5. store bytes
|
|
vst1q_u8(utf8_output, utf8_packed);
|
|
|
|
// 6. adjust pointers
|
|
buf += 8;
|
|
utf8_output += row[0];
|
|
continue;
|
|
|
|
}
|
|
const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
|
|
// It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
|
|
// it is likely an uncommon occurrence.
|
|
if (vmaxvq_u16(surrogates_bytemask) == 0) {
|
|
// case: words from register produce either 1, 2 or 3 UTF-8 bytes
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
|
|
#else
|
|
const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
|
|
#endif
|
|
/* In this branch we handle three cases:
|
|
1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte
|
|
2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes
|
|
3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
|
|
|
|
We expand the input word (16-bit) into two words (32-bit), thus
|
|
we have room for four bytes. However, we need five distinct bit
|
|
layouts. Note that the last byte in cases #2 and #3 is the same.
|
|
|
|
We precompute byte 1 for case #1 and the common byte for cases #2 & #3
|
|
in register t2.
|
|
|
|
We precompute byte 1 for case #3 and -- **conditionally** -- precompute
|
|
either byte 1 for case #2 or byte 2 for case #3. Note that they
|
|
differ by exactly one bit.
|
|
|
|
Finally from these two words we build proper UTF-8 sequence, taking
|
|
into account the case (i.e, the number of bytes to write).
|
|
*/
|
|
/**
|
|
* Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
|
|
* t2 => [0ccc|cccc] [10cc|cccc]
|
|
* s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
|
|
*/
|
|
#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
|
|
// [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
|
|
const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
|
|
// [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
|
|
const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
|
|
// [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
|
|
const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000));
|
|
|
|
// s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
|
|
const uint16x8_t s0 = vshrq_n_u16(in, 12);
|
|
// s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
|
|
const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000));
|
|
// [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
|
|
const uint16x8_t s1s = vshlq_n_u16(s1, 2);
|
|
// [00bb|bbbb|0000|aaaa]
|
|
const uint16x8_t s2 = vorrq_u16(s0, s1s);
|
|
// s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
|
|
const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
|
|
const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
|
|
const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
|
|
const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
|
|
const uint16x8_t s4 = veorq_u16(s3, m0);
|
|
#undef simdutf_vec
|
|
|
|
// 4. expand words 16-bit => 32-bit
|
|
const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
|
|
const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
|
|
|
|
// 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
|
|
const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
|
|
const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
|
|
0x0010, 0x0040,
|
|
0x0100, 0x0400,
|
|
0x1000, 0x4000 );
|
|
const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
|
|
0x0020, 0x0080,
|
|
0x0200, 0x0800,
|
|
0x2000, 0x8000 );
|
|
#else
|
|
const uint16x8_t onemask = { 0x0001, 0x0004,
|
|
0x0010, 0x0040,
|
|
0x0100, 0x0400,
|
|
0x1000, 0x4000 };
|
|
const uint16x8_t twomask = { 0x0002, 0x0008,
|
|
0x0020, 0x0080,
|
|
0x0200, 0x0800,
|
|
0x2000, 0x8000 };
|
|
#endif
|
|
const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
|
|
const uint16_t mask = vaddvq_u16(combined);
|
|
// The following fast path may or may not be beneficial.
|
|
/*if(mask == 0) {
|
|
// We only have three-byte words. Use fast path.
|
|
const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
|
|
const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
|
|
const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
|
|
vst1q_u8(utf8_output, utf8_0);
|
|
utf8_output += 12;
|
|
vst1q_u8(utf8_output, utf8_1);
|
|
utf8_output += 12;
|
|
buf += 8;
|
|
continue;
|
|
}*/
|
|
const uint8_t mask0 = uint8_t(mask);
|
|
|
|
const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
|
|
const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
|
|
const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
|
|
|
|
const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
|
|
const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
|
|
const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
|
|
const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
|
|
|
|
vst1q_u8(utf8_output, utf8_0);
|
|
utf8_output += row0[0];
|
|
vst1q_u8(utf8_output, utf8_1);
|
|
utf8_output += row1[0];
|
|
|
|
buf += 8;
|
|
// surrogate pair(s) in a register
|
|
} else {
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// in the presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
|
|
if((word & 0xFF80)==0) {
|
|
*utf8_output++ = char(word);
|
|
} else if((word & 0xF800)==0) {
|
|
*utf8_output++ = char((word>>6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else if((word &0xF800 ) != 0xD800) {
|
|
*utf8_output++ = char((word>>12) | 0b11100000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else {
|
|
// must be a surrogate pair
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
|
|
k++;
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output)); }
|
|
uint32_t value = (diff << 10) + diff2 + 0x10000;
|
|
*utf8_output++ = char((value>>18) | 0b11110000);
|
|
*utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((value & 0b111111) | 0b10000000);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
|
|
return std::make_pair(buf, reinterpret_cast<char*>(utf8_output));
|
|
}
|
|
|
|
|
|
/*
|
|
Returns a pair: a result struct and utf8_output.
|
|
If there is an error, the count field of the result is the position of the error.
|
|
Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
|
|
A scalar routine should carry on the conversion of the tail if needed.
|
|
*/
|
|
template <endianness big_endian>
|
|
std::pair<result, char*> arm_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_out) {
|
|
uint8_t * utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
|
|
const char16_t* start = buf;
|
|
const char16_t* end = buf + len;
|
|
|
|
const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
|
|
const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
|
|
const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
|
|
|
|
while (buf + 16 <= end) {
|
|
uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
|
|
if (!match_system(big_endian)) {
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
#else
|
|
const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
|
|
#endif
|
|
in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
|
|
}
|
|
if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
|
|
// It is common enough that we have sequences of 16 consecutive ASCII characters.
|
|
uint16x8_t nextin = vld1q_u16(reinterpret_cast<const uint16_t *>(buf) + 8);
|
|
if (!match_system(big_endian)) {
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
#else
|
|
const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
|
|
#endif
|
|
nextin = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(nextin), swap));
|
|
}
|
|
if(vmaxvq_u16(nextin) > 0x7F) {
|
|
// 1. pack the bytes
|
|
// obviously suboptimal.
|
|
uint8x8_t utf8_packed = vmovn_u16(in);
|
|
// 2. store (8 bytes)
|
|
vst1_u8(utf8_output, utf8_packed);
|
|
// 3. adjust pointers
|
|
buf += 8;
|
|
utf8_output += 8;
|
|
in = nextin;
|
|
} else {
|
|
// 1. pack the bytes
|
|
// obviously suboptimal.
|
|
uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
|
|
// 2. store (16 bytes)
|
|
vst1q_u8(utf8_output, utf8_packed);
|
|
// 3. adjust pointers
|
|
buf += 16;
|
|
utf8_output += 16;
|
|
continue; // we are done for this round!
|
|
}
|
|
}
|
|
|
|
if (vmaxvq_u16(in) <= 0x7FF) {
|
|
// 1. prepare 2-byte values
|
|
// input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
|
|
// expected output : [110a|aaaa|10bb|bbbb] x 8
|
|
const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
|
|
const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
|
|
|
|
// t0 = [000a|aaaa|bbbb|bb00]
|
|
const uint16x8_t t0 = vshlq_n_u16(in, 2);
|
|
// t1 = [000a|aaaa|0000|0000]
|
|
const uint16x8_t t1 = vandq_u16(t0, v_1f00);
|
|
// t2 = [0000|0000|00bb|bbbb]
|
|
const uint16x8_t t2 = vandq_u16(in, v_003f);
|
|
// t3 = [000a|aaaa|00bb|bbbb]
|
|
const uint16x8_t t3 = vorrq_u16(t1, t2);
|
|
// t4 = [110a|aaaa|10bb|bbbb]
|
|
const uint16x8_t t4 = vorrq_u16(t3, v_c080);
|
|
// 2. merge ASCII and 2-byte codewords
|
|
const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
|
|
const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
|
|
const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
|
|
// 3. prepare bitmask for 8-bit lookup
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
|
|
0x0010, 0x0040,
|
|
0x0002, 0x0008,
|
|
0x0020, 0x0080);
|
|
#else
|
|
const uint16x8_t mask = { 0x0001, 0x0004,
|
|
0x0010, 0x0040,
|
|
0x0002, 0x0008,
|
|
0x0020, 0x0080 };
|
|
#endif
|
|
uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
|
|
// 4. pack the bytes
|
|
const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
|
|
const uint8x16_t shuffle = vld1q_u8(row + 1);
|
|
const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
|
|
|
|
// 5. store bytes
|
|
vst1q_u8(utf8_output, utf8_packed);
|
|
|
|
// 6. adjust pointers
|
|
buf += 8;
|
|
utf8_output += row[0];
|
|
continue;
|
|
|
|
}
|
|
const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
|
|
// It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
|
|
// it is likely an uncommon occurrence.
|
|
if (vmaxvq_u16(surrogates_bytemask) == 0) {
|
|
// case: words from register produce either 1, 2 or 3 UTF-8 bytes
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
|
|
#else
|
|
const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
|
|
#endif
|
|
/* In this branch we handle three cases:
|
|
1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UTF-8 byte
|
|
2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes
|
|
3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
|
|
|
|
We expand the input word (16-bit) into two words (32-bit), thus
|
|
we have room for four bytes. However, we need five distinct bit
|
|
layouts. Note that the last byte in cases #2 and #3 is the same.
|
|
|
|
We precompute byte 1 for case #1 and the common byte for cases #2 & #3
|
|
in register t2.
|
|
|
|
We precompute byte 1 for case #3 and -- **conditionally** -- precompute
|
|
either byte 1 for case #2 or byte 2 for case #3. Note that they
|
|
differ by exactly one bit.
|
|
|
|
Finally from these two words we build proper UTF-8 sequence, taking
|
|
into account the case (i.e, the number of bytes to write).
|
|
*/
|
|
/**
|
|
* Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
|
|
* t2 => [0ccc|cccc] [10cc|cccc]
|
|
* s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
|
|
*/
|
|
#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
|
|
// [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
|
|
const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
|
|
// [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
|
|
const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
|
|
// [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
|
|
const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000));
|
|
|
|
// s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
|
|
const uint16x8_t s0 = vshrq_n_u16(in, 12);
|
|
// s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
|
|
const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000));
|
|
// [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
|
|
const uint16x8_t s1s = vshlq_n_u16(s1, 2);
|
|
// [00bb|bbbb|0000|aaaa]
|
|
const uint16x8_t s2 = vorrq_u16(s0, s1s);
|
|
// s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
|
|
const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
|
|
const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
|
|
const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
|
|
const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
|
|
const uint16x8_t s4 = veorq_u16(s3, m0);
|
|
#undef simdutf_vec
|
|
|
|
// 4. expand words 16-bit => 32-bit
|
|
const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
|
|
const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
|
|
|
|
// 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
|
|
const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
|
|
const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
|
|
0x0010, 0x0040,
|
|
0x0100, 0x0400,
|
|
0x1000, 0x4000 );
|
|
const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
|
|
0x0020, 0x0080,
|
|
0x0200, 0x0800,
|
|
0x2000, 0x8000 );
|
|
#else
|
|
const uint16x8_t onemask = { 0x0001, 0x0004,
|
|
0x0010, 0x0040,
|
|
0x0100, 0x0400,
|
|
0x1000, 0x4000 };
|
|
const uint16x8_t twomask = { 0x0002, 0x0008,
|
|
0x0020, 0x0080,
|
|
0x0200, 0x0800,
|
|
0x2000, 0x8000 };
|
|
#endif
|
|
const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
|
|
const uint16_t mask = vaddvq_u16(combined);
|
|
// The following fast path may or may not be beneficial.
|
|
/*if(mask == 0) {
|
|
// We only have three-byte words. Use fast path.
|
|
const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
|
|
const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
|
|
const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
|
|
vst1q_u8(utf8_output, utf8_0);
|
|
utf8_output += 12;
|
|
vst1q_u8(utf8_output, utf8_1);
|
|
utf8_output += 12;
|
|
buf += 8;
|
|
continue;
|
|
}*/
|
|
const uint8_t mask0 = uint8_t(mask);
|
|
|
|
const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
|
|
const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
|
|
const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
|
|
|
|
const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
|
|
const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
|
|
const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
|
|
const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
|
|
|
|
vst1q_u8(utf8_output, utf8_0);
|
|
utf8_output += row0[0];
|
|
vst1q_u8(utf8_output, utf8_1);
|
|
utf8_output += row1[0];
|
|
|
|
buf += 8;
|
|
// surrogate pair(s) in a register
|
|
} else {
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// in the presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
|
|
if((word & 0xFF80)==0) {
|
|
*utf8_output++ = char(word);
|
|
} else if((word & 0xF800)==0) {
|
|
*utf8_output++ = char((word>>6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else if((word &0xF800 ) != 0xD800) {
|
|
*utf8_output++ = char((word>>12) | 0b11100000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else {
|
|
// must be a surrogate pair
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
|
|
k++;
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast<char*>(utf8_output)); }
|
|
uint32_t value = (diff << 10) + diff2 + 0x10000;
|
|
*utf8_output++ = char((value>>18) | 0b11110000);
|
|
*utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((value & 0b111111) | 0b10000000);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
|
|
return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char*>(utf8_output));
|
|
}
|
|
/* end file src/arm64/arm_convert_utf16_to_utf8.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf16_to_utf32.cpp
|
|
/* begin file src/arm64/arm_convert_utf16_to_utf32.cpp */
|
|
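/*
NOTE(review): the description below talks about producing UTF-8 bytes; it appears to have been inherited from the UTF-16 => UTF-8 converter. The UTF-16 => UTF-32 routines in this file only widen surrogate-free registers to 32 bits and otherwise fall back to scalar code.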
|
|
The vectorized algorithm works on a single 128-bit NEON register, i.e., it
|
|
loads eight 16-bit words.
|
|
|
|
We consider three cases:
|
|
1. an input register contains no surrogates and each value
|
|
is in range 0x0000 .. 0x07ff.
|
|
2. an input register contains no surrogates and values are
|
|
in range 0x0000 .. 0xffff.
|
|
3. an input register contains surrogates --- i.e. codepoints
|
|
can have 16 or 32 bits.
|
|
|
|
Ad 1.
|
|
|
|
When values are less than 0x0800, it means that a 16-bit word
|
|
can be converted into: 1) single UTF8 byte (when it's an ASCII
|
|
char) or 2) two UTF8 bytes.
|
|
|
|
For this case we do only some shuffle to obtain these 2-byte
|
|
codes and finally compress the whole NEON register with a single
|
|
shuffle.
|
|
|
|
We need 256-entry lookup table to get a compression pattern
|
|
and the number of output bytes in the compressed vector register.
|
|
Each entry occupies 17 bytes.
|
|
|
|
Ad 2.
|
|
|
|
When values fit in 16-bit words, but are above 0x07ff, then
|
|
a single word may produce one, two or three UTF8 bytes.
|
|
|
|
We prepare data for all these three cases in two registers.
|
|
The first register contains lower two UTF8 bytes (used in all
|
|
cases), while the second one contains just the third byte for
|
|
the three-UTF8-bytes case.
|
|
|
|
Finally these two registers are interleaved forming eight-element
|
|
array of 32-bit values. The array spans two NEON registers.
|
|
The bytes from the registers are compressed using two shuffles.
|
|
|
|
We need 256-entry lookup table to get a compression pattern
|
|
and the number of output bytes in the compressed vector register.
|
|
Each entry occupies 17 bytes.
|
|
|
|
|
|
To summarize:
|
|
- We need two 256-entry tables that have 8704 bytes in total.
|
|
*/
|
|
/*
|
|
Returns a pair: the first unprocessed byte from buf and utf8_output
|
|
A scalar routing should carry on the conversion of the tail.
|
|
*/
|
|
template <endianness big_endian>
|
|
std::pair<const char16_t*, char32_t*> arm_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_out) {
|
|
uint32_t * utf32_output = reinterpret_cast<uint32_t*>(utf32_out);
|
|
const char16_t* end = buf + len;
|
|
|
|
const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
|
|
const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
|
|
|
|
while (buf + 16 <= end) {
|
|
uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
|
|
if (!match_system(big_endian)) {
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
#else
|
|
const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
|
|
#endif
|
|
in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
|
|
}
|
|
|
|
const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
|
|
// It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
|
|
// it is likely an uncommon occurrence.
|
|
if (vmaxvq_u16(surrogates_bytemask) == 0) {
|
|
// case: no surrogate pairs, extend all 16-bit words to 32-bit words
|
|
vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in)));
|
|
vst1q_u32(utf32_output+4, vmovl_high_u16(in));
|
|
utf32_output += 8;
|
|
buf += 8;
|
|
// surrogate pair(s) in a register
|
|
} else {
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// in the presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
|
|
if((word &0xF800 ) != 0xD800) {
|
|
*utf32_output++ = char32_t(word);
|
|
} else {
|
|
// must be a surrogate pair
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
|
|
k++;
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, reinterpret_cast<char32_t*>(utf32_output)); }
|
|
uint32_t value = (diff << 10) + diff2 + 0x10000;
|
|
*utf32_output++ = char32_t(value);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
return std::make_pair(buf, reinterpret_cast<char32_t*>(utf32_output));
|
|
}
|
|
|
|
|
|
/*
|
|
Returns a pair: a result struct and utf8_output.
|
|
If there is an error, the count field of the result is the position of the error.
|
|
Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
|
|
A scalar routing should carry on the conversion of the tail if needed.
|
|
*/
|
|
template <endianness big_endian>
|
|
std::pair<result, char32_t*> arm_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_out) {
|
|
uint32_t * utf32_output = reinterpret_cast<uint32_t*>(utf32_out);
|
|
const char16_t* start = buf;
|
|
const char16_t* end = buf + len;
|
|
|
|
const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
|
|
const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
|
|
|
|
while (buf + 16 <= end) {
|
|
uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
|
|
if (!match_system(big_endian)) {
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
#else
|
|
const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
|
|
#endif
|
|
in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
|
|
}
|
|
|
|
const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
|
|
// It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
|
|
// it is likely an uncommon occurrence.
|
|
if (vmaxvq_u16(surrogates_bytemask) == 0) {
|
|
// case: no surrogate pairs, extend all 16-bit words to 32-bit words
|
|
vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in)));
|
|
vst1q_u32(utf32_output+4, vmovl_high_u16(in));
|
|
utf32_output += 8;
|
|
buf += 8;
|
|
// surrogate pair(s) in a register
|
|
} else {
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// in the presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
|
|
if((word &0xF800 ) != 0xD800) {
|
|
*utf32_output++ = char32_t(word);
|
|
} else {
|
|
// must be a surrogate pair
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
|
|
k++;
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast<char32_t*>(utf32_output)); }
|
|
uint32_t value = (diff << 10) + diff2 + 0x10000;
|
|
*utf32_output++ = char32_t(value);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char32_t*>(utf32_output));
|
|
}
|
|
/* end file src/arm64/arm_convert_utf16_to_utf32.cpp */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf32_to_utf8.cpp
|
|
/* begin file src/arm64/arm_convert_utf32_to_utf8.cpp */
|
|
std::pair<const char32_t*, char*> arm_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_out) {
|
|
uint8_t * utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
|
|
const char32_t* end = buf + len;
|
|
|
|
const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
|
|
|
|
uint16x8_t forbidden_bytemask = vmovq_n_u16(0x0);
|
|
|
|
while (buf + 16 <= end) {
|
|
uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
|
|
uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf+4));
|
|
|
|
// Check if no bits set above 16th
|
|
if(vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
|
|
// Pack UTF-32 to UTF-16 safely (without surrogate pairs)
|
|
// Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
|
|
uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
|
|
if(vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
|
|
// 1. pack the bytes
|
|
// obviously suboptimal.
|
|
uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
|
|
// 2. store (8 bytes)
|
|
vst1_u8(utf8_output, utf8_packed);
|
|
// 3. adjust pointers
|
|
buf += 8;
|
|
utf8_output += 8;
|
|
continue; // we are done for this round!
|
|
}
|
|
|
|
if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
|
|
// 1. prepare 2-byte values
|
|
// input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
|
|
// expected output : [110a|aaaa|10bb|bbbb] x 8
|
|
const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
|
|
const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
|
|
|
|
// t0 = [000a|aaaa|bbbb|bb00]
|
|
const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
|
|
// t1 = [000a|aaaa|0000|0000]
|
|
const uint16x8_t t1 = vandq_u16(t0, v_1f00);
|
|
// t2 = [0000|0000|00bb|bbbb]
|
|
const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
|
|
// t3 = [000a|aaaa|00bb|bbbb]
|
|
const uint16x8_t t3 = vorrq_u16(t1, t2);
|
|
// t4 = [110a|aaaa|10bb|bbbb]
|
|
const uint16x8_t t4 = vorrq_u16(t3, v_c080);
|
|
// 2. merge ASCII and 2-byte codewords
|
|
const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
|
|
const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
|
|
const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4));
|
|
// 3. prepare bitmask for 8-bit lookup
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
|
|
0x0010, 0x0040,
|
|
0x0002, 0x0008,
|
|
0x0020, 0x0080);
|
|
#else
|
|
const uint16x8_t mask = { 0x0001, 0x0004,
|
|
0x0010, 0x0040,
|
|
0x0002, 0x0008,
|
|
0x0020, 0x0080 };
|
|
#endif
|
|
uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
|
|
// 4. pack the bytes
|
|
const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
|
|
const uint8x16_t shuffle = vld1q_u8(row + 1);
|
|
const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
|
|
|
|
// 5. store bytes
|
|
vst1q_u8(utf8_output, utf8_packed);
|
|
|
|
// 6. adjust pointers
|
|
buf += 8;
|
|
utf8_output += row[0];
|
|
continue;
|
|
|
|
} else {
|
|
// case: words from register produce either 1, 2 or 3 UTF-8 bytes
|
|
const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
|
|
const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
|
|
forbidden_bytemask = vorrq_u16(vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800)), forbidden_bytemask);
|
|
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
|
|
#else
|
|
const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
|
|
#endif
|
|
/* In this branch we handle three cases:
|
|
1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte
|
|
2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes
|
|
3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
|
|
|
|
We expand the input word (16-bit) into two words (32-bit), thus
|
|
we have room for four bytes. However, we need five distinct bit
|
|
layouts. Note that the last byte in cases #2 and #3 is the same.
|
|
|
|
We precompute byte 1 for case #1 and the common byte for cases #2 & #3
|
|
in register t2.
|
|
|
|
We precompute byte 1 for case #3 and -- **conditionally** -- precompute
|
|
either byte 1 for case #2 or byte 2 for case #3. Note that they
|
|
differ by exactly one bit.
|
|
|
|
Finally from these two words we build proper UTF-8 sequence, taking
|
|
into account the case (i.e, the number of bytes to write).
|
|
*/
|
|
/**
|
|
* Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
|
|
* t2 => [0ccc|cccc] [10cc|cccc]
|
|
* s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
|
|
*/
|
|
#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
|
|
// [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
|
|
const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even)));
|
|
// [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
|
|
const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
|
|
// [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
|
|
const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000));
|
|
|
|
// s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
|
|
const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
|
|
// s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
|
|
const uint16x8_t s1 = vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000));
|
|
// [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
|
|
const uint16x8_t s1s = vshlq_n_u16(s1, 2);
|
|
// [00bb|bbbb|0000|aaaa]
|
|
const uint16x8_t s2 = vorrq_u16(s0, s1s);
|
|
// s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
|
|
const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
|
|
const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
|
|
const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff);
|
|
const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
|
|
const uint16x8_t s4 = veorq_u16(s3, m0);
|
|
#undef simdutf_vec
|
|
|
|
// 4. expand words 16-bit => 32-bit
|
|
const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
|
|
const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
|
|
|
|
// 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
|
|
const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
|
|
const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
|
|
0x0010, 0x0040,
|
|
0x0100, 0x0400,
|
|
0x1000, 0x4000 );
|
|
const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
|
|
0x0020, 0x0080,
|
|
0x0200, 0x0800,
|
|
0x2000, 0x8000 );
|
|
#else
|
|
const uint16x8_t onemask = { 0x0001, 0x0004,
|
|
0x0010, 0x0040,
|
|
0x0100, 0x0400,
|
|
0x1000, 0x4000 };
|
|
const uint16x8_t twomask = { 0x0002, 0x0008,
|
|
0x0020, 0x0080,
|
|
0x0200, 0x0800,
|
|
0x2000, 0x8000 };
|
|
#endif
|
|
const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
|
|
const uint16_t mask = vaddvq_u16(combined);
|
|
// The following fast path may or may not be beneficial.
|
|
/*if(mask == 0) {
|
|
// We only have three-byte words. Use fast path.
|
|
const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
|
|
const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
|
|
const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
|
|
vst1q_u8(utf8_output, utf8_0);
|
|
utf8_output += 12;
|
|
vst1q_u8(utf8_output, utf8_1);
|
|
utf8_output += 12;
|
|
buf += 8;
|
|
continue;
|
|
}*/
|
|
const uint8_t mask0 = uint8_t(mask);
|
|
const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
|
|
const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
|
|
const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
|
|
|
|
const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
|
|
const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
|
|
const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
|
|
const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
|
|
|
|
vst1q_u8(utf8_output, utf8_0);
|
|
utf8_output += row0[0];
|
|
vst1q_u8(utf8_output, utf8_1);
|
|
utf8_output += row1[0];
|
|
|
|
buf += 8;
|
|
}
|
|
// At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes.
|
|
} else {
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// in the presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint32_t word = buf[k];
|
|
if((word & 0xFFFFFF80)==0) {
|
|
*utf8_output++ = char(word);
|
|
} else if((word & 0xFFFFF800)==0) {
|
|
*utf8_output++ = char((word>>6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else if((word & 0xFFFF0000)==0) {
|
|
if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output)); }
|
|
*utf8_output++ = char((word>>12) | 0b11100000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else {
|
|
if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output)); }
|
|
*utf8_output++ = char((word>>18) | 0b11110000);
|
|
*utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
|
|
// check for invalid input
|
|
if (vmaxvq_u16(forbidden_bytemask) != 0) {
|
|
return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output));
|
|
}
|
|
return std::make_pair(buf, reinterpret_cast<char*>(utf8_output));
|
|
}
|
|
|
|
|
|
std::pair<result, char*> arm_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_out) {
|
|
uint8_t * utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
|
|
const char32_t* start = buf;
|
|
const char32_t* end = buf + len;
|
|
|
|
const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
|
|
|
|
while (buf + 16 <= end) {
|
|
uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
|
|
uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf+4));
|
|
|
|
// Check if no bits set above 16th
|
|
if(vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
|
|
// Pack UTF-32 to UTF-16 safely (without surrogate pairs)
|
|
// Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
|
|
uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
|
|
if(vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
|
|
// 1. pack the bytes
|
|
// obviously suboptimal.
|
|
uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
|
|
// 2. store (8 bytes)
|
|
vst1_u8(utf8_output, utf8_packed);
|
|
// 3. adjust pointers
|
|
buf += 8;
|
|
utf8_output += 8;
|
|
continue; // we are done for this round!
|
|
}
|
|
|
|
if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
|
|
// 1. prepare 2-byte values
|
|
// input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
|
|
// expected output : [110a|aaaa|10bb|bbbb] x 8
|
|
const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
|
|
const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
|
|
|
|
// t0 = [000a|aaaa|bbbb|bb00]
|
|
const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
|
|
// t1 = [000a|aaaa|0000|0000]
|
|
const uint16x8_t t1 = vandq_u16(t0, v_1f00);
|
|
// t2 = [0000|0000|00bb|bbbb]
|
|
const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
|
|
// t3 = [000a|aaaa|00bb|bbbb]
|
|
const uint16x8_t t3 = vorrq_u16(t1, t2);
|
|
// t4 = [110a|aaaa|10bb|bbbb]
|
|
const uint16x8_t t4 = vorrq_u16(t3, v_c080);
|
|
// 2. merge ASCII and 2-byte codewords
|
|
const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
|
|
const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
|
|
const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4));
|
|
// 3. prepare bitmask for 8-bit lookup
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
|
|
0x0010, 0x0040,
|
|
0x0002, 0x0008,
|
|
0x0020, 0x0080);
|
|
#else
|
|
const uint16x8_t mask = { 0x0001, 0x0004,
|
|
0x0010, 0x0040,
|
|
0x0002, 0x0008,
|
|
0x0020, 0x0080 };
|
|
#endif
|
|
uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
|
|
// 4. pack the bytes
|
|
const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
|
|
const uint8x16_t shuffle = vld1q_u8(row + 1);
|
|
const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
|
|
|
|
// 5. store bytes
|
|
vst1q_u8(utf8_output, utf8_packed);
|
|
|
|
// 6. adjust pointers
|
|
buf += 8;
|
|
utf8_output += row[0];
|
|
continue;
|
|
|
|
} else {
|
|
// case: words from register produce either 1, 2 or 3 UTF-8 bytes
|
|
|
|
// check for invalid input
|
|
const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
|
|
const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
|
|
const uint16x8_t forbidden_bytemask = vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800));
|
|
if (vmaxvq_u16(forbidden_bytemask) != 0) {
|
|
return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast<char*>(utf8_output));
|
|
}
|
|
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
|
|
#else
|
|
const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
|
|
#endif
|
|
/* In this branch we handle three cases:
|
|
1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UTF-8 byte
|
|
2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes
|
|
3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
|
|
|
|
We expand the input word (16-bit) into two words (32-bit), thus
|
|
we have room for four bytes. However, we need five distinct bit
|
|
layouts. Note that the last byte in cases #2 and #3 is the same.
|
|
|
|
We precompute byte 1 for case #1 and the common byte for cases #2 & #3
|
|
in register t2.
|
|
|
|
We precompute byte 1 for case #3 and -- **conditionally** -- precompute
|
|
either byte 1 for case #2 or byte 2 for case #3. Note that they
|
|
differ by exactly one bit.
|
|
|
|
Finally from these two words we build proper UTF-8 sequence, taking
|
|
into account the case (i.e, the number of bytes to write).
|
|
*/
|
|
/**
|
|
* Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
|
|
* t2 => [0ccc|cccc] [10cc|cccc]
|
|
* s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
|
|
*/
|
|
#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
|
|
// [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
|
|
const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even)));
|
|
// [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
|
|
const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
|
|
// [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
|
|
const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000));
|
|
|
|
// s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
|
|
const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
|
|
// s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
|
|
const uint16x8_t s1 = vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000));
|
|
// [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
|
|
const uint16x8_t s1s = vshlq_n_u16(s1, 2);
|
|
// [00bb|bbbb|0000|aaaa]
|
|
const uint16x8_t s2 = vorrq_u16(s0, s1s);
|
|
// s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
|
|
const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
|
|
const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
|
|
const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff);
|
|
const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
|
|
const uint16x8_t s4 = veorq_u16(s3, m0);
|
|
#undef simdutf_vec
|
|
|
|
// 4. expand words 16-bit => 32-bit
|
|
const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
|
|
const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
|
|
|
|
// 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
|
|
const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
|
|
const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
|
|
0x0010, 0x0040,
|
|
0x0100, 0x0400,
|
|
0x1000, 0x4000 );
|
|
const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
|
|
0x0020, 0x0080,
|
|
0x0200, 0x0800,
|
|
0x2000, 0x8000 );
|
|
#else
|
|
const uint16x8_t onemask = { 0x0001, 0x0004,
|
|
0x0010, 0x0040,
|
|
0x0100, 0x0400,
|
|
0x1000, 0x4000 };
|
|
const uint16x8_t twomask = { 0x0002, 0x0008,
|
|
0x0020, 0x0080,
|
|
0x0200, 0x0800,
|
|
0x2000, 0x8000 };
|
|
#endif
|
|
const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
|
|
const uint16_t mask = vaddvq_u16(combined);
|
|
// The following fast path may or may not be beneficial.
|
|
/*if(mask == 0) {
|
|
// We only have three-byte words. Use fast path.
|
|
const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
|
|
const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
|
|
const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
|
|
vst1q_u8(utf8_output, utf8_0);
|
|
utf8_output += 12;
|
|
vst1q_u8(utf8_output, utf8_1);
|
|
utf8_output += 12;
|
|
buf += 8;
|
|
continue;
|
|
}*/
|
|
const uint8_t mask0 = uint8_t(mask);
|
|
|
|
const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
|
|
const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
|
|
const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
|
|
|
|
const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
|
|
const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
|
|
const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
|
|
const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
|
|
|
|
vst1q_u8(utf8_output, utf8_0);
|
|
utf8_output += row0[0];
|
|
vst1q_u8(utf8_output, utf8_1);
|
|
utf8_output += row1[0];
|
|
|
|
buf += 8;
|
|
}
|
|
// At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes.
|
|
} else {
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// in the presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint32_t word = buf[k];
|
|
if((word & 0xFFFFFF80)==0) {
|
|
*utf8_output++ = char(word);
|
|
} else if((word & 0xFFFFF800)==0) {
|
|
*utf8_output++ = char((word>>6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else if((word & 0xFFFF0000)==0) {
|
|
if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast<char*>(utf8_output)); }
|
|
*utf8_output++ = char((word>>12) | 0b11100000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else {
|
|
if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast<char*>(utf8_output)); }
|
|
*utf8_output++ = char((word>>18) | 0b11110000);
|
|
*utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
|
|
return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char*>(utf8_output));
|
|
}
|
|
/* end file src/arm64/arm_convert_utf32_to_utf8.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf32_to_utf16.cpp
|
|
/* begin file src/arm64/arm_convert_utf32_to_utf16.cpp */
|
|
template <endianness big_endian>
|
|
std::pair<const char32_t*, char16_t*> arm_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_out) {
|
|
uint16_t * utf16_output = reinterpret_cast<uint16_t*>(utf16_out);
|
|
const char32_t* end = buf + len;
|
|
|
|
uint16x4_t forbidden_bytemask = vmov_n_u16(0x0);
|
|
|
|
while(buf + 4 <= end) {
|
|
uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
|
|
|
|
// Check if no bits set above 16th
|
|
if(vmaxvq_u32(in) <= 0xFFFF) {
|
|
uint16x4_t utf16_packed = vmovn_u32(in);
|
|
|
|
const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800);
|
|
const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff);
|
|
forbidden_bytemask = vorr_u16(vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800)), forbidden_bytemask);
|
|
|
|
if (!match_system(big_endian)) {
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint8x8_t swap = make_uint8x8_t(1, 0, 3, 2, 5, 4, 7, 6);
|
|
#else
|
|
const uint8x8_t swap = {1, 0, 3, 2, 5, 4, 7, 6};
|
|
#endif
|
|
utf16_packed = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(utf16_packed), swap));
|
|
}
|
|
vst1_u16(utf16_output, utf16_packed);
|
|
utf16_output += 4;
|
|
buf += 4;
|
|
} else {
|
|
size_t forward = 3;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint32_t word = buf[k];
|
|
if((word & 0xFFFF0000)==0) {
|
|
// will not generate a surrogate pair
|
|
if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output)); }
|
|
*utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word);
|
|
} else {
|
|
// will generate a surrogate pair
|
|
if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output)); }
|
|
word -= 0x10000;
|
|
uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
|
|
uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
|
|
if (!match_system(big_endian)) {
|
|
high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
|
|
low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
|
|
}
|
|
*utf16_output++ = char16_t(high_surrogate);
|
|
*utf16_output++ = char16_t(low_surrogate);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
}
|
|
|
|
// check for invalid input
|
|
if (vmaxv_u16(forbidden_bytemask) != 0) {
|
|
return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output));
|
|
}
|
|
|
|
return std::make_pair(buf, reinterpret_cast<char16_t*>(utf16_output));
|
|
}
|
|
|
|
|
|
template <endianness big_endian>
|
|
std::pair<result, char16_t*> arm_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_out) {
|
|
uint16_t * utf16_output = reinterpret_cast<uint16_t*>(utf16_out);
|
|
const char32_t* start = buf;
|
|
const char32_t* end = buf + len;
|
|
|
|
while(buf + 4 <= end) {
|
|
uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
|
|
|
|
// Check if no bits set above 16th
|
|
if(vmaxvq_u32(in) <= 0xFFFF) {
|
|
uint16x4_t utf16_packed = vmovn_u32(in);
|
|
|
|
const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800);
|
|
const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff);
|
|
const uint16x4_t forbidden_bytemask = vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800));
|
|
if (vmaxv_u16(forbidden_bytemask) != 0) {
|
|
return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast<char16_t*>(utf16_output));
|
|
}
|
|
|
|
if (!match_system(big_endian)) {
|
|
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
|
|
const uint8x8_t swap = make_uint8x8_t(1, 0, 3, 2, 5, 4, 7, 6);
|
|
#else
|
|
const uint8x8_t swap = {1, 0, 3, 2, 5, 4, 7, 6};
|
|
#endif
|
|
utf16_packed = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(utf16_packed), swap));
|
|
}
|
|
vst1_u16(utf16_output, utf16_packed);
|
|
utf16_output += 4;
|
|
buf += 4;
|
|
} else {
|
|
size_t forward = 3;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint32_t word = buf[k];
|
|
if((word & 0xFFFF0000)==0) {
|
|
// will not generate a surrogate pair
|
|
if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast<char16_t*>(utf16_output)); }
|
|
*utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word);
|
|
} else {
|
|
// will generate a surrogate pair
|
|
if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast<char16_t*>(utf16_output)); }
|
|
word -= 0x10000;
|
|
uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
|
|
uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
|
|
if (!match_system(big_endian)) {
|
|
high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
|
|
low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
|
|
}
|
|
*utf16_output++ = char16_t(high_surrogate);
|
|
*utf16_output++ = char16_t(low_surrogate);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
}
|
|
|
|
return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char16_t*>(utf16_output));
|
|
}
|
|
/* end file src/arm64/arm_convert_utf32_to_utf16.cpp */
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdutf
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h
|
|
/* begin file src/generic/buf_block_reader.h */
|
|
namespace simdutf {
|
|
namespace arm64 {
|
|
namespace {
|
|
|
|
// Walks through a buffer in block-sized increments, loading the last part with spaces
|
|
template<size_t STEP_SIZE>
|
|
struct buf_block_reader {
|
|
public:
|
|
simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
|
|
simdutf_really_inline size_t block_index();
|
|
simdutf_really_inline bool has_full_block() const;
|
|
simdutf_really_inline const uint8_t *full_block() const;
|
|
/**
|
|
* Get the last block, padded with spaces.
|
|
*
|
|
* There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
|
|
* function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
|
|
* will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
|
|
*
|
|
* @return the number of effective characters in the last block.
|
|
*/
|
|
simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
|
|
simdutf_really_inline void advance();
|
|
private:
|
|
const uint8_t *buf;
|
|
const size_t len;
|
|
const size_t lenminusstep;
|
|
size_t idx;
|
|
};
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
simdutf_unused static char * format_input_text_64(const uint8_t *text) {
|
|
static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
|
|
for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
|
|
buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
|
|
}
|
|
buf[sizeof(simd8x64<uint8_t>)] = '\0';
|
|
return buf;
|
|
}
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
|
|
static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
|
|
in.store(reinterpret_cast<uint8_t*>(buf));
|
|
for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
|
|
if (buf[i] < ' ') { buf[i] = '_'; }
|
|
}
|
|
buf[sizeof(simd8x64<uint8_t>)] = '\0';
|
|
return buf;
|
|
}
|
|
|
|
// Debugging aid: renders a 64-bit mask as a 64-character C string, least
// significant bit first, with 'X' for set bits and ' ' for clear bits. The
// returned pointer refers to a single function-local static buffer that is
// overwritten by the next call.
simdutf_unused static char * format_mask(uint64_t mask) {
  static char *buf = reinterpret_cast<char*>(malloc(64 + 1));
  for (size_t i=0; i<64; i++) {
    // Shift a 64-bit one: size_t may be narrower than 64 bits, in which case
    // shifting it by 32 or more would be undefined.
    buf[i] = (mask & (uint64_t(1) << i)) ? 'X' : ' ';
  }
  buf[64] = '\0';
  return buf;
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
|
|
return idx < lenminusstep;
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
|
|
return &buf[idx];
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
|
|
if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
|
|
std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
|
|
std::memcpy(dst, buf + idx, len - idx);
|
|
return len - idx;
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
|
|
idx += STEP_SIZE;
|
|
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdutf
|
|
/* end file src/generic/buf_block_reader.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
|
|
/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
|
|
namespace simdutf {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace utf8_validation {
|
|
|
|
using namespace simd;
|
|
|
|
// Classifies every adjacent byte pair (prev1 = byte 1, input = byte 2) with
// three 16-entry nibble lookups. Each lookup yields the set of error classes
// its nibble is compatible with; a bit survives the final AND only when all
// three nibbles agree, so a nonzero result byte marks an invalid pair.
simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Error classes, one bit each.                     byte 1   byte 2
  constexpr const uint8_t TOO_SHORT      = 0x01;  // 11______ 0_______
                                                  // 11______ 11______
  constexpr const uint8_t TOO_LONG       = 0x02;  // 0_______ 10______
  constexpr const uint8_t OVERLONG_3     = 0x04;  // 11100000 100_____
  constexpr const uint8_t TOO_LARGE      = 0x08;  // 11110100 1001____
                                                  // 11110100 101_____
                                                  // 11110101 1001____
                                                  // 11110101 101_____
                                                  // 1111011_ 1001____
                                                  // 1111011_ 101_____
                                                  // 11111___ 1001____
                                                  // 11111___ 101_____
  constexpr const uint8_t SURROGATE      = 0x10;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2     = 0x20;  // 1100000_ 10______
  constexpr const uint8_t TOO_LARGE_1000 = 0x40;  // 11110101 1000____
                                                  // 1111011_ 1000____
                                                  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4     = 0x40;  // 11110000 1000____ (shares bit 6 with TOO_LARGE_1000)
  constexpr const uint8_t TWO_CONTS      = 0x80;  // 10______ 10______

  // Classes that place no constraint on the low nibble of byte 1.
  constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;
  // Low nibble of byte 1 compatible with both "too large" classes.
  constexpr const uint8_t CARRY_LARGE = CARRY | TOO_LARGE | TOO_LARGE_1000;
  // Classes common to every continuation byte in position 2.
  constexpr const uint8_t CONT = TOO_LONG | OVERLONG_2 | TWO_CONTS;

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
    // 0_______ ________ <ASCII in byte 1>
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    // 10______ ________ <continuation in byte 1>
    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
    // 1100____ ________ <two byte lead in byte 1>
    TOO_SHORT | OVERLONG_2,
    // 1101____ ________ <two byte lead in byte 1>
    TOO_SHORT,
    // 1110____ ________ <three byte lead in byte 1>
    TOO_SHORT | OVERLONG_3 | SURROGATE,
    // 1111____ ________ <four+ byte lead in byte 1>
    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
  );

  const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
    // ____0000 ________
    CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
    // ____0001 ________
    CARRY | OVERLONG_2,
    // ____001_ ________
    CARRY,
    CARRY,
    // ____0100 ________
    CARRY | TOO_LARGE,
    // ____0101 ________
    CARRY_LARGE,
    // ____011_ ________
    CARRY_LARGE,
    CARRY_LARGE,
    // ____1___ ________ (1000 through 1100)
    CARRY_LARGE,
    CARRY_LARGE,
    CARRY_LARGE,
    CARRY_LARGE,
    CARRY_LARGE,
    // ____1101 ________
    CARRY_LARGE | SURROGATE,
    // ____111_ ________
    CARRY_LARGE,
    CARRY_LARGE
  );

  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
    // ________ 0_______ <ASCII in byte 2>
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    // ________ 1000____
    CONT | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
    // ________ 1001____
    CONT | OVERLONG_3 | TOO_LARGE,
    // ________ 101_____
    CONT | SURROGATE | TOO_LARGE,
    CONT | SURROGATE | TOO_LARGE,
    // ________ 11______
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
  );

  return (byte_1_high & byte_1_low & byte_2_high);
}
|
|
// Combines the special-case result sc with the continuation requirement derived
// from the bytes two and three positions back: the 0x80 bit of
// must_be_2_3_continuation(prev2, prev3) is XORed into sc.
// NOTE(review): must_be_2_3_continuation is defined elsewhere; presumably it
// flags positions that have to hold the 2nd/3rd continuation of a 3- or 4-byte
// sequence -- confirm against its definition.
simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
  const simd8<uint8_t> two_back = input.prev<2>(prev_input);
  const simd8<uint8_t> three_back = input.prev<3>(prev_input);
  simd8<uint8_t> required = simd8<uint8_t>(must_be_2_3_continuation(two_back, three_back));
  simd8<uint8_t> required_high_bit = required & uint8_t(0x80);
  return required_high_bit ^ sc;
}
|
|
|
|
//
|
|
// Return nonzero if there are incomplete multibyte characters at the end of the block:
|
|
// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
|
|
//
|
|
simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
|
|
// If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
|
|
// ... 1111____ 111_____ 11______
|
|
static const uint8_t max_array[32] = {
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
|
|
};
|
|
const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
|
|
return input.gt_bits(max_value);
|
|
}
|
|
|
|
struct utf8_checker {
|
|
// If this is nonzero, there has been a UTF-8 error.
|
|
simd8<uint8_t> error;
|
|
// The last input we received
|
|
simd8<uint8_t> prev_input_block;
|
|
// Whether the last input we received was incomplete (used for ASCII fast path)
|
|
simd8<uint8_t> prev_incomplete;
|
|
|
|
//
|
|
// Check whether the current bytes are valid UTF-8.
|
|
//
|
|
simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
|
|
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
|
|
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
|
|
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
|
|
simd8<uint8_t> sc = check_special_cases(input, prev1);
|
|
this->error |= check_multibyte_lengths(input, prev_input, sc);
|
|
}
|
|
|
|
// The only problem that can happen at EOF is that a multibyte character is too short
|
|
// or a byte value too large in the last bytes: check_special_cases only checks for bytes
|
|
// too large in the first of two bytes.
|
|
simdutf_really_inline void check_eof() {
|
|
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
|
|
// possibly finish them.
|
|
this->error |= this->prev_incomplete;
|
|
}
|
|
|
|
simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
|
|
if(simdutf_likely(is_ascii(input))) {
|
|
this->error |= this->prev_incomplete;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support either two or four chunks per 64-byte block.");
|
|
if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
|
|
this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
|
|
|
|
}
|
|
}
|
|
|
|
// do not forget to call check_eof!
|
|
simdutf_really_inline bool errors() const {
|
|
return this->error.any_bits_set_anywhere();
|
|
}
|
|
|
|
}; // struct utf8_checker
|
|
} // namespace utf8_validation
|
|
|
|
using utf8_validation::utf8_checker;
|
|
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
|
|
/* begin file src/generic/utf8_validation/utf8_validator.h */
|
|
namespace simdutf {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace utf8_validation {
|
|
|
|
/**
|
|
* Validates that the string is actual UTF-8.
|
|
*/
|
|
template<class checker>
|
|
bool generic_validate_utf8(const uint8_t * input, size_t length) {
|
|
checker c{};
|
|
buf_block_reader<64> reader(input, length);
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
c.check_eof();
|
|
return !c.errors();
|
|
}
|
|
|
|
bool generic_validate_utf8(const char * input, size_t length) {
|
|
return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
|
|
}
|
|
|
|
/**
|
|
* Validates that the string is actual UTF-8 and stops on errors.
|
|
*/
|
|
template<class checker>
|
|
result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
|
|
checker c{};
|
|
buf_block_reader<64> reader(input, length);
|
|
size_t count{0};
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
c.check_next_input(in);
|
|
if(c.errors()) {
|
|
if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
|
|
result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input), reinterpret_cast<const char*>(input + count), length - count);
|
|
res.count += count;
|
|
return res;
|
|
}
|
|
reader.advance();
|
|
count += 64;
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
c.check_eof();
|
|
if (c.errors()) {
|
|
if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
|
|
result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input), reinterpret_cast<const char*>(input) + count, length - count);
|
|
res.count += count;
|
|
return res;
|
|
} else {
|
|
return result(error_code::SUCCESS, length);
|
|
}
|
|
}
|
|
|
|
// Convenience overload for char buffers; validates with utf8_checker.
result generic_validate_utf8_with_errors(const char * input, size_t length) {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(input);
  return generic_validate_utf8_with_errors<utf8_checker>(bytes, length);
}
|
|
|
|
template<class checker>
|
|
bool generic_validate_ascii(const uint8_t * input, size_t length) {
|
|
buf_block_reader<64> reader(input, length);
|
|
uint8_t blocks[64]{};
|
|
simd::simd8x64<uint8_t> running_or(blocks);
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
running_or |= in;
|
|
reader.advance();
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
running_or |= in;
|
|
return running_or.is_ascii();
|
|
}
|
|
|
|
bool generic_validate_ascii(const char * input, size_t length) {
|
|
return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
|
|
}
|
|
|
|
template<class checker>
|
|
result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
|
|
buf_block_reader<64> reader(input, length);
|
|
size_t count{0};
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
if (!in.is_ascii()) {
|
|
result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
|
|
return result(res.error, count + res.count);
|
|
}
|
|
reader.advance();
|
|
|
|
count += 64;
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
if (!in.is_ascii()) {
|
|
result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
|
|
return result(res.error, count + res.count);
|
|
} else {
|
|
return result(error_code::SUCCESS, length);
|
|
}
|
|
}
|
|
|
|
// Convenience overload for char buffers.
result generic_validate_ascii_with_errors(const char * input, size_t length) {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(input);
  return generic_validate_ascii_with_errors<utf8_checker>(bytes, length);
}
|
|
|
|
} // namespace utf8_validation
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_validation/utf8_validator.h */
|
|
// transcoding from UTF-8 to UTF-16
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
|
|
/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
|
|
|
|
|
|
namespace simdutf {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace utf8_to_utf16 {
|
|
|
|
using namespace simd;
|
|
|
|
template <endianness endian>
|
|
simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
|
|
char16_t* utf16_output) noexcept {
|
|
// The implementation is not specific to haswell and should be moved to the generic directory.
|
|
size_t pos = 0;
|
|
char16_t* start{utf16_output};
|
|
const size_t safety_margin = 16; // to avoid overruns!
|
|
while(pos + 64 + safety_margin <= size) {
|
|
// this loop could be unrolled further. For example, we could process the mask
|
|
// far more than 64 bytes.
|
|
simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
|
|
if(in.is_ascii()) {
|
|
in.store_ascii_as_utf16<endian>(utf16_output);
|
|
utf16_output += 64;
|
|
pos += 64;
|
|
} else {
|
|
// Slow path. We hope that the compiler will recognize that this is a slow path.
|
|
// Anything that is not a continuation mask is a 'leading byte', that is, the
|
|
// start of a new code point.
|
|
uint64_t utf8_continuation_mask = in.lt(-65 + 1);
|
|
// -65 is 0b10111111 in two-complement's, so largest possible continuation byte
|
|
uint64_t utf8_leading_mask = ~utf8_continuation_mask;
|
|
// The *start* of code points is not so useful, rather, we want the *end* of code points.
|
|
uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
|
|
// We process in blocks of up to 12 bytes except possibly
|
|
// for fast paths which may process up to 16 bytes. For the
|
|
// slow path to work, we should have at least 12 input bytes left.
|
|
size_t max_starting_point = (pos + 64) - 12;
|
|
// Next loop is going to run at least five times when using solely
|
|
// the slow/regular path, and at least four times if there are fast paths.
|
|
while(pos < max_starting_point) {
|
|
// Performance note: our ability to compute 'consumed' and
|
|
// then shift and recompute is critical. If there is a
|
|
// latency of, say, 4 cycles on getting 'consumed', then
|
|
// the inner loop might have a total latency of about 6 cycles.
|
|
// Yet we process between 6 to 12 inputs bytes, thus we get
|
|
// a speed limit between 1 cycle/byte and 0.5 cycle/byte
|
|
// for this section of the code. Hence, there is a limit
|
|
// to how much we can further increase this latency before
|
|
// it seriously harms performance.
|
|
//
|
|
// Thus we may allow convert_masked_utf8_to_utf16 to process
|
|
// more bytes at a time under a fast-path mode where 16 bytes
|
|
// are consumed at once (e.g., when encountering ASCII).
|
|
size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
|
|
utf8_end_of_code_point_mask, utf16_output);
|
|
pos += consumed;
|
|
utf8_end_of_code_point_mask >>= consumed;
|
|
}
|
|
// At this point there may remain between 0 and 12 bytes in the
|
|
// 64-byte block. These bytes will be processed again. So we have an
|
|
// 80% efficiency (in the worst case). In practice we expect an
|
|
// 85% to 90% efficiency.
|
|
}
|
|
}
|
|
utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
|
|
return utf16_output - start;
|
|
}
|
|
|
|
} // namespace utf8_to_utf16
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
|
|
/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
|
|
|
|
|
|
namespace simdutf {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace utf8_to_utf16 {
|
|
using namespace simd;
|
|
|
|
|
|
// Classifies every adjacent byte pair (prev1 = byte 1, input = byte 2) with
// three 16-entry nibble lookups. Each lookup yields the set of error classes
// its nibble is compatible with; a bit survives the final AND only when all
// three nibbles agree, so a nonzero result byte marks an invalid pair.
simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Error classes, one bit each.                     byte 1   byte 2
  constexpr const uint8_t TOO_SHORT      = 0x01;  // 11______ 0_______
                                                  // 11______ 11______
  constexpr const uint8_t TOO_LONG       = 0x02;  // 0_______ 10______
  constexpr const uint8_t OVERLONG_3     = 0x04;  // 11100000 100_____
  constexpr const uint8_t TOO_LARGE      = 0x08;  // 11110100 1001____
                                                  // 11110100 101_____
                                                  // 11110101 1001____
                                                  // 11110101 101_____
                                                  // 1111011_ 1001____
                                                  // 1111011_ 101_____
                                                  // 11111___ 1001____
                                                  // 11111___ 101_____
  constexpr const uint8_t SURROGATE      = 0x10;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2     = 0x20;  // 1100000_ 10______
  constexpr const uint8_t TOO_LARGE_1000 = 0x40;  // 11110101 1000____
                                                  // 1111011_ 1000____
                                                  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4     = 0x40;  // 11110000 1000____ (shares bit 6 with TOO_LARGE_1000)
  constexpr const uint8_t TWO_CONTS      = 0x80;  // 10______ 10______

  // Classes that place no constraint on the low nibble of byte 1.
  constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;
  // Low nibble of byte 1 compatible with both "too large" classes.
  constexpr const uint8_t CARRY_LARGE = CARRY | TOO_LARGE | TOO_LARGE_1000;
  // Classes common to every continuation byte in position 2.
  constexpr const uint8_t CONT = TOO_LONG | OVERLONG_2 | TWO_CONTS;

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
    // 0_______ ________ <ASCII in byte 1>
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    // 10______ ________ <continuation in byte 1>
    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
    // 1100____ ________ <two byte lead in byte 1>
    TOO_SHORT | OVERLONG_2,
    // 1101____ ________ <two byte lead in byte 1>
    TOO_SHORT,
    // 1110____ ________ <three byte lead in byte 1>
    TOO_SHORT | OVERLONG_3 | SURROGATE,
    // 1111____ ________ <four+ byte lead in byte 1>
    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
  );

  const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
    // ____0000 ________
    CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
    // ____0001 ________
    CARRY | OVERLONG_2,
    // ____001_ ________
    CARRY,
    CARRY,
    // ____0100 ________
    CARRY | TOO_LARGE,
    // ____0101 ________
    CARRY_LARGE,
    // ____011_ ________
    CARRY_LARGE,
    CARRY_LARGE,
    // ____1___ ________ (1000 through 1100)
    CARRY_LARGE,
    CARRY_LARGE,
    CARRY_LARGE,
    CARRY_LARGE,
    CARRY_LARGE,
    // ____1101 ________
    CARRY_LARGE | SURROGATE,
    // ____111_ ________
    CARRY_LARGE,
    CARRY_LARGE
  );

  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
    // ________ 0_______ <ASCII in byte 2>
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    // ________ 1000____
    CONT | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
    // ________ 1001____
    CONT | OVERLONG_3 | TOO_LARGE,
    // ________ 101_____
    CONT | SURROGATE | TOO_LARGE,
    CONT | SURROGATE | TOO_LARGE,
    // ________ 11______
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
  );

  return (byte_1_high & byte_1_low & byte_2_high);
}
|
|
// Combines the special-case classification `sc` with the positions where a
// second or third continuation byte is required.
//
// The bytes two and three positions back (pulled from prev_input where they
// fall before the start of `input`) are passed to must_be_2_3_continuation(),
// which is defined elsewhere. Only bit 7 (0x80) of that result is kept and it
// is XORed into `sc`; 0x80 is the bit check_special_cases() uses for
// TWO_CONTS. NOTE(review): presumably this cancels the "two continuations"
// flag exactly where a continuation is expected -- confirm against
// must_be_2_3_continuation().
simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
  const simd8<uint8_t> two_back = input.prev<2>(prev_input);
  const simd8<uint8_t> three_back = input.prev<3>(prev_input);
  const simd8<uint8_t> continuation_required = simd8<uint8_t>(must_be_2_3_continuation(two_back, three_back));
  return (continuation_required & uint8_t(0x80)) ^ sc;
}
|
|
|
|
|
|
struct validating_transcoder {
|
|
// If this is nonzero, there has been a UTF-8 error.
|
|
simd8<uint8_t> error;
|
|
|
|
validating_transcoder() : error(uint8_t(0)) {}
|
|
//
|
|
// Check whether the current bytes are valid UTF-8.
|
|
//
|
|
simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
|
|
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
|
|
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
|
|
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
|
|
simd8<uint8_t> sc = check_special_cases(input, prev1);
|
|
this->error |= check_multibyte_lengths(input, prev_input, sc);
|
|
}
|
|
|
|
|
|
template <endianness endian>
|
|
simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) {
|
|
size_t pos = 0;
|
|
char16_t* start{utf16_output};
|
|
// In the worst case, we have the haswell kernel which can cause an overflow of
|
|
// 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
|
|
// and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
|
|
// much more than 8 bytes. However, you cannot generally assume that you have valid
|
|
// UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
|
|
// to give us a good margin.
|
|
size_t leading_byte = 0;
|
|
size_t margin = size;
|
|
for(; margin > 0 && leading_byte < 8; margin--) {
|
|
leading_byte += (int8_t(in[margin-1]) > -65);
|
|
}
|
|
// If the input is long enough, then we have that margin-1 is the eight last leading byte.
|
|
const size_t safety_margin = size - margin + 1; // to avoid overruns!
|
|
while(pos + 64 + safety_margin <= size) {
|
|
simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
|
|
if(input.is_ascii()) {
|
|
input.store_ascii_as_utf16<endian>(utf16_output);
|
|
utf16_output += 64;
|
|
pos += 64;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support either two or four chunks per 64-byte block.");
|
|
auto zero = simd8<uint8_t>{uint8_t(0)};
|
|
if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
uint64_t utf8_continuation_mask = input.lt(-65 + 1);
|
|
uint64_t utf8_leading_mask = ~utf8_continuation_mask;
|
|
uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
|
|
// We process in blocks of up to 12 bytes except possibly
|
|
// for fast paths which may process up to 16 bytes. For the
|
|
// slow path to work, we should have at least 12 input bytes left.
|
|
size_t max_starting_point = (pos + 64) - 12;
|
|
// Next loop is going to run at least five times.
|
|
while(pos < max_starting_point) {
|
|
// Performance note: our ability to compute 'consumed' and
|
|
// then shift and recompute is critical. If there is a
|
|
// latency of, say, 4 cycles on getting 'consumed', then
|
|
// the inner loop might have a total latency of about 6 cycles.
|
|
// Yet we process between 6 to 12 inputs bytes, thus we get
|
|
// a speed limit between 1 cycle/byte and 0.5 cycle/byte
|
|
// for this section of the code. Hence, there is a limit
|
|
// to how much we can further increase this latency before
|
|
// it seriously harms performance.
|
|
size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
|
|
utf8_end_of_code_point_mask, utf16_output);
|
|
pos += consumed;
|
|
utf8_end_of_code_point_mask >>= consumed;
|
|
}
|
|
// At this point there may remain between 0 and 12 bytes in the
|
|
// 64-byte block. These bytes will be processed again. So we have an
|
|
// 80% efficiency (in the worst case). In practice we expect an
|
|
// 85% to 90% efficiency.
|
|
}
|
|
}
|
|
if(errors()) { return 0; }
|
|
if(pos < size) {
|
|
size_t howmany = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
|
|
if(howmany == 0) { return 0; }
|
|
utf16_output += howmany;
|
|
}
|
|
return utf16_output - start;
|
|
}
|
|
|
|
template <endianness endian>
|
|
simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) {
|
|
size_t pos = 0;
|
|
char16_t* start{utf16_output};
|
|
// In the worst case, we have the haswell kernel which can cause an overflow of
|
|
// 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
|
|
// and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
|
|
// much more than 8 bytes. However, you cannot generally assume that you have valid
|
|
// UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
|
|
// to give us a good margin.
|
|
size_t leading_byte = 0;
|
|
size_t margin = size;
|
|
for(; margin > 0 && leading_byte < 8; margin--) {
|
|
leading_byte += (int8_t(in[margin-1]) > -65);
|
|
}
|
|
// If the input is long enough, then we have that margin-1 is the eight last leading byte.
|
|
const size_t safety_margin = size - margin + 1; // to avoid overruns!
|
|
while(pos + 64 + safety_margin <= size) {
|
|
simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
|
|
if(input.is_ascii()) {
|
|
input.store_ascii_as_utf16<endian>(utf16_output);
|
|
utf16_output += 64;
|
|
pos += 64;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support either two or four chunks per 64-byte block.");
|
|
auto zero = simd8<uint8_t>{uint8_t(0)};
|
|
if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
if (errors()) {
|
|
// rewind_and_convert_with_errors will seek a potential error from in+pos onward,
|
|
// with the ability to go back up to pos bytes, and read size-pos bytes forward.
|
|
result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
|
|
res.count += pos;
|
|
return res;
|
|
}
|
|
uint64_t utf8_continuation_mask = input.lt(-65 + 1);
|
|
uint64_t utf8_leading_mask = ~utf8_continuation_mask;
|
|
uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
|
|
// We process in blocks of up to 12 bytes except possibly
|
|
// for fast paths which may process up to 16 bytes. For the
|
|
// slow path to work, we should have at least 12 input bytes left.
|
|
size_t max_starting_point = (pos + 64) - 12;
|
|
// Next loop is going to run at least five times.
|
|
while(pos < max_starting_point) {
|
|
// Performance note: our ability to compute 'consumed' and
|
|
// then shift and recompute is critical. If there is a
|
|
// latency of, say, 4 cycles on getting 'consumed', then
|
|
// the inner loop might have a total latency of about 6 cycles.
|
|
// Yet we process between 6 to 12 inputs bytes, thus we get
|
|
// a speed limit between 1 cycle/byte and 0.5 cycle/byte
|
|
// for this section of the code. Hence, there is a limit
|
|
// to how much we can further increase this latency before
|
|
// it seriously harms performance.
|
|
size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
|
|
utf8_end_of_code_point_mask, utf16_output);
|
|
pos += consumed;
|
|
utf8_end_of_code_point_mask >>= consumed;
|
|
}
|
|
// At this point there may remain between 0 and 12 bytes in the
|
|
// 64-byte block. These bytes will be processed again. So we have an
|
|
// 80% efficiency (in the worst case). In practice we expect an
|
|
// 85% to 90% efficiency.
|
|
}
|
|
}
|
|
if(errors()) {
|
|
// rewind_and_convert_with_errors will seek a potential error from in+pos onward,
|
|
// with the ability to go back up to pos bytes, and read size-pos bytes forward.
|
|
result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
|
|
res.count += pos;
|
|
return res;
|
|
}
|
|
if(pos < size) {
|
|
// rewind_and_convert_with_errors will seek a potential error from in+pos onward,
|
|
// with the ability to go back up to pos bytes, and read size-pos bytes forward.
|
|
result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
|
|
if (res.error) { // In case of error, we want the error position
|
|
res.count += pos;
|
|
return res;
|
|
} else { // In case of success, we want the number of word written
|
|
utf16_output += res.count;
|
|
}
|
|
}
|
|
return result(error_code::SUCCESS, utf16_output - start);
|
|
}
|
|
|
|
simdutf_really_inline bool errors() const {
|
|
return this->error.any_bits_set_anywhere();
|
|
}
|
|
|
|
}; // struct utf8_checker
|
|
} // utf8_to_utf16 namespace
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
|
|
// transcoding from UTF-8 to UTF-32
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h
|
|
/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
|
|
|
|
namespace simdutf {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace utf8_to_utf32 {
|
|
|
|
using namespace simd;
|
|
|
|
|
|
// Transcodes `size` bytes of UTF-8 at `input` to UTF-32 without validating
// them, writing to `utf32_output`. Returns the number of char32_t values
// written.
//
// Full 64-byte blocks are handled with SIMD as long as at least 16 further
// bytes remain after the block; the rest goes through the scalar converter.
simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
    char32_t* utf32_output) noexcept {
  char32_t* const output_begin = utf32_output;
  const size_t safety_margin = 16; // to avoid overruns!
  size_t pos = 0;

  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));

    // Fast path: a block of pure ASCII is widened and stored directly.
    if (in.is_ascii()) {
      in.store_ascii_as_utf32(utf32_output);
      utf32_output += 64;
      pos += 64;
      continue;
    }

    // -65 is 0b10111111 in two's complement, so it is the largest possible
    // continuation byte; lt(-65 + 1) therefore selects the continuation bytes.
    const uint64_t continuation_mask = in.lt(-65 + 1);
    // Every non-continuation byte starts a code point; shifting right by one
    // marks the last byte of each code point instead.
    uint64_t end_of_code_point_mask = ~continuation_mask >> 1;

    // Stop 12 bytes short of the block end; those bytes are picked up again
    // by the next iteration or by the scalar tail.
    const size_t max_starting_point = (pos + 64) - 12;
    while (pos < max_starting_point) {
      const size_t consumed = convert_masked_utf8_to_utf32(input + pos,
          end_of_code_point_mask, utf32_output);
      pos += consumed;
      end_of_code_point_mask >>= consumed;
    }
  }

  // Scalar conversion of whatever the vectorized loop left over.
  const size_t tail_written = scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
  utf32_output += tail_written;
  return utf32_output - output_begin;
}
|
|
|
|
|
|
} // namespace utf8_to_utf32
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
|
|
/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
|
|
|
|
|
|
namespace simdutf {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace utf8_to_utf32 {
|
|
using namespace simd;
|
|
|
|
|
|
// Classifies every adjacent byte pair against the UTF-8 error classes below.
// "Byte 1" is the lane taken from prev1 and "byte 2" is the same lane of input.
//
// Three 16-entry tables are consulted: one indexed by the high nibble of
// byte 1, one by the low nibble of byte 1, and one by the high nibble of
// byte 2. Each entry is the set of error classes that nibble value is
// compatible with. The returned lane is the bitwise AND of the three entries,
// so a class bit survives only if all three nibbles fit its pattern.
simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // One bit per error class; patterns are written "byte 1  byte 2".
  constexpr const uint8_t TOO_SHORT      = 1<<0; // 11______ 0_______  (lead byte/ASCII followed by lead byte/ASCII)
                                                 // 11______ 11______
  constexpr const uint8_t TOO_LONG       = 1<<1; // 0_______ 10______  (ASCII followed by continuation)
  constexpr const uint8_t OVERLONG_3     = 1<<2; // 11100000 100_____
  constexpr const uint8_t TOO_LARGE      = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
                                                 // 1111011_ 1001____
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
  constexpr const uint8_t SURROGATE      = 1<<4; // 11101101 101_____
  constexpr const uint8_t OVERLONG_2     = 1<<5; // 1100000_ 10______
  constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
                                                 // 11110101 1000____
                                                 // 1111011_ 1000____
                                                 // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4     = 1<<6; // 11110000 1000____  (deliberately shares bit 6 with TOO_LARGE_1000)
  constexpr const uint8_t TWO_CONTS      = 1<<7; // 10______ 10______

  // Bit groups that recur in the tables below.
  constexpr const uint8_t CARRY       = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in the low nibble of byte 1.
  constexpr const uint8_t ANY_LARGE   = TOO_LARGE | TOO_LARGE_1000;
  constexpr const uint8_t CARRY_LARGE = CARRY | ANY_LARGE;
  constexpr const uint8_t CONT_COMMON = TOO_LONG | OVERLONG_2 | TWO_CONTS; // classes whose byte 2 is any continuation

  // Table 1: indexed by the high nibble of byte 1.
  const simd8<uint8_t> prev_high_classes = prev1.shr<4>().lookup_16<uint8_t>(
    // 0_______ ________ <ASCII in byte 1>
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    // 10______ ________ <continuation in byte 1>
    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
    // 1100____ ________ <two byte lead in byte 1>
    TOO_SHORT | OVERLONG_2,
    // 1101____ ________ <two byte lead in byte 1>
    TOO_SHORT,
    // 1110____ ________ <three byte lead in byte 1>
    TOO_SHORT | OVERLONG_3 | SURROGATE,
    // 1111____ ________ <four+ byte lead in byte 1>
    TOO_SHORT | ANY_LARGE | OVERLONG_4
  );

  // Table 2: indexed by the low nibble of byte 1.
  const simd8<uint8_t> prev_low_classes = (prev1 & 0x0F).lookup_16<uint8_t>(
    // ____0000 ________
    CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
    // ____0001 ________
    CARRY | OVERLONG_2,
    // ____001_ ________
    CARRY,
    CARRY,

    // ____0100 ________
    CARRY | TOO_LARGE,
    // ____0101 ________
    CARRY_LARGE,
    // ____011_ ________
    CARRY_LARGE,
    CARRY_LARGE,

    // ____1___ ________
    CARRY_LARGE,
    CARRY_LARGE,
    CARRY_LARGE,
    CARRY_LARGE,
    CARRY_LARGE,
    // ____1101 ________
    CARRY_LARGE | SURROGATE,
    CARRY_LARGE,
    CARRY_LARGE
  );

  // Table 3: indexed by the high nibble of byte 2.
  const simd8<uint8_t> cur_high_classes = input.shr<4>().lookup_16<uint8_t>(
    // ________ 0_______ <ASCII in byte 2>
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

    // ________ 1000____
    CONT_COMMON | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
    // ________ 1001____
    CONT_COMMON | OVERLONG_3 | TOO_LARGE,
    // ________ 101_____
    CONT_COMMON | SURROGATE | TOO_LARGE,
    CONT_COMMON | SURROGATE | TOO_LARGE,

    // ________ 11______
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
  );

  // A class is reported only when all three nibble lookups agree on it.
  return (prev_high_classes & prev_low_classes & cur_high_classes);
}
|
|
// Combines the special-case classification `sc` with the positions where a
// second or third continuation byte is required.
//
// The bytes two and three positions back (pulled from prev_input where they
// fall before the start of `input`) are passed to must_be_2_3_continuation(),
// which is defined elsewhere. Only bit 7 (0x80) of that result is kept and it
// is XORed into `sc`; 0x80 is the bit check_special_cases() uses for
// TWO_CONTS. NOTE(review): presumably this cancels the "two continuations"
// flag exactly where a continuation is expected -- confirm against
// must_be_2_3_continuation().
simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
  const simd8<uint8_t> two_back = input.prev<2>(prev_input);
  const simd8<uint8_t> three_back = input.prev<3>(prev_input);
  const simd8<uint8_t> continuation_required = simd8<uint8_t>(must_be_2_3_continuation(two_back, three_back));
  return (continuation_required & uint8_t(0x80)) ^ sc;
}
|
|
|
|
|
|
struct validating_transcoder {
|
|
// If this is nonzero, there has been a UTF-8 error.
|
|
simd8<uint8_t> error;
|
|
|
|
validating_transcoder() : error(uint8_t(0)) {}
|
|
//
|
|
// Check whether the current bytes are valid UTF-8.
|
|
//
|
|
simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
|
|
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
|
|
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
|
|
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
|
|
simd8<uint8_t> sc = check_special_cases(input, prev1);
|
|
this->error |= check_multibyte_lengths(input, prev_input, sc);
|
|
}
|
|
|
|
|
|
|
|
simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) {
|
|
size_t pos = 0;
|
|
char32_t* start{utf32_output};
|
|
// In the worst case, we have the haswell kernel which can cause an overflow of
|
|
// 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
|
|
// and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
|
|
// much more than 8 bytes. However, you cannot generally assume that you have valid
|
|
// UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
|
|
// to give us a good margin.
|
|
size_t leading_byte = 0;
|
|
size_t margin = size;
|
|
for(; margin > 0 && leading_byte < 4; margin--) {
|
|
leading_byte += (int8_t(in[margin-1]) > -65);
|
|
}
|
|
// If the input is long enough, then we have that margin-1 is the fourth last leading byte.
|
|
const size_t safety_margin = size - margin + 1; // to avoid overruns!
|
|
while(pos + 64 + safety_margin <= size) {
|
|
simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
|
|
if(input.is_ascii()) {
|
|
input.store_ascii_as_utf32(utf32_output);
|
|
utf32_output += 64;
|
|
pos += 64;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support either two or four chunks per 64-byte block.");
|
|
auto zero = simd8<uint8_t>{uint8_t(0)};
|
|
if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
uint64_t utf8_continuation_mask = input.lt(-65 + 1);
|
|
uint64_t utf8_leading_mask = ~utf8_continuation_mask;
|
|
uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
|
|
// We process in blocks of up to 12 bytes except possibly
|
|
// for fast paths which may process up to 16 bytes. For the
|
|
// slow path to work, we should have at least 12 input bytes left.
|
|
size_t max_starting_point = (pos + 64) - 12;
|
|
// Next loop is going to run at least five times.
|
|
while(pos < max_starting_point) {
|
|
// Performance note: our ability to compute 'consumed' and
|
|
// then shift and recompute is critical. If there is a
|
|
// latency of, say, 4 cycles on getting 'consumed', then
|
|
// the inner loop might have a total latency of about 6 cycles.
|
|
// Yet we process between 6 to 12 inputs bytes, thus we get
|
|
// a speed limit between 1 cycle/byte and 0.5 cycle/byte
|
|
// for this section of the code. Hence, there is a limit
|
|
// to how much we can further increase this latency before
|
|
// it seriously harms performance.
|
|
size_t consumed = convert_masked_utf8_to_utf32(in + pos,
|
|
utf8_end_of_code_point_mask, utf32_output);
|
|
pos += consumed;
|
|
utf8_end_of_code_point_mask >>= consumed;
|
|
}
|
|
// At this point there may remain between 0 and 12 bytes in the
|
|
// 64-byte block. These bytes will be processed again. So we have an
|
|
// 80% efficiency (in the worst case). In practice we expect an
|
|
// 85% to 90% efficiency.
|
|
}
|
|
}
|
|
if(errors()) { return 0; }
|
|
if(pos < size) {
|
|
size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
|
|
if(howmany == 0) { return 0; }
|
|
utf32_output += howmany;
|
|
}
|
|
return utf32_output - start;
|
|
}
|
|
|
|
simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) {
|
|
size_t pos = 0;
|
|
char32_t* start{utf32_output};
|
|
// In the worst case, we have the haswell kernel which can cause an overflow of
|
|
// 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
|
|
// and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
|
|
// much more than 8 bytes. However, you cannot generally assume that you have valid
|
|
// UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
|
|
// to give us a good margin.
|
|
size_t leading_byte = 0;
|
|
size_t margin = size;
|
|
for(; margin > 0 && leading_byte < 4; margin--) {
|
|
leading_byte += (int8_t(in[margin-1]) > -65);
|
|
}
|
|
// If the input is long enough, then we have that margin-1 is the fourth last leading byte.
|
|
const size_t safety_margin = size - margin + 1; // to avoid overruns!
|
|
while(pos + 64 + safety_margin <= size) {
|
|
simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
|
|
if(input.is_ascii()) {
|
|
input.store_ascii_as_utf32(utf32_output);
|
|
utf32_output += 64;
|
|
pos += 64;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support either two or four chunks per 64-byte block.");
|
|
auto zero = simd8<uint8_t>{uint8_t(0)};
|
|
if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
if (errors()) {
|
|
result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
|
|
res.count += pos;
|
|
return res;
|
|
}
|
|
uint64_t utf8_continuation_mask = input.lt(-65 + 1);
|
|
uint64_t utf8_leading_mask = ~utf8_continuation_mask;
|
|
uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
|
|
// We process in blocks of up to 12 bytes except possibly
|
|
// for fast paths which may process up to 16 bytes. For the
|
|
// slow path to work, we should have at least 12 input bytes left.
|
|
size_t max_starting_point = (pos + 64) - 12;
|
|
// Next loop is going to run at least five times.
|
|
while(pos < max_starting_point) {
|
|
// Performance note: our ability to compute 'consumed' and
|
|
// then shift and recompute is critical. If there is a
|
|
// latency of, say, 4 cycles on getting 'consumed', then
|
|
// the inner loop might have a total latency of about 6 cycles.
|
|
// Yet we process between 6 to 12 inputs bytes, thus we get
|
|
// a speed limit between 1 cycle/byte and 0.5 cycle/byte
|
|
// for this section of the code. Hence, there is a limit
|
|
// to how much we can further increase this latency before
|
|
// it seriously harms performance.
|
|
size_t consumed = convert_masked_utf8_to_utf32(in + pos,
|
|
utf8_end_of_code_point_mask, utf32_output);
|
|
pos += consumed;
|
|
utf8_end_of_code_point_mask >>= consumed;
|
|
}
|
|
// At this point there may remain between 0 and 12 bytes in the
|
|
// 64-byte block. These bytes will be processed again. So we have an
|
|
// 80% efficiency (in the worst case). In practice we expect an
|
|
// 85% to 90% efficiency.
|
|
}
|
|
}
|
|
if(errors()) {
|
|
result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
|
|
res.count += pos;
|
|
return res;
|
|
}
|
|
if(pos < size) {
|
|
result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
|
|
if (res.error) { // In case of error, we want the error position
|
|
res.count += pos;
|
|
return res;
|
|
} else { // In case of success, we want the number of word written
|
|
utf32_output += res.count;
|
|
}
|
|
}
|
|
return result(error_code::SUCCESS, utf32_output - start);
|
|
}
|
|
|
|
simdutf_really_inline bool errors() const {
|
|
return this->error.any_bits_set_anywhere();
|
|
}
|
|
|
|
}; // struct utf8_checker
|
|
} // utf8_to_utf32 namespace
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
|
|
// other functions
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8.h
|
|
/* begin file src/generic/utf8.h */
|
|
|
|
namespace simdutf {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace utf8 {
|
|
|
|
using namespace simd;
|
|
|
|
// Counts the code points in `size` bytes of UTF-8 at `in` by counting the
// bytes that are not continuation bytes. Full 64-byte blocks are handled
// with SIMD; the remainder is delegated to the scalar routine.
simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
  size_t code_points = 0;
  size_t offset = 0;
  while (offset + 64 <= size) {
    simd8x64<int8_t> block(reinterpret_cast<const int8_t *>(in + offset));
    // Signed bytes below -64 are the continuation bytes (0x80..0xBF).
    const uint64_t continuation_mask = block.lt(-65 + 1);
    code_points += 64 - count_ones(continuation_mask);
    offset += 64;
  }
  return code_points + scalar::utf8::count_code_points(in + offset, size - offset);
}
|
|
|
|
|
|
// Computes how many UTF-16 code units are needed to hold `size` bytes of
// UTF-8 at `in`.
//
// Every non-continuation byte contributes one unit, and every byte >= 240
// (0xF0, the lead of a 4-byte sequence) contributes one more, because such a
// code point becomes a surrogate pair in UTF-16. Full 64-byte blocks are
// handled with SIMD; the remainder is delegated to the scalar routine.
simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
  size_t pos = 0;
  size_t count = 0;
  // This algorithm could no doubt be improved!
  for(;pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    // Signed bytes below -64 are the continuation bytes (0x80..0xBF).
    const uint64_t utf8_continuation_mask = input.lt(-65 + 1);
    // We count one word for anything that is not a continuation (so
    // leading bytes).
    count += 64 - count_ones(utf8_continuation_mask);
    // Kept unsigned like the mask above: this is a 64-bit lane mask, not a number.
    const uint64_t utf8_4byte = input.gteq_unsigned(240);
    count += count_ones(utf8_4byte);
  }
  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
}
|
|
|
|
|
|
// Number of UTF-32 code units needed for `size` bytes of UTF-8 at `in`.
// This is simply the code point count, so the work is delegated to
// count_code_points().
simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) {
  const size_t code_points = count_code_points(in, size);
  return code_points;
}
|
|
} // utf8 namespace
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf16.h
|
|
/* begin file src/generic/utf16.h */
|
|
namespace simdutf {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace utf16 {
|
|
|
|
template <endianness big_endian>
|
|
simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) {
|
|
size_t pos = 0;
|
|
size_t count = 0;
|
|
for(;pos + 32 <= size; pos += 32) {
|
|
simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
|
|
if (!match_system(big_endian)) input.swap_bytes();
|
|
uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
|
|
count += count_ones(not_pair) / 2;
|
|
}
|
|
return count + scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
|
|
}
|
|
|
|
template <endianness big_endian>
|
|
simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) {
|
|
size_t pos = 0;
|
|
size_t count = 0;
|
|
// This algorithm could no doubt be improved!
|
|
for(;pos + 32 <= size; pos += 32) {
|
|
simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
|
|
if (!match_system(big_endian)) input.swap_bytes();
|
|
uint64_t ascii_mask = input.lteq(0x7F);
|
|
uint64_t twobyte_mask = input.lteq(0x7FF);
|
|
uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
|
|
|
|
size_t ascii_count = count_ones(ascii_mask) / 2;
|
|
size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2;
|
|
size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2;
|
|
size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
|
|
count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
|
|
}
|
|
return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos, size - pos);
|
|
}
|
|
|
|
template <endianness big_endian>
|
|
simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) {
|
|
return count_code_points<big_endian>(in, size);
|
|
}
|
|
|
|
// Copies `size` UTF-16 code units from `in` to `output` with the two bytes
// of every code unit swapped. Blocks of 32 code units are handled with SIMD;
// the remainder is delegated to the scalar routine.
simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) {
  size_t offset = 0;
  for (; offset + 32 <= size; offset += 32, output += 32) {
    simd16x32<uint16_t> block(reinterpret_cast<const uint16_t *>(in + offset));
    block.swap_bytes();
    block.store(reinterpret_cast<uint16_t *>(output));
  }

  // Fewer than 32 code units are left at this point.
  scalar::utf16::change_endianness_utf16(in + offset, size - offset, output);
}
|
|
|
|
} // utf16
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf16.h */
|
|
//
|
|
// Implementation-specific overrides
|
|
//
|
|
namespace simdutf {
|
|
namespace arm64 {
|
|
|
|
// Guesses which encodings `length` bytes at `input` could be in.
//
// A byte order mark, when present, decides the answer. Otherwise inputs of
// even length go through the full arm64 detection routine, while inputs of
// odd length are only checked for UTF-8 validity.
simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
  // If there is a BOM, then we trust it.
  const auto bom_encoding = simdutf::BOM::check_bom(input, length);
  if (bom_encoding != encoding_type::unspecified) {
    return bom_encoding;
  }

  const bool even_length = (length % 2 == 0);
  if (even_length) {
    return arm_detect_encodings<utf8_validation::utf8_checker>(input, length);
  }

  // Odd length: UTF-8 is the only candidate that is tested here.
  return implementation::validate_utf8(input, length)
      ? simdutf::encoding_type::UTF8
      : simdutf::encoding_type::unspecified;
}
|
|
|
|
// Validates `len` bytes at `buf` as UTF-8 by delegating to the generic
// validator instantiated for arm64. Returns that validator's verdict.
simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  const bool verdict = arm64::utf8_validation::generic_validate_utf8(buf, len);
  return verdict;
}
|
|
|
|
// Validates `len` bytes at `buf` as UTF-8 and reports the outcome as a
// `result`, by delegating to the generic validator instantiated for arm64.
simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
  const result outcome = arm64::utf8_validation::generic_validate_utf8_with_errors(buf, len);
  return outcome;
}
|
|
|
|
// Validates `len` bytes at `buf` as ASCII by delegating to the generic
// validator instantiated for arm64. Returns that validator's verdict.
simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
  const bool verdict = arm64::utf8_validation::generic_validate_ascii(buf, len);
  return verdict;
}
|
|
|
|
// Validates `len` bytes at `buf` as ASCII and reports the outcome as a
// `result`, by delegating to the generic validator instantiated for arm64.
simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
  const result outcome = arm64::utf8_validation::generic_validate_ascii_with_errors(buf, len);
  return outcome;
}
|
|
|
|
// Validates `len` little-endian UTF-16 code units at `buf`.
//
// The NEON routine runs first and returns either a null pointer, which is
// treated as "invalid", or the position from which the scalar validator
// must check the remaining code units.
simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
  const char16_t* const scalar_start = arm_validate_utf16<endianness::LITTLE>(buf, len);
  if (scalar_start == nullptr) {
    return false;
  }
  const size_t already_checked = scalar_start - buf;
  return scalar::utf16::validate<endianness::LITTLE>(scalar_start, len - already_checked);
}
|
|
|
|
// Validate `len` big-endian UTF-16 code units.
// arm_validate_utf16 returns a pointer to the part it did not handle, or a
// null pointer, in which case validation fails. The remainder is checked by
// the scalar validator.
simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
  const char16_t* rest = arm_validate_utf16<endianness::BIG>(buf, len);
  if (rest == nullptr) { return false; }

  const size_t consumed = static_cast<size_t>(rest - buf);
  return scalar::utf16::validate<endianness::BIG>(rest, len - consumed);
}
|
|
|
|
// Validate little-endian UTF-16 and report where validation stopped.
// When the NEON routine's reported count differs from `len`, the scalar
// validator is run from that offset and its count is rebased onto `buf`.
simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
  result simd_res = arm_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
  if (simd_res.count == len) { return simd_res; }

  const size_t offset = simd_res.count;
  result tail_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + offset, len - offset);
  return result(tail_res.error, offset + tail_res.count);
}
|
|
|
|
// Validate big-endian UTF-16 and report where validation stopped.
// When the NEON routine's reported count differs from `len`, the scalar
// validator is run from that offset and its count is rebased onto `buf`.
simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
  result simd_res = arm_validate_utf16_with_errors<endianness::BIG>(buf, len);
  if (simd_res.count == len) { return simd_res; }

  const size_t offset = simd_res.count;
  result tail_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + offset, len - offset);
  return result(tail_res.error, offset + tail_res.count);
}
|
|
|
|
// Validate `len` UTF-32 code units.
// arm_validate_utf32le returns a pointer to the part it did not handle, or a
// null pointer, in which case validation fails. The remainder is checked by
// the scalar validator.
simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
  const char32_t* rest = arm_validate_utf32le(buf, len);
  if (rest == nullptr) { return false; }

  const size_t consumed = static_cast<size_t>(rest - buf);
  return scalar::utf32::validate(rest, len - consumed);
}
|
|
|
|
// Validate UTF-32 and report where validation stopped.
// When the NEON routine's reported count differs from `len`, the scalar
// validator is run from that offset and its count is rebased onto `buf`.
simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
  result simd_res = arm_validate_utf32le_with_errors(buf, len);
  if (simd_res.count == len) { return simd_res; }

  const size_t offset = simd_res.count;
  result tail_res = scalar::utf32::validate_with_errors(buf + offset, len - offset);
  return result(tail_res.error, offset + tail_res.count);
}
|
|
|
|
// UTF-8 -> UTF-16LE conversion via a freshly constructed
// utf8_to_utf16::validating_transcoder; returns whatever its convert()
// returns.
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder transcoder;
  const size_t written = transcoder.convert<endianness::LITTLE>(buf, len, utf16_output);
  return written;
}
|
|
|
|
// UTF-8 -> UTF-16BE conversion via a freshly constructed
// utf8_to_utf16::validating_transcoder; returns whatever its convert()
// returns.
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder transcoder;
  const size_t written = transcoder.convert<endianness::BIG>(buf, len, utf16_output);
  return written;
}
|
|
|
|
// UTF-8 -> UTF-16LE conversion with error reporting, via a freshly
// constructed utf8_to_utf16::validating_transcoder; its `result` is returned
// unchanged.
simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder transcoder;
  result outcome = transcoder.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
  return outcome;
}
|
|
|
|
// UTF-8 -> UTF-16BE conversion with error reporting, via a freshly
// constructed utf8_to_utf16::validating_transcoder; its `result` is returned
// unchanged.
simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder transcoder;
  result outcome = transcoder.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
  return outcome;
}
|
|
|
|
// UTF-8 -> UTF-16LE conversion for the "valid input" entry point: forwards to
// utf8_to_utf16::convert_valid and returns its result.
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size,
    char16_t* utf16_output) const noexcept {
  const size_t written = utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size, utf16_output);
  return written;
}
|
|
|
|
// UTF-8 -> UTF-16BE conversion for the "valid input" entry point: forwards to
// utf8_to_utf16::convert_valid and returns its result.
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size,
    char16_t* utf16_output) const noexcept {
  const size_t written = utf8_to_utf16::convert_valid<endianness::BIG>(input, size, utf16_output);
  return written;
}
|
|
|
|
// UTF-8 -> UTF-32 conversion via a freshly constructed
// utf8_to_utf32::validating_transcoder; returns whatever its convert()
// returns.
simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder transcoder;
  const size_t written = transcoder.convert(buf, len, utf32_output);
  return written;
}
|
|
|
|
// UTF-8 -> UTF-32 conversion with error reporting, via a freshly constructed
// utf8_to_utf32::validating_transcoder; its `result` is returned unchanged.
simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder transcoder;
  result outcome = transcoder.convert_with_errors(buf, len, utf32_output);
  return outcome;
}
|
|
|
|
// UTF-8 -> UTF-32 conversion for the "valid input" entry point: forwards to
// utf8_to_utf32::convert_valid and returns its result.
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
    char32_t* utf32_output) const noexcept {
  const size_t written = utf8_to_utf32::convert_valid(input, size, utf32_output);
  return written;
}
|
|
|
|
// UTF-16LE -> UTF-8 conversion in two stages: the NEON routine first, then
// the scalar converter for whatever input the NEON routine left over.
// Returns the total number of chars written to utf8_output, or 0 when the
// NEON stage yields a null input position or the scalar stage returns 0.
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  std::pair<const char16_t*, char*> simd = arm_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
  const char16_t* in_pos = simd.first;
  char* out_pos = simd.second;
  if (in_pos == nullptr) { return 0; }

  const size_t simd_written = static_cast<size_t>(out_pos - utf8_output);
  if (in_pos == buf + len) { return simd_written; }  // nothing left for the scalar stage

  const size_t remaining = len - static_cast<size_t>(in_pos - buf);
  const size_t tail_written = scalar::utf16_to_utf8::convert<endianness::LITTLE>(in_pos, remaining, out_pos);
  return (tail_written == 0) ? 0 : simd_written + tail_written;
}
|
|
|
|
// UTF-16BE -> UTF-8 conversion in two stages: the NEON routine first, then
// the scalar converter for whatever input the NEON routine left over.
// Returns the total number of chars written to utf8_output, or 0 when the
// NEON stage yields a null input position or the scalar stage returns 0.
simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  std::pair<const char16_t*, char*> simd = arm_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
  const char16_t* in_pos = simd.first;
  char* out_pos = simd.second;
  if (in_pos == nullptr) { return 0; }

  const size_t simd_written = static_cast<size_t>(out_pos - utf8_output);
  if (in_pos == buf + len) { return simd_written; }  // nothing left for the scalar stage

  const size_t remaining = len - static_cast<size_t>(in_pos - buf);
  const size_t tail_written = scalar::utf16_to_utf8::convert<endianness::BIG>(in_pos, remaining, out_pos);
  return (tail_written == 0) ? 0 : simd_written + tail_written;
}
|
|
|
|
// UTF-16LE -> UTF-8 conversion with error reporting.
// The NEON stage reports a position in the input (not an output length) in
// its result's count. On error that result is returned as-is; on success the
// returned count is the number of chars written to utf8_output.
simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  std::pair<result, char*> simd = arm_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
  result status = simd.first;
  char* out_end = simd.second;
  // The NEON stage already carries the right input position for its errors.
  if (status.error) { return status; }

  const size_t consumed = status.count;
  if (consumed != len) {  // no error so far, but input remains
    result tail = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
        buf + consumed, len - consumed, out_end);
    if (tail.error) {
      tail.count += consumed;  // rebase the error position onto buf
      return tail;
    }
    out_end += tail.count;
  }
  status.count = out_end - utf8_output;  // number of 8-bit units written
  return status;
}
|
|
|
|
// UTF-16BE -> UTF-8 conversion with error reporting.
// The NEON stage reports a position in the input (not an output length) in
// its result's count. On error that result is returned as-is; on success the
// returned count is the number of chars written to utf8_output.
simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  std::pair<result, char*> simd = arm_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
  result status = simd.first;
  char* out_end = simd.second;
  // The NEON stage already carries the right input position for its errors.
  if (status.error) { return status; }

  const size_t consumed = status.count;
  if (consumed != len) {  // no error so far, but input remains
    result tail = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
        buf + consumed, len - consumed, out_end);
    if (tail.error) {
      tail.count += consumed;  // rebase the error position onto buf
      return tail;
    }
    out_end += tail.count;
  }
  status.count = out_end - utf8_output;  // number of 8-bit units written
  return status;
}
|
|
|
|
// "Valid input" UTF-16LE -> UTF-8 entry point. It has no dedicated path in
// this kernel and simply calls convert_utf16le_to_utf8.
simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  const size_t written = this->convert_utf16le_to_utf8(buf, len, utf8_output);
  return written;
}
|
|
|
|
// "Valid input" UTF-16BE -> UTF-8 entry point. It has no dedicated path in
// this kernel and simply calls convert_utf16be_to_utf8.
simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  const size_t written = this->convert_utf16be_to_utf8(buf, len, utf8_output);
  return written;
}
|
|
|
|
// UTF-32 -> UTF-8 conversion in two stages: the NEON routine first, then the
// scalar converter for whatever input the NEON routine left over.
// Returns the total number of chars written to utf8_output, or 0 when the
// NEON stage yields a null input position or the scalar stage returns 0.
simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
  std::pair<const char32_t*, char*> simd = arm_convert_utf32_to_utf8(buf, len, utf8_output);
  const char32_t* in_pos = simd.first;
  char* out_pos = simd.second;
  if (in_pos == nullptr) { return 0; }

  const size_t simd_written = static_cast<size_t>(out_pos - utf8_output);
  if (in_pos == buf + len) { return simd_written; }  // nothing left for the scalar stage

  const size_t remaining = len - static_cast<size_t>(in_pos - buf);
  const size_t tail_written = scalar::utf32_to_utf8::convert(in_pos, remaining, out_pos);
  return (tail_written == 0) ? 0 : simd_written + tail_written;
}
|
|
|
|
// UTF-32 -> UTF-8 conversion with error reporting.
// The NEON stage reports a position in the input (not an output length) in
// its result's count. There is no early return on the NEON stage's error
// field: whenever that position differs from `len`, the scalar converter is
// run from it and decides the outcome. On success the returned count is the
// number of chars written to utf8_output.
simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
  std::pair<result, char*> simd = arm_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
  result status = simd.first;
  char* out_end = simd.second;

  const size_t consumed = status.count;
  if (consumed != len) {
    result tail = scalar::utf32_to_utf8::convert_with_errors(
        buf + consumed, len - consumed, out_end);
    if (tail.error) {
      tail.count += consumed;  // rebase the error position onto buf
      return tail;
    }
    out_end += tail.count;
  }
  status.count = out_end - utf8_output;  // number of 8-bit units written
  return status;
}
|
|
|
|
// UTF-16LE -> UTF-32 conversion in two stages: the NEON routine first, then
// the scalar converter for whatever input the NEON routine left over.
// Returns the total number of char32_t units written to utf32_output, or 0
// when the NEON stage yields a null input position or the scalar stage
// returns 0.
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  std::pair<const char16_t*, char32_t*> simd = arm_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
  const char16_t* in_pos = simd.first;
  char32_t* out_pos = simd.second;
  if (in_pos == nullptr) { return 0; }

  const size_t simd_written = static_cast<size_t>(out_pos - utf32_output);
  if (in_pos == buf + len) { return simd_written; }  // nothing left for the scalar stage

  const size_t remaining = len - static_cast<size_t>(in_pos - buf);
  const size_t tail_written = scalar::utf16_to_utf32::convert<endianness::LITTLE>(in_pos, remaining, out_pos);
  return (tail_written == 0) ? 0 : simd_written + tail_written;
}
|
|
|
|
// UTF-16BE -> UTF-32 conversion in two stages: the NEON routine first, then
// the scalar converter for whatever input the NEON routine left over.
// Returns the total number of char32_t units written to utf32_output, or 0
// when the NEON stage yields a null input position or the scalar stage
// returns 0.
simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  std::pair<const char16_t*, char32_t*> simd = arm_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
  const char16_t* in_pos = simd.first;
  char32_t* out_pos = simd.second;
  if (in_pos == nullptr) { return 0; }

  const size_t simd_written = static_cast<size_t>(out_pos - utf32_output);
  if (in_pos == buf + len) { return simd_written; }  // nothing left for the scalar stage

  const size_t remaining = len - static_cast<size_t>(in_pos - buf);
  const size_t tail_written = scalar::utf16_to_utf32::convert<endianness::BIG>(in_pos, remaining, out_pos);
  return (tail_written == 0) ? 0 : simd_written + tail_written;
}
|
|
|
|
// UTF-16LE -> UTF-32 conversion with error reporting.
// The NEON stage reports a position in the input (not an output length) in
// its result's count. On error that result is returned as-is; on success the
// returned count is the number of char32_t units written to utf32_output.
simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  std::pair<result, char32_t*> simd = arm_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
  result status = simd.first;
  char32_t* out_end = simd.second;
  // The NEON stage already carries the right input position for its errors.
  if (status.error) { return status; }

  const size_t consumed = status.count;
  if (consumed != len) {  // no error so far, but input remains
    result tail = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
        buf + consumed, len - consumed, out_end);
    if (tail.error) {
      tail.count += consumed;  // rebase the error position onto buf
      return tail;
    }
    out_end += tail.count;
  }
  status.count = out_end - utf32_output;  // number of 32-bit units written
  return status;
}
|
|
|
|
// UTF-16BE -> UTF-32 conversion with error reporting.
// The NEON stage reports a position in the input (not an output length) in
// its result's count. On error that result is returned as-is; on success the
// returned count is the number of char32_t units written to utf32_output.
simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  std::pair<result, char32_t*> simd = arm_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
  result status = simd.first;
  char32_t* out_end = simd.second;
  // The NEON stage already carries the right input position for its errors.
  if (status.error) { return status; }

  const size_t consumed = status.count;
  if (consumed != len) {  // no error so far, but input remains
    result tail = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
        buf + consumed, len - consumed, out_end);
    if (tail.error) {
      tail.count += consumed;  // rebase the error position onto buf
      return tail;
    }
    out_end += tail.count;
  }
  status.count = out_end - utf32_output;  // number of 32-bit units written
  return status;
}
|
|
|
|
// "Valid input" UTF-32 -> UTF-8 entry point. It has no dedicated path in this
// kernel and simply calls convert_utf32_to_utf8.
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
  const size_t written = this->convert_utf32_to_utf8(buf, len, utf8_output);
  return written;
}
|
|
|
|
// UTF-32 -> UTF-16LE conversion in two stages: the NEON routine first, then
// the scalar converter for whatever input the NEON routine left over.
// Returns the total number of char16_t units written to utf16_output, or 0
// when the NEON stage yields a null input position or the scalar stage
// returns 0.
simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  std::pair<const char32_t*, char16_t*> simd = arm_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
  const char32_t* in_pos = simd.first;
  char16_t* out_pos = simd.second;
  if (in_pos == nullptr) { return 0; }

  const size_t simd_written = static_cast<size_t>(out_pos - utf16_output);
  if (in_pos == buf + len) { return simd_written; }  // nothing left for the scalar stage

  const size_t remaining = len - static_cast<size_t>(in_pos - buf);
  const size_t tail_written = scalar::utf32_to_utf16::convert<endianness::LITTLE>(in_pos, remaining, out_pos);
  return (tail_written == 0) ? 0 : simd_written + tail_written;
}
|
|
|
|
// UTF-32 -> UTF-16BE conversion in two stages: the NEON routine first, then
// the scalar converter for whatever input the NEON routine left over.
// Returns the total number of char16_t units written to utf16_output, or 0
// when the NEON stage yields a null input position or the scalar stage
// returns 0.
simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  std::pair<const char32_t*, char16_t*> simd = arm_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
  const char32_t* in_pos = simd.first;
  char16_t* out_pos = simd.second;
  if (in_pos == nullptr) { return 0; }

  const size_t simd_written = static_cast<size_t>(out_pos - utf16_output);
  if (in_pos == buf + len) { return simd_written; }  // nothing left for the scalar stage

  const size_t remaining = len - static_cast<size_t>(in_pos - buf);
  const size_t tail_written = scalar::utf32_to_utf16::convert<endianness::BIG>(in_pos, remaining, out_pos);
  return (tail_written == 0) ? 0 : simd_written + tail_written;
}
|
|
|
|
// UTF-32 -> UTF-16LE conversion with error reporting.
// The NEON stage reports a position in the input (not an output length) in
// its result's count. There is no early return on the NEON stage's error
// field: whenever that position differs from `len`, the scalar converter is
// run from it and decides the outcome. On success the returned count is the
// number of char16_t units written to utf16_output.
simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  std::pair<result, char16_t*> simd = arm_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
  result status = simd.first;
  char16_t* out_end = simd.second;

  const size_t consumed = status.count;
  if (consumed != len) {
    result tail = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
        buf + consumed, len - consumed, out_end);
    if (tail.error) {
      tail.count += consumed;  // rebase the error position onto buf
      return tail;
    }
    out_end += tail.count;
  }
  status.count = out_end - utf16_output;  // number of 16-bit units written
  return status;
}
|
|
|
|
// UTF-32 -> UTF-16BE conversion with error reporting.
// The NEON stage reports a position in the input (not an output length) in
// its result's count. There is no early return on the NEON stage's error
// field: whenever that position differs from `len`, the scalar converter is
// run from it and decides the outcome. On success the returned count is the
// number of char16_t units written to utf16_output.
simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  std::pair<result, char16_t*> simd = arm_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
  result status = simd.first;
  char16_t* out_end = simd.second;

  const size_t consumed = status.count;
  if (consumed != len) {
    result tail = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
        buf + consumed, len - consumed, out_end);
    if (tail.error) {
      tail.count += consumed;  // rebase the error position onto buf
      return tail;
    }
    out_end += tail.count;
  }
  status.count = out_end - utf16_output;  // number of 16-bit units written
  return status;
}
|
|
|
|
// "Valid input" UTF-32 -> UTF-16LE entry point. It has no dedicated path in
// this kernel and simply calls convert_utf32_to_utf16le.
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  const size_t written = this->convert_utf32_to_utf16le(buf, len, utf16_output);
  return written;
}
|
|
|
|
// "Valid input" UTF-32 -> UTF-16BE entry point. It has no dedicated path in
// this kernel and simply calls convert_utf32_to_utf16be.
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  const size_t written = this->convert_utf32_to_utf16be(buf, len, utf16_output);
  return written;
}
|
|
|
|
// "Valid input" UTF-16LE -> UTF-32 entry point. It has no dedicated path in
// this kernel and simply calls convert_utf16le_to_utf32.
simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  const size_t written = this->convert_utf16le_to_utf32(buf, len, utf32_output);
  return written;
}
|
|
|
|
// "Valid input" UTF-16BE -> UTF-32 entry point. It has no dedicated path in
// this kernel and simply calls convert_utf16be_to_utf32.
simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  const size_t written = this->convert_utf16be_to_utf32(buf, len, utf32_output);
  return written;
}
|
|
|
|
// Byte-swap `length` UTF-16 code units from `input` into `output` by
// delegating to utf16::change_endianness_utf16 (resolved from inside the
// arm64 namespace).
void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
  utf16::change_endianness_utf16(input, length, output);
}
|
|
|
|
// Count code points in little-endian UTF-16 by delegating to
// utf16::count_code_points.
simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
  const size_t code_points = utf16::count_code_points<endianness::LITTLE>(input, length);
  return code_points;
}
|
|
|
|
// Count code points in big-endian UTF-16 by delegating to
// utf16::count_code_points.
simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
  const size_t code_points = utf16::count_code_points<endianness::BIG>(input, length);
  return code_points;
}
|
|
|
|
// Count code points in UTF-8 by delegating to utf8::count_code_points.
simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
  const size_t code_points = utf8::count_code_points(input, length);
  return code_points;
}
|
|
|
|
// UTF-8 length needed for a little-endian UTF-16 input, as computed by
// utf16::utf8_length_from_utf16.
simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
  const size_t utf8_len = utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
  return utf8_len;
}
|
|
|
|
// UTF-8 length needed for a big-endian UTF-16 input, as computed by
// utf16::utf8_length_from_utf16.
simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
  const size_t utf8_len = utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
  return utf8_len;
}
|
|
|
|
// UTF-32 length needed for a little-endian UTF-16 input, as computed by
// utf16::utf32_length_from_utf16.
simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
  const size_t utf32_len = utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
  return utf32_len;
}
|
|
|
|
// UTF-32 length needed for a big-endian UTF-16 input, as computed by
// utf16::utf32_length_from_utf16.
simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
  const size_t utf32_len = utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
  return utf32_len;
}
|
|
|
|
// UTF-16 length needed for a UTF-8 input, as computed by
// utf8::utf16_length_from_utf8.
simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
  const size_t utf16_len = utf8::utf16_length_from_utf8(input, length);
  return utf16_len;
}
|
|
|
|
// Number of UTF-8 bytes needed to encode `length` UTF-32 code units.
// Four code units are classified per NEON iteration; the remaining (< 4)
// units go through scalar::utf32::utf8_length_from_utf32.
simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
  // Upper bounds of the 1-, 2- and 3-byte UTF-8 ranges.
  const uint32x4_t max_1byte = vmovq_n_u32(static_cast<uint32_t>(0x7f));
  const uint32x4_t max_2byte = vmovq_n_u32(static_cast<uint32_t>(0x7ff));
  const uint32x4_t max_3byte = vmovq_n_u32(static_cast<uint32_t>(0xffff));
  const uint32x4_t lane_one = vmovq_n_u32(static_cast<uint32_t>(0x1));

  size_t total = 0;
  size_t idx = 0;
  while (idx + 4 <= length) {
    const uint32x4_t chunk = vld1q_u32(reinterpret_cast<const uint32_t *>(input + idx));

    // Cumulative range masks (all-ones lanes where the comparison holds).
    const uint32x4_t le_1byte = vcleq_u32(chunk, max_1byte);
    const uint32x4_t le_2byte = vcleq_u32(chunk, max_2byte);
    const uint32x4_t le_3byte = vcleq_u32(chunk, max_3byte);
    // XOR of nested ranges leaves only the lanes in the outer-minus-inner band.
    const uint32x4_t only_2byte = veorq_u32(le_2byte, le_1byte);
    const uint32x4_t only_3byte = veorq_u32(le_3byte, le_2byte);

    // Collapse each mask to a single 1 bit per matching lane.
    const uint16x8_t flag_1byte = vreinterpretq_u16_u32(vandq_u32(le_1byte, lane_one));
    const uint16x8_t flag_2byte = vreinterpretq_u16_u32(vandq_u32(only_2byte, lane_one));
    const uint16x8_t flag_3byte = vreinterpretq_u16_u32(vandq_u32(only_3byte, lane_one));

    // Pairwise adds pack the 1-byte flags into the low 64 bits and the 2-byte
    // flags into the high 64 bits; the 3-byte flags get a vector of their own.
    const uint16x8_t packed_1_and_2 = vpaddq_u16(flag_1byte, flag_2byte);
    const uint16x8_t packed_3 = vpaddq_u16(flag_3byte, flag_3byte);

    const size_t n_1byte = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(packed_1_and_2), 0));
    const size_t n_2byte = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(packed_1_and_2), 1));
    const size_t n_3byte = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(packed_3), 0));

    // Start from 4 bytes for each of the 4 lanes, then subtract the savings.
    total += 16 - 3*n_1byte - 2*n_2byte - n_3byte;
    idx += 4;
  }
  return total + scalar::utf32::utf8_length_from_utf32(input + idx, length - idx);
}
|
|
|
|
// Number of UTF-16 code units needed to encode `length` UTF-32 code units.
// Four code units are examined per NEON iteration; the remaining (< 4) units
// go through scalar::utf32::utf16_length_from_utf32.
simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
  const uint32x4_t max_single_unit = vmovq_n_u32(static_cast<uint32_t>(0xffff));
  const uint32x4_t lane_one = vmovq_n_u32(static_cast<uint32_t>(0x1));

  size_t total = 0;
  size_t idx = 0;
  while (idx + 4 <= length) {
    const uint32x4_t chunk = vld1q_u32(reinterpret_cast<const uint32_t *>(input + idx));
    // Lanes above 0xffff need a surrogate pair, i.e. one extra code unit.
    const uint32x4_t needs_pair = vcgtq_u32(chunk, max_single_unit);
    const uint16x8_t pair_flags = vreinterpretq_u16_u32(vandq_u32(needs_pair, lane_one));
    // Pairwise add packs the four per-lane flags into the low 64 bits.
    const uint16x8_t packed_flags = vpaddq_u16(pair_flags, pair_flags);
    const size_t pair_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(packed_flags), 0));
    total += 4 + pair_count;
    idx += 4;
  }
  return total + scalar::utf32::utf16_length_from_utf32(input + idx, length - idx);
}
|
|
|
|
// UTF-32 length needed for a UTF-8 input, as computed by
// utf8::utf32_length_from_utf8.
simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
  const size_t utf32_len = utf8::utf32_length_from_utf8(input, length);
  return utf32_len;
}
|
|
|
|
} // namespace arm64
|
|
} // namespace simdutf
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/end.h
|
|
/* begin file src/simdutf/arm64/end.h */
|
|
/* end file src/simdutf/arm64/end.h */
|
|
/* end file src/arm64/implementation.cpp */
|
|
#endif
|
|
#if SIMDUTF_IMPLEMENTATION_FALLBACK
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=fallback/implementation.cpp
|
|
/* begin file src/fallback/implementation.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/begin.h
|
|
/* begin file src/simdutf/fallback/begin.h */
|
|
// redefining SIMDUTF_IMPLEMENTATION to "fallback"
|
|
// #define SIMDUTF_IMPLEMENTATION fallback
|
|
/* end file src/simdutf/fallback/begin.h */
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
namespace simdutf {
|
|
namespace fallback {
|
|
|
|
// Guess which encodings `input` (length bytes) could be.
// A byte-order mark, when present, decides the answer on its own. Otherwise
// the result is a bitwise OR of every encoding_type whose validator accepts
// the input; UTF-16LE is only tried when the byte count is a multiple of 2
// and UTF-32LE only when it is a multiple of 4.
simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
  // A BOM is trusted unconditionally.
  const auto from_bom = simdutf::BOM::check_bom(input, length);
  if (from_bom != encoding_type::unspecified) { return from_bom; }

  int candidates = 0;
  if (validate_utf8(input, length)) {
    candidates |= encoding_type::UTF8;
  }
  // Short-circuiting keeps each validator from running on an unsuitable length.
  if ((length % 2) == 0 && validate_utf16le(reinterpret_cast<const char16_t*>(input), length/2)) {
    candidates |= encoding_type::UTF16_LE;
  }
  if ((length % 4) == 0 && validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) {
    candidates |= encoding_type::UTF32_LE;
  }
  return candidates;
}
|
|
|
|
// Fallback kernel: UTF-8 validation is delegated to scalar::utf8::validate.
simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  const bool is_valid = scalar::utf8::validate(buf, len);
  return is_valid;
}
|
|
|
|
// Fallback kernel: UTF-8 validation with error reporting is delegated to
// scalar::utf8::validate_with_errors.
simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
  result outcome = scalar::utf8::validate_with_errors(buf, len);
  return outcome;
}
|
|
|
|
// Fallback kernel: ASCII validation is delegated to scalar::ascii::validate.
simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
  const bool is_ascii = scalar::ascii::validate(buf, len);
  return is_ascii;
}
|
|
|
|
// Fallback kernel: ASCII validation with error reporting is delegated to
// scalar::ascii::validate_with_errors.
simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
  result outcome = scalar::ascii::validate_with_errors(buf, len);
  return outcome;
}
|
|
|
|
// Fallback kernel: UTF-16LE validation is delegated to
// scalar::utf16::validate.
simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
  const bool is_valid = scalar::utf16::validate<endianness::LITTLE>(buf, len);
  return is_valid;
}
|
|
|
|
// Fallback kernel: UTF-16BE validation is delegated to
// scalar::utf16::validate.
simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
  const bool is_valid = scalar::utf16::validate<endianness::BIG>(buf, len);
  return is_valid;
}
|
|
|
|
// Fallback kernel: UTF-16LE validation with error reporting is delegated to
// scalar::utf16::validate_with_errors.
simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
  result outcome = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf, len);
  return outcome;
}
|
|
|
|
// Fallback kernel: UTF-16BE validation with error reporting is delegated to
// scalar::utf16::validate_with_errors.
simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
  result outcome = scalar::utf16::validate_with_errors<endianness::BIG>(buf, len);
  return outcome;
}
|
|
|
|
// Fallback kernel: UTF-32 validation is delegated to scalar::utf32::validate.
simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
  const bool is_valid = scalar::utf32::validate(buf, len);
  return is_valid;
}
|
|
|
|
// Fallback kernel: UTF-32 validation with error reporting is delegated to
// scalar::utf32::validate_with_errors.
simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
  result outcome = scalar::utf32::validate_with_errors(buf, len);
  return outcome;
}
|
|
|
|
// Fallback kernel: UTF-8 -> UTF-16LE is delegated to
// scalar::utf8_to_utf16::convert.
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
  const size_t written = scalar::utf8_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
  return written;
}
|
|
|
|
// Fallback kernel: UTF-8 -> UTF-16BE is delegated to
// scalar::utf8_to_utf16::convert.
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
  const size_t written = scalar::utf8_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
  return written;
}
|
|
|
|
// Fallback kernel: UTF-8 -> UTF-16LE with error reporting is delegated to
// scalar::utf8_to_utf16::convert_with_errors.
simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
  result outcome = scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
  return outcome;
}
|
|
|
|
// Fallback kernel: UTF-8 -> UTF-16BE with error reporting is delegated to
// scalar::utf8_to_utf16::convert_with_errors.
simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
  result outcome = scalar::utf8_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
  return outcome;
}
|
|
|
|
// Fallback kernel: "valid input" UTF-8 -> UTF-16LE is delegated to
// scalar::utf8_to_utf16::convert_valid.
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
  const size_t written = scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
  return written;
}
|
|
|
|
// Fallback kernel: "valid input" UTF-8 -> UTF-16BE is delegated to
// scalar::utf8_to_utf16::convert_valid.
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
  const size_t written = scalar::utf8_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
  return written;
}
|
|
|
|
// Fallback kernel: UTF-8 -> UTF-32 is delegated to
// scalar::utf8_to_utf32::convert.
simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
  const size_t written = scalar::utf8_to_utf32::convert(buf, len, utf32_output);
  return written;
}
|
|
|
|
// Fallback kernel: UTF-8 -> UTF-32 with error reporting is delegated to
// scalar::utf8_to_utf32::convert_with_errors.
simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
  result outcome = scalar::utf8_to_utf32::convert_with_errors(buf, len, utf32_output);
  return outcome;
}
|
|
|
|
// Fallback kernel: "valid input" UTF-8 -> UTF-32 is delegated to
// scalar::utf8_to_utf32::convert_valid.
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
    char32_t* utf32_output) const noexcept {
  const size_t written = scalar::utf8_to_utf32::convert_valid(input, size, utf32_output);
  return written;
}
|
|
|
|
// Fallback kernel: UTF-16LE -> UTF-8 is delegated to
// scalar::utf16_to_utf8::convert.
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  const size_t written = scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len, utf8_output);
  return written;
}
|
|
|
|
// Fallback kernel: UTF-16BE -> UTF-8 is delegated to
// scalar::utf16_to_utf8::convert.
simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  const size_t written = scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output);
  return written;
}
|
|
|
|
// Fallback kernel: UTF-16LE -> UTF-8 with error reporting is delegated to
// scalar::utf16_to_utf8::convert_with_errors.
simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  result outcome = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf, len, utf8_output);
  return outcome;
}
|
|
|
|
// Fallback kernel: UTF-16BE -> UTF-8 with error reporting is delegated to
// scalar::utf16_to_utf8::convert_with_errors.
simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  result outcome = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf, len, utf8_output);
  return outcome;
}
|
|
|
|
// Fallback kernel: "valid input" UTF-16LE -> UTF-8 is delegated to
// scalar::utf16_to_utf8::convert_valid.
simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  const size_t written = scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len, utf8_output);
  return written;
}
|
|
|
|
// Fallback kernel: "valid input" UTF-16BE -> UTF-8 is delegated to
// scalar::utf16_to_utf8::convert_valid.
simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  const size_t written = scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len, utf8_output);
  return written;
}
|
|
|
|
// Fallback kernel: UTF-32 -> UTF-8 is delegated to
// scalar::utf32_to_utf8::convert.
simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
  const size_t written = scalar::utf32_to_utf8::convert(buf, len, utf8_output);
  return written;
}
|
|
|
|
// Fallback kernel: UTF-32 -> UTF-8 with error reporting is delegated to
// scalar::utf32_to_utf8::convert_with_errors.
simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
  result outcome = scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
  return outcome;
}
|
|
|
|
// Fallback kernel: "valid input" UTF-32 -> UTF-8 is delegated to
// scalar::utf32_to_utf8::convert_valid.
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
  const size_t written = scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
  return written;
}
|
|
|
|
// Fallback kernel: UTF-32 -> UTF-16LE is delegated to
// scalar::utf32_to_utf16::convert.
simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  const size_t written = scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
  return written;
}
|
|
|
|
// Fallback kernel: UTF-32 -> UTF-16BE is delegated to
// scalar::utf32_to_utf16::convert.
simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  const size_t written = scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
  return written;
}
|
|
|
|
// Fallback kernel: UTF-32 -> UTF-16LE with error reporting is delegated to
// scalar::utf32_to_utf16::convert_with_errors.
simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  result outcome = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
  return outcome;
}
|
|
|
|
// Fallback kernel: UTF-32 -> UTF-16BE with error reporting is delegated to
// scalar::utf32_to_utf16::convert_with_errors.
simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  result outcome = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
  return outcome;
}
|
|
|
|
// Fallback kernel: "valid input" UTF-32 -> UTF-16LE is delegated to
// scalar::utf32_to_utf16::convert_valid.
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  const size_t written = scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
  return written;
}
|
|
|
|
// Fallback kernel: "valid input" UTF-32 -> UTF-16BE is delegated to
// scalar::utf32_to_utf16::convert_valid.
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  const size_t written = scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
  return written;
}
|
|
|
|
// Fallback kernel: UTF-16LE -> UTF-32 is delegated to
// scalar::utf16_to_utf32::convert.
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  const size_t written = scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len, utf32_output);
  return written;
}
|
|
|
|
// Fallback kernel: UTF-16BE -> UTF-32 is delegated to
// scalar::utf16_to_utf32::convert.
simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  const size_t written = scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len, utf32_output);
  return written;
}
|
|
|
|
// Fallback kernel: UTF-16LE -> UTF-32 with error reporting is delegated to
// scalar::utf16_to_utf32::convert_with_errors.
simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  result outcome = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(buf, len, utf32_output);
  return outcome;
}
|
|
|
|
// Fallback kernel: UTF-16BE -> UTF-32 with error reporting is delegated to
// scalar::utf16_to_utf32::convert_with_errors.
simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  result outcome = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(buf, len, utf32_output);
  return outcome;
}
|
|
|
|
// Fallback kernel: "valid input" UTF-16LE -> UTF-32 is delegated to
// scalar::utf16_to_utf32::convert_valid.
simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  const size_t written = scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(buf, len, utf32_output);
  return written;
}
|
|
|
|
// Fallback kernel: "valid input" UTF-16BE -> UTF-32 is delegated to
// scalar::utf16_to_utf32::convert_valid.
simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  const size_t written = scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len, utf32_output);
  return written;
}
|
|
|
|
// Fallback kernel: UTF-16 endianness change.
// Forwards (input, length, output) unchanged to the scalar routine
// scalar::utf16::change_endianness_utf16; nothing is returned.
void implementation::change_endianness_utf16(
    const char16_t * input,
    size_t length,
    char16_t * output) const noexcept {
  scalar::utf16::change_endianness_utf16(
      input,
      length,
      output);
}
|
|
|
|
// Fallback kernel: code point count of a little-endian UTF-16 buffer.
// Returns the value of scalar::utf16::count_code_points<endianness::LITTLE>
// for (input, length).
simdutf_warn_unused size_t implementation::count_utf16le(
    const char16_t * input, size_t length) const noexcept {
  const size_t code_points =
      scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
  return code_points;
}
|
|
|
|
// Fallback kernel: code point count of a big-endian UTF-16 buffer.
// Returns the value of scalar::utf16::count_code_points<endianness::BIG>
// for (input, length).
simdutf_warn_unused size_t implementation::count_utf16be(
    const char16_t * input, size_t length) const noexcept {
  const size_t code_points =
      scalar::utf16::count_code_points<endianness::BIG>(input, length);
  return code_points;
}
|
|
|
|
// Fallback kernel: code point count of a UTF-8 buffer.
// Returns the value of scalar::utf8::count_code_points for (input, length).
simdutf_warn_unused size_t implementation::count_utf8(
    const char * input, size_t length) const noexcept {
  const size_t code_points = scalar::utf8::count_code_points(input, length);
  return code_points;
}
|
|
|
|
// Fallback kernel: UTF-8 length of a little-endian UTF-16 buffer.
// Returns the value of scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>
// for (input, length).
simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
    const char16_t * input, size_t length) const noexcept {
  const size_t utf8_length =
      scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
  return utf8_length;
}
|
|
|
|
// Fallback kernel: UTF-8 length of a big-endian UTF-16 buffer.
// Returns the value of scalar::utf16::utf8_length_from_utf16<endianness::BIG>
// for (input, length).
simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
    const char16_t * input, size_t length) const noexcept {
  const size_t utf8_length =
      scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
  return utf8_length;
}
|
|
|
|
// Fallback kernel: UTF-32 length of a little-endian UTF-16 buffer.
// Returns the value of scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>
// for (input, length).
simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
    const char16_t * input, size_t length) const noexcept {
  const size_t utf32_length =
      scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
  return utf32_length;
}
|
|
|
|
// Fallback kernel: UTF-32 length of a big-endian UTF-16 buffer.
// Returns the value of scalar::utf16::utf32_length_from_utf16<endianness::BIG>
// for (input, length).
simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
    const char16_t * input, size_t length) const noexcept {
  const size_t utf32_length =
      scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
  return utf32_length;
}
|
|
|
|
// Fallback kernel: UTF-16 length of a UTF-8 buffer.
// Returns the value of scalar::utf8::utf16_length_from_utf8 for
// (input, length).
simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
    const char * input, size_t length) const noexcept {
  const size_t utf16_length = scalar::utf8::utf16_length_from_utf8(input, length);
  return utf16_length;
}
|
|
|
|
// Fallback kernel: UTF-8 length of a UTF-32 buffer.
// Returns the value of scalar::utf32::utf8_length_from_utf32 for
// (input, length).
simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
    const char32_t * input, size_t length) const noexcept {
  const size_t utf8_length = scalar::utf32::utf8_length_from_utf32(input, length);
  return utf8_length;
}
|
|
|
|
// Fallback kernel: UTF-16 length of a UTF-32 buffer.
// Returns the value of scalar::utf32::utf16_length_from_utf32 for
// (input, length).
simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
    const char32_t * input, size_t length) const noexcept {
  const size_t utf16_length = scalar::utf32::utf16_length_from_utf32(input, length);
  return utf16_length;
}
|
|
|
|
// Fallback kernel: UTF-32 length of a UTF-8 buffer.
// This is computed as the code point count: the value of
// scalar::utf8::count_code_points for (input, length) is returned.
simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
    const char * input, size_t length) const noexcept {
  const size_t code_points = scalar::utf8::count_code_points(input, length);
  return code_points;
}
|
|
|
|
} // namespace fallback
|
|
} // namespace simdutf
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/end.h
|
|
/* begin file src/simdutf/fallback/end.h */
|
|
/* end file src/simdutf/fallback/end.h */
|
|
/* end file src/fallback/implementation.cpp */
|
|
#endif
|
|
#if SIMDUTF_IMPLEMENTATION_ICELAKE
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/implementation.cpp
|
|
/* begin file src/icelake/implementation.cpp */
|
|
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/begin.h
|
|
/* begin file src/simdutf/icelake/begin.h */
|
|
// redefining SIMDUTF_IMPLEMENTATION to "icelake"
|
|
// #define SIMDUTF_IMPLEMENTATION icelake
|
|
|
|
#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
|
|
// nothing needed.
|
|
#else
|
|
SIMDUTF_TARGET_ICELAKE
|
|
#endif
|
|
|
|
#if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
|
|
SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
|
|
#endif // end of workaround
|
|
/* end file src/simdutf/icelake/begin.h */
|
|
namespace simdutf {
|
|
namespace icelake {
|
|
namespace {
|
|
#ifndef SIMDUTF_ICELAKE_H
|
|
#error "icelake.h must be included"
|
|
#endif
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_utf8_common.inl.cpp
|
|
/* begin file src/icelake/icelake_utf8_common.inl.cpp */
|
|
// Common procedures for both validating and non-validating conversions from UTF-8.
|
|
// Compile-time selector for the block-processing routines below.
// SIMDUTF_FULL: the full 64-byte input block may be read and used.
// SIMDUTF_TAIL: only the leading 'gap' bytes of the block are relevant.
enum block_processing_mode { SIMDUTF_FULL, SIMDUTF_TAIL};
|
|
|
|
// Pair of (UTF-8 input pointer, UTF-16 output pointer).
// NOTE(review): presumably the positions reached by a conversion routine -- confirm at the use sites.
using utf8_to_utf16_result = std::pair<const char*, char16_t*>;
|
|
// Pair of (UTF-8 input pointer, 32-bit output pointer); same caveat as above.
using utf8_to_utf32_result = std::pair<const char*, uint32_t*>;
|
|
|
|
/*
|
|
process_block_utf8_to_utf16 converts up to 64 bytes from 'in' from UTF-8
|
|
to UTF-16. When tail = SIMDUTF_FULL, then the full input buffer (64 bytes)
|
|
might be used. When tail = SIMDUTF_TAIL, we take into account 'gap' which
|
|
indicates how many input bytes are relevant.
|
|
|
|
Returns true when the result is correct, otherwise it returns false.
|
|
|
|
The provided in and out pointers are advanced according to how many input
|
|
bytes have been processed, upon success.
|
|
*/
|
|
template <block_processing_mode tail, endianness big_endian>
|
|
simdutf_really_inline bool process_block_utf8_to_utf16(const char *&in, char16_t *&out, size_t gap) {
|
|
// constants
|
|
__m512i mask_identity = _mm512_set_epi8(63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
|
__m512i mask_c0c0c0c0 = _mm512_set1_epi32(0xc0c0c0c0);
|
|
__m512i mask_80808080 = _mm512_set1_epi32(0x80808080);
|
|
__m512i mask_f0f0f0f0 = _mm512_set1_epi32(0xf0f0f0f0);
|
|
__m512i mask_dfdfdfdf_tail = _mm512_set_epi64(0xffffdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf);
|
|
__m512i mask_c2c2c2c2 = _mm512_set1_epi32(0xc2c2c2c2);
|
|
__m512i mask_ffffffff = _mm512_set1_epi32(0xffffffff);
|
|
__m512i mask_d7c0d7c0 = _mm512_set1_epi32(0xd7c0d7c0);
|
|
__m512i mask_dc00dc00 = _mm512_set1_epi32(0xdc00dc00);
|
|
__m512i byteflip = _mm512_setr_epi64(
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809
|
|
);
|
|
// Note that 'tail' is a compile-time constant !
|
|
__mmask64 b = (tail == SIMDUTF_FULL) ? 0xFFFFFFFFFFFFFFFF : (uint64_t(1) << gap) - 1;
|
|
__m512i input = (tail == SIMDUTF_FULL) ? _mm512_loadu_si512(in) : _mm512_maskz_loadu_epi8(b, in);
|
|
__mmask64 m1 = (tail == SIMDUTF_FULL) ? _mm512_cmplt_epu8_mask(input, mask_80808080) : _mm512_mask_cmplt_epu8_mask(b, input, mask_80808080);
|
|
if(_ktestc_mask64_u8(m1, b)) {// NOT(m1) AND b -- if all zeroes, then all ASCII
|
|
// alternatively, we could do 'if (m1 == b) { '
|
|
if (tail == SIMDUTF_FULL) {
|
|
in += 64; // consumed 64 bytes
|
|
// we convert a full 64-byte block, writing 128 bytes.
|
|
__m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
|
|
if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); }
|
|
_mm512_storeu_si512(out, input1);
|
|
out += 32;
|
|
__m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
|
|
if(big_endian) { input2 = _mm512_shuffle_epi8(input2, byteflip); }
|
|
_mm512_storeu_si512(out, input2);
|
|
out += 32;
|
|
return true; // we are done
|
|
} else {
|
|
in += gap;
|
|
if (gap <= 32) {
|
|
__m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
|
|
if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); }
|
|
_mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << (gap)) - 1), input1);
|
|
out += gap;
|
|
} else {
|
|
__m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
|
|
if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); }
|
|
_mm512_storeu_si512(out, input1);
|
|
out += 32;
|
|
__m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
|
|
if(big_endian) { input2 = _mm512_shuffle_epi8(input2, byteflip); }
|
|
_mm512_mask_storeu_epi16(out, __mmask32((uint32_t(1) << (gap - 32)) - 1), input2);
|
|
out += gap - 32;
|
|
}
|
|
return true; // we are done
|
|
}
|
|
}
|
|
// classify characters further
|
|
__mmask64 m234 = _mm512_cmp_epu8_mask(mask_c0c0c0c0, input,
|
|
_MM_CMPINT_LE); // 0xc0 <= input, 2, 3, or 4 leading byte
|
|
__mmask64 m34 = _mm512_cmp_epu8_mask(mask_dfdfdfdf_tail, input,
|
|
_MM_CMPINT_LT); // 0xdf < input, 3 or 4 leading byte
|
|
|
|
__mmask64 milltwobytes = _mm512_mask_cmp_epu8_mask(m234, input, mask_c2c2c2c2,
|
|
_MM_CMPINT_LT); // 0xc0 <= input < 0xc2 (illegal two byte sequence)
|
|
// Overlong 2-byte sequence
|
|
if (_ktestz_mask64_u8(milltwobytes, milltwobytes) == 0) {
|
|
// Overlong 2-byte sequence
|
|
return false;
|
|
}
|
|
if (_ktestz_mask64_u8(m34, m34) == 0) {
|
|
// We have a 3-byte sequence and/or a 2-byte sequence, or possibly even a 4-byte sequence!
|
|
__mmask64 m4 = _mm512_cmp_epu8_mask(input, mask_f0f0f0f0,
|
|
_MM_CMPINT_NLT); // 0xf0 <= zmm0 (4 byte start bytes)
|
|
|
|
__mmask64 mask_not_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m1) : _kand_mask64(_knot_mask64(m1), b);
|
|
|
|
__mmask64 mp1 = _kshiftli_mask64(m234, 1);
|
|
__mmask64 mp2 = _kshiftli_mask64(m34, 2);
|
|
// We could do it as follows...
|
|
// if (_kortestz_mask64_u8(m4,m4)) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes
|
|
// but GCC generates better code when we do:
|
|
if (m4 == 0) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes
|
|
// Fast path with 1,2,3 bytes
|
|
__mmask64 mc = _kor_mask64(mp1, mp2); // expected continuation bytes
|
|
__mmask64 m1234 = _kor_mask64(m1, m234);
|
|
// mismatched continuation bytes:
|
|
if (tail == SIMDUTF_FULL) {
|
|
__mmask64 xnormcm1234 = _kxnor_mask64(mc, m1234); // XNOR of mc and m1234 should be all zero if they differ
|
|
// the presence of a 1 bit indicates that they overlap.
|
|
// _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1 if all zeroes.
|
|
if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { return false; }
|
|
} else {
|
|
__mmask64 bxorm1234 = _kxor_mask64(b, m1234);
|
|
if (mc != bxorm1234) { return false; }
|
|
}
|
|
// mend: identifying the last bytes of each sequence to be decoded
|
|
__mmask64 mend = _kshiftri_mask64(m1234, 1);
|
|
if (tail != SIMDUTF_FULL) {
|
|
mend = _kor_mask64(mend, (uint64_t(1) << (gap - 1)));
|
|
}
|
|
|
|
|
|
__m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
|
|
__m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));
|
|
|
|
__m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000 other: 11000000
|
|
__m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input); // high two bits cleared where not ASCII
|
|
__m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16,
|
|
clearedbytes); // the last byte of each character
|
|
|
|
__mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1); // bytes that precede non-ASCII bytes
|
|
__m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
|
|
__m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
|
|
__m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes,
|
|
beforeasciibytes); // the second last bytes (of two, three byte seq,
|
|
// surrogates)
|
|
secondlastbytes = _mm512_slli_epi16(secondlastbytes, 6); // shifted into position
|
|
|
|
__m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff,
|
|
indexofsecondlastbytes); // indices of the second last bytes
|
|
__m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34,
|
|
clearedbytes); // only those that are the third last byte of a sequece
|
|
__m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes,
|
|
thirdlastbyte); // the third last bytes (of three byte sequences, hi
|
|
// surrogate)
|
|
thirdlastbytes = _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position
|
|
__m512i Wout = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, thirdlastbytes, 254);
|
|
// the elements of Wout excluding the last element if it happens to be a high surrogate:
|
|
|
|
__mmask64 mprocessed = (tail == SIMDUTF_FULL) ? _pdep_u64(0xFFFFFFFF, mend) : _pdep_u64(0xFFFFFFFF, _kand_mask64(mend, b)); // we adjust mend at the end of the output.
|
|
|
|
|
|
// Encodings out of range...
|
|
{
|
|
// the location of 3-byte sequence start bytes in the input
|
|
__mmask64 m3 = m34 & (b ^ m4);
|
|
// words in Wout corresponding to 3-byte sequences.
|
|
__mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
|
|
__m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
|
|
__mmask32 Msmall800 = _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
|
|
__m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
|
|
__m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
|
|
__mmask32 M3s = _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
|
|
if (_kor_mask32(Msmall800, M3s)) { return false; }
|
|
}
|
|
int64_t nout = _mm_popcnt_u64(mprocessed);
|
|
in += 64 - _lzcnt_u64(mprocessed);
|
|
if(big_endian) { Wout = _mm512_shuffle_epi8(Wout, byteflip); }
|
|
_mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
|
|
out += nout;
|
|
return true; // ok
|
|
}
|
|
//
|
|
// We have a 4-byte sequence, this is the general case.
|
|
// Slow!
|
|
__mmask64 mp3 = _kshiftli_mask64(m4, 3);
|
|
__mmask64 mc = _kor_mask64(_kor_mask64(mp1, mp2), mp3); // expected continuation bytes
|
|
__mmask64 m1234 = _kor_mask64(m1, m234);
|
|
|
|
// mend: identifying the last bytes of each sequence to be decoded
|
|
__mmask64 mend = _kor_mask64(_kshiftri_mask64(_kor_mask64(mp3, m1234), 1), mp3);
|
|
if (tail != SIMDUTF_FULL) {
|
|
mend = _kor_mask64(mend, __mmask64(uint64_t(1) << (gap - 1)));
|
|
}
|
|
__m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
|
|
__m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));
|
|
|
|
__m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000 other: 11000000
|
|
__m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input); // high two bits cleared where not ASCII
|
|
__m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16,
|
|
clearedbytes); // the last byte of each character
|
|
|
|
__mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1); // bytes that precede non-ASCII bytes
|
|
__m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
|
|
__m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
|
|
__m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes,
|
|
beforeasciibytes); // the second last bytes (of two, three byte seq,
|
|
// surrogates)
|
|
secondlastbytes = _mm512_slli_epi16(secondlastbytes, 6); // shifted into position
|
|
|
|
__m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff,
|
|
indexofsecondlastbytes); // indices of the second last bytes
|
|
__m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34,
|
|
clearedbytes); // only those that are the third last byte of a sequece
|
|
__m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes,
|
|
thirdlastbyte); // the third last bytes (of three byte sequences, hi
|
|
// surrogate)
|
|
thirdlastbytes = _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position
|
|
__m512i thirdsecondandlastbytes = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, thirdlastbytes, 254);
|
|
uint64_t Mlo_uint64 = _pext_u64(mp3, mend);
|
|
__mmask32 Mlo = __mmask32(Mlo_uint64);
|
|
__mmask32 Mhi = __mmask32(Mlo_uint64 >> 1);
|
|
__m512i lo_surr_mask = _mm512_maskz_mov_epi16(Mlo,
|
|
mask_dc00dc00); // lo surr: 1101110000000000, other: 0000000000000000
|
|
__m512i shifted4_thirdsecondandlastbytes = _mm512_srli_epi16(thirdsecondandlastbytes,
|
|
4); // hi surr: 00000WVUTSRQPNML vuts = WVUTS - 1
|
|
__m512i tagged_lo_surrogates = _mm512_or_si512(thirdsecondandlastbytes,
|
|
lo_surr_mask); // lo surr: 110111KJHGFEDCBA, other: unchanged
|
|
__m512i Wout = _mm512_mask_add_epi16(tagged_lo_surrogates, Mhi, shifted4_thirdsecondandlastbytes,
|
|
mask_d7c0d7c0); // hi sur: 110110vutsRQPNML, other: unchanged
|
|
// the elements of Wout excluding the last element if it happens to be a high surrogate:
|
|
__mmask32 Mout = ~(Mhi & 0x80000000);
|
|
__mmask64 mprocessed = (tail == SIMDUTF_FULL) ? _pdep_u64(Mout, mend) : _pdep_u64(Mout, _kand_mask64(mend, b)); // we adjust mend at the end of the output.
|
|
|
|
|
|
// mismatched continuation bytes:
|
|
if (tail == SIMDUTF_FULL) {
|
|
__mmask64 xnormcm1234 = _kxnor_mask64(mc, m1234); // XNOR of mc and m1234 should be all zero if they differ
|
|
// the presence of a 1 bit indicates that they overlap.
|
|
// _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1 if all zeroes.
|
|
if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { return false; }
|
|
} else {
|
|
__mmask64 bxorm1234 = _kxor_mask64(b, m1234);
|
|
if (mc != bxorm1234) { return false; }
|
|
}
|
|
// Encodings out of range...
|
|
{
|
|
// the location of 3-byte sequence start bytes in the input
|
|
__mmask64 m3 = m34 & (b ^ m4);
|
|
// words in Wout corresponding to 3-byte sequences.
|
|
__mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
|
|
__m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
|
|
__mmask32 Msmall800 = _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
|
|
__m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
|
|
__m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
|
|
__mmask32 M3s = _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
|
|
__m512i mask_04000400 = _mm512_set1_epi32(0x04000400);
|
|
__mmask32 M4s = _mm512_mask_cmpge_epu16_mask(Mhi, Moutminusd800, mask_04000400);
|
|
if (!_kortestz_mask32_u8(M4s, _kor_mask32(Msmall800, M3s))) { return false; }
|
|
}
|
|
in += 64 - _lzcnt_u64(mprocessed);
|
|
int64_t nout = _mm_popcnt_u64(mprocessed);
|
|
if(big_endian) { Wout = _mm512_shuffle_epi8(Wout, byteflip); }
|
|
_mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
|
|
out += nout;
|
|
return true; // ok
|
|
}
|
|
// Fast path 2: all ASCII or 2 byte
|
|
__mmask64 continuation_or_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m234) : _kand_mask64(_knot_mask64(m234), b);
|
|
// on top of -0xc0 we substract -2 which we get back later of the
|
|
// continuation byte tags
|
|
__m512i leading2byte = _mm512_maskz_sub_epi8(m234, input, mask_c2c2c2c2);
|
|
__mmask64 leading = tail == (tail == SIMDUTF_FULL) ? _kor_mask64(m1, m234) : _kand_mask64(_kor_mask64(m1, m234), b); // first bytes of each sequence
|
|
if (tail == SIMDUTF_FULL) {
|
|
__mmask64 xnor234leading = _kxnor_mask64(_kshiftli_mask64(m234, 1), leading);
|
|
if (!_kortestz_mask64_u8(xnor234leading, xnor234leading)) { return false; }
|
|
} else {
|
|
__mmask64 bxorleading = _kxor_mask64(b, leading);
|
|
if (_kshiftli_mask64(m234, 1) != bxorleading) { return false; }
|
|
}
|
|
//
|
|
if (tail == SIMDUTF_FULL) {
|
|
// In the two-byte/ASCII scenario, we are easily latency bound, so we want
|
|
// to increment the input buffer as quickly as possible.
|
|
// We process 32 bytes unless the byte at index 32 is a continuation byte,
|
|
// in which case we include it as well for a total of 33 bytes.
|
|
// Note that if x is an ASCII byte, then the following is false:
|
|
// int8_t(x) <= int8_t(0xc0) under two's complement.
|
|
in += 32;
|
|
if(int8_t(*in) <= int8_t(0xc0)) in++;
|
|
// The alternative is to do
|
|
// in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
|
|
// but it requires loading the input, doing the mask computation, and converting
|
|
// back the mask to a general register. It just takes too long, leaving the
|
|
// processor likely to be idle.
|
|
} else {
|
|
in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
|
|
}
|
|
__m512i lead = _mm512_maskz_compress_epi8(leading, leading2byte); // will contain zero for ascii, and the data
|
|
lead = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(lead)); // ... zero extended into words
|
|
__m512i follow = _mm512_maskz_compress_epi8(continuation_or_ascii, input); // the last bytes of each sequence
|
|
follow = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(follow)); // ... zero extended into words
|
|
lead = _mm512_slli_epi16(lead, 6); // shifted into position
|
|
__m512i final = _mm512_add_epi16(follow, lead); // combining lead and follow
|
|
|
|
if(big_endian) { final = _mm512_shuffle_epi8(final, byteflip); }
|
|
if (tail == SIMDUTF_FULL) {
|
|
// Next part is UTF-16 specific and can be generalized to UTF-32.
|
|
int nout = _mm_popcnt_u32(uint32_t(leading));
|
|
_mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
|
|
out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
|
|
} else {
|
|
int nout = int(_mm_popcnt_u64(_pdep_u64(0xFFFFFFFF, leading)));
|
|
_mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
|
|
out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
|
|
}
|
|
|
|
return true; // we are fine.
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
utf32_to_utf16_masked converts `count` lower UTF-32 words
|
|
from input `utf32` into UTF-16. It differs from utf32_to_utf16
|
|
in that it 'masks' the writes.
|
|
|
|
Returns how many 16-bit words were stored.
|
|
|
|
byteflip is used for flipping 16-bit words, and it should be
|
|
__m512i byteflip = _mm512_setr_epi64(
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809
|
|
);
|
|
We pass it to the (always inlined) function to encourage the compiler to
|
|
keep the value in a (constant) register.
|
|
*/
|
|
template <endianness big_endian>
|
|
simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, __m512i utf32, unsigned int count, char16_t* output) {
|
|
|
|
const __mmask16 valid = uint16_t((1 << count) - 1);
|
|
// 1. check if we have any surrogate pairs
|
|
const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff);
|
|
const __mmask16 sp_mask = _mm512_mask_cmpgt_epu32_mask(valid, utf32, v_0000_ffff);
|
|
|
|
if (sp_mask == 0) {
|
|
if(big_endian) {
|
|
_mm256_mask_storeu_epi16((__m256i*)output, valid, _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32), _mm512_castsi512_si256(byteflip)));
|
|
|
|
} else {
|
|
_mm256_mask_storeu_epi16((__m256i*)output, valid, _mm512_cvtepi32_epi16(utf32));
|
|
}
|
|
return count;
|
|
}
|
|
|
|
{
|
|
// build surrogate pair words in 32-bit lanes
|
|
|
|
// t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb]
|
|
const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000);
|
|
const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000);
|
|
|
|
// t1 = 8 x [000000aaaaaaaaaa|bbbbbbbbbb000000]
|
|
const __m512i t1 = _mm512_slli_epi32(t0, 6);
|
|
|
|
// t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1 to t0
|
|
// 0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000)
|
|
const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000);
|
|
const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4);
|
|
|
|
// t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1 to t0
|
|
// 0xba = (t2 and not v_fc00_fc000) or v_d800_dc00
|
|
const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00);
|
|
const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00);
|
|
const __m512i t3 = _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba);
|
|
const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3);
|
|
__m512i t5 = _mm512_ror_epi32(t4, 16);
|
|
// Here we want to trim all of the upper 16-bit words from the 2-byte
|
|
// characters represented as 4-byte values. We can compute it from
|
|
// sp_mask or the following... It can be more optimized!
|
|
const __mmask32 nonzero = _kor_mask32(0xaaaaaaaa,_mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
|
|
const __mmask32 nonzero_masked = _kand_mask32(nonzero, __mmask32((uint64_t(1) << (2*count)) - 1));
|
|
if(big_endian) { t5 = _mm512_shuffle_epi8(t5, byteflip); }
|
|
// we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability (zen4)
|
|
__m512i compressed = _mm512_maskz_compress_epi16(nonzero_masked, t5);
|
|
_mm512_mask_storeu_epi16(output, (1<<(count + static_cast<unsigned int>(count_ones(sp_mask)))) - 1, compressed);
|
|
//_mm512_mask_compressstoreu_epi16(output, nonzero_masked, t5);
|
|
}
|
|
|
|
return count + static_cast<unsigned int>(count_ones(sp_mask));
|
|
}
|
|
|
|
/*
|
|
utf32_to_utf16 converts `count` lower UTF-32 words
|
|
from input `utf32` into UTF-16. It may overflow.
|
|
|
|
Returns how many 16-bit words were stored.
|
|
|
|
byteflip is used for flipping 16-bit words, and it should be
|
|
__m512i byteflip = _mm512_setr_epi64(
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809
|
|
);
|
|
We pass it to the (always inlined) function to encourage the compiler to
|
|
keep the value in a (constant) register.
|
|
*/
|
|
template <endianness big_endian>
|
|
simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip, __m512i utf32, unsigned int count, char16_t* output) {
|
|
// check if we have any surrogate pairs
|
|
const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff);
|
|
const __mmask16 sp_mask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff);
|
|
|
|
if (sp_mask == 0) {
|
|
// technically, it should be _mm256_storeu_epi16
|
|
if(big_endian) {
|
|
_mm256_storeu_si256((__m256i*)output, _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32),_mm512_castsi512_si256(byteflip)));
|
|
} else {
|
|
_mm256_storeu_si256((__m256i*)output, _mm512_cvtepi32_epi16(utf32));
|
|
}
|
|
return count;
|
|
}
|
|
|
|
{
|
|
// build surrogate pair words in 32-bit lanes
|
|
|
|
// t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb]
|
|
const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000);
|
|
const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000);
|
|
|
|
// t1 = 8 x [000000aaaaaaaaaa|bbbbbbbbbb000000]
|
|
const __m512i t1 = _mm512_slli_epi32(t0, 6);
|
|
|
|
// t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1 to t0
|
|
// 0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000)
|
|
const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000);
|
|
const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4);
|
|
|
|
// t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1 to t0
|
|
// 0xba = (t2 and not v_fc00_fc000) or v_d800_dc00
|
|
const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00);
|
|
const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00);
|
|
const __m512i t3 = _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba);
|
|
const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3);
|
|
__m512i t5 = _mm512_ror_epi32(t4, 16);
|
|
const __mmask32 nonzero = _kor_mask32(0xaaaaaaaa,_mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
|
|
if(big_endian) { t5 = _mm512_shuffle_epi8(t5, byteflip); }
|
|
// we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability (zen4)
|
|
__m512i compressed = _mm512_maskz_compress_epi16(nonzero, t5);
|
|
_mm512_mask_storeu_epi16(output, (1<<(count + static_cast<unsigned int>(count_ones(sp_mask)))) - 1, compressed);
|
|
//_mm512_mask_compressstoreu_epi16(output, nonzero, t5);
|
|
}
|
|
|
|
return count + static_cast<unsigned int>(count_ones(sp_mask));
|
|
}
|
|
|
|
/**
|
|
* Store the last N bytes of previous followed by 512-N bytes from input.
|
|
*/
|
|
template <int N>
|
|
__m512i prev(__m512i input, __m512i previous) {
|
|
static_assert(N<=32, "N must be no larger than 32");
|
|
const __m512i movemask = _mm512_setr_epi32(28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11);
|
|
const __m512i rotated = _mm512_permutex2var_epi32(input, movemask, previous);
|
|
#if SIMDUTF_GCC8 || SIMDUTF_GCC9
|
|
constexpr int shift = 16-N; // workaround for GCC8,9
|
|
return _mm512_alignr_epi8(input, rotated, shift);
|
|
#else
|
|
return _mm512_alignr_epi8(input, rotated, 16-N);
|
|
#endif // SIMDUTF_GCC8 || SIMDUTF_GCC9
|
|
}
|
|
|
|
template <unsigned idx0, unsigned idx1, unsigned idx2, unsigned idx3>
|
|
__m512i shuffle_epi128(__m512i v) {
|
|
static_assert((idx0 >= 0 && idx0 <= 3), "idx0 must be in range 0..3");
|
|
static_assert((idx1 >= 0 && idx1 <= 3), "idx1 must be in range 0..3");
|
|
static_assert((idx2 >= 0 && idx2 <= 3), "idx2 must be in range 0..3");
|
|
static_assert((idx3 >= 0 && idx3 <= 3), "idx3 must be in range 0..3");
|
|
|
|
constexpr unsigned shuffle = idx0 | (idx1 << 2) | (idx2 << 4) | (idx3 << 6);
|
|
return _mm512_shuffle_i32x4(v, v, shuffle);
|
|
}
|
|
|
|
// Returns a vector whose four 128-bit lanes are all copies of lane `idx` of `v`
// (implemented as shuffle_epi128<idx, idx, idx, idx>).
// NOTE(review): declared constexpr although it forwards to shuffle_epi128, which is not constexpr.
template <unsigned idx>
|
|
constexpr __m512i broadcast_epi128(__m512i v) {
|
|
return shuffle_epi128<idx, idx, idx, idx>(v);
|
|
}
|
|
|
|
/**
|
|
* Current unused.
|
|
*/
|
|
template <int N>
|
|
__m512i rotate_by_N_epi8(const __m512i input) {
|
|
|
|
// lanes order: 1, 2, 3, 0 => 0b00_11_10_01
|
|
const __m512i permuted = _mm512_shuffle_i32x4(input, input, 0x39);
|
|
|
|
return _mm512_alignr_epi8(permuted, input, N);
|
|
}
|
|
|
|
/*
|
|
expanded_utf8_to_utf32 converts expanded UTF-8 characters (`utf8`)
|
|
stored at separate 32-bit lanes.
|
|
|
|
For each lane we have also a character class (`char_class), given in form
|
|
0x8080800N, where N is 4 higest bits from the leading byte; 0x80 resets
|
|
corresponding bytes during pshufb.
|
|
*/
|
|
simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, __m512i utf8) {
    /*
        Turns UTF-8 characters that were already spread out one per 32-bit
        word (leading byte in byte 0, the following input bytes in
        bytes 1..3) into UTF-32 code points, one per 32-bit word.

        Input:
        - utf8:       the expanded UTF-8 bytes
        - char_class: per-word selector for the two shift tables below.
                      The tables are indexed by the high nibble of the
                      leading byte. Any selector byte with its MSB set
                      makes the byte shuffle produce 0; the callers in
                      this file set bytes 1..3 of each word to 0x80 so
                      each 32-bit shift count is just the table entry.

        Layout of one word on entry (`?` = bits we must not rely on):

        |10dd.dddd|10cc.cccc|10bb.bbbb|1111.0aaa| 4-byte char
        |????.????|10cc.cccc|10bb.bbbb|1110.aaaa| 3-byte char
        |????.????|????.????|10bb.bbbb|110a.aaaa| 2-byte char
        |????.????|????.????|????.????|0aaa.aaaa| ASCII char
          byte 3    byte 2    byte 1    byte 0
    */

    // Step 1: drop the two control bits of bytes 1..3 and the MSB of
    // byte 0, so every byte is a small non-negative number.
    const __m512i payload = _mm512_and_si512(utf8, _mm512_set1_epi32(0x3f3f3f7f));

    // Step 2: inside every 16-bit half compute (low byte * 0x40 + high byte),
    // i.e. join fields A-B and fields C-D.
    //
    //  |0000.cccc|ccdd.dddd|0001.110a|aabb.bbbb| 4-byte char
    //  |0000.cccc|cc??.????|0001.10aa|aabb.bbbb| 3-byte char
    //  |0000.????|????.????|0001.0aaa|aabb.bbbb| 2-byte char
    //  |0000.????|????.????|000a.aaaa|aa??.????| ASCII char
    const __m512i joined_pairs = _mm512_maddubs_epi16(payload, _mm512_set1_epi32(0x01400140));

    // Step 3: inside every 32-bit word compute (low half * 0x1000 + high half),
    // i.e. join AB with CD.
    //
    //  |0000.0001|110a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char
    //  |0000.0001|10aa.aabb|bbbb.cccc|cc??.????| 3-byte char
    //  |0000.0001|0aaa.aabb|bbbb.????|????.????| 2-byte char
    //  |0000.000a|aaaa.aa??|????.????|????.????| ASCII char
    const __m512i joined_word = _mm512_madd_epi16(joined_pairs, _mm512_set1_epi32(0x00011000));

    // Step 4: shift left by a class-dependent amount to push the remaining
    // UTF-8 marker bits out of the top of the word.
    //
    //   class nibble 0000..0111 (ASCII)        -> 7
    //   class nibble 1000..1011 (continuation) -> 0
    //   class nibble 1100..1101 (2-byte lead)  -> 9
    //   class nibble 1110       (3-byte lead)  -> 10
    //   class nibble 1111       (4-byte lead)  -> 11
    const long long left_for_0_to_7  = 0x0707070707070707;
    const long long left_for_8_to_15 = 0x0b0a090900000000;
    const __m512i left_table = _mm512_setr_epi64(
        left_for_0_to_7, left_for_8_to_15,
        left_for_0_to_7, left_for_8_to_15,
        left_for_0_to_7, left_for_8_to_15,
        left_for_0_to_7, left_for_8_to_15);
    const __m512i left_amount = _mm512_shuffle_epi8(left_table, char_class);
    const __m512i top_aligned = _mm512_sllv_epi32(joined_word, left_amount);

    // Step 5: shift right by a class-dependent amount to discard the
    // unknown low bits and right-align the code point.
    //
    //   ASCII -> 25, continuation -> 0, 2-byte -> 21, 3-byte -> 16, 4-byte -> 11
    //
    //  |0000.0000|000a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char
    //  |0000.0000|0000.0000|aaaa.bbbb|bbcc.cccc| 3-byte char
    //  |0000.0000|0000.0000|0000.0aaa|aabb.bbbb| 2-byte char
    //  |0000.0000|0000.0000|0000.0000|0aaa.aaaa| ASCII char
    const long long right_for_0_to_7  = 0x1919191919191919;
    const long long right_for_8_to_15 = 0x0b10151500000000;
    const __m512i right_table = _mm512_setr_epi64(
        right_for_0_to_7, right_for_8_to_15,
        right_for_0_to_7, right_for_8_to_15,
        right_for_0_to_7, right_for_8_to_15,
        right_for_0_to_7, right_for_8_to_15);
    const __m512i right_amount = _mm512_shuffle_epi8(right_table, char_class);

    return _mm512_srlv_epi32(top_aligned, right_amount);
}
|
|
|
|
|
|
simdutf_really_inline __m512i expand_and_identify(__m512i lane0, __m512i lane1, int &count) {
    // Builds the 16 overlapping 4-byte windows that start at each of the
    // 16 bytes held in `lane0`, keeps only the windows whose first byte is
    // not a continuation byte (10xx.xxxx), and packs those windows to the
    // front of the result. The remaining 32-bit words are zero.
    //
    // `count` receives the number of windows that were kept.

    // 32-bit element 12 (the first dword of the last 128-bit lane) is taken
    // from lane1; everything else comes from lane0.
    const __m512i source = _mm512_mask_mov_epi32(lane0, 0x1000, lane1);

    // Byte shuffle indices producing the sliding 4-byte windows; in the
    // last 128-bit lane indices 0..2 refer to the bytes taken from lane1.
    const __m512i window_indices = _mm512_setr_epi64(
        0x0403020103020100,
        0x0605040305040302,
        0x0807060507060504,
        0x0a09080709080706,
        0x0c0b0a090b0a0908,
        0x0e0d0c0b0d0c0b0a,
        0x000f0e0d0f0e0d0c,
        0x0201000f01000f0e);
    const __m512i windows = _mm512_shuffle_epi8(source, window_indices);

    // A window starts a character unless the top two bits of its first
    // byte are exactly `10`.
    const __m512i top_two_bits = _mm512_and_si512(windows, _mm512_set1_epi32(0xc0));
    const __mmask16 starts_char = _mm512_cmpneq_epu32_mask(top_two_bits, _mm512_set1_epi32(0x80));

    count = static_cast<int>(count_ones(starts_char));
    return _mm512_mask_compress_epi32(_mm512_setzero_si512(), starts_char, windows);
}
|
|
|
|
simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) {
    // Derives the per-word character class from the expanded UTF-8 words
    // and hands both to expanded_utf8_to_utf32:
    //
    //   char_class = ((input >> 4) & 0x0000000f) | 0x80808000
    //
    // Byte 0 of each word becomes the high nibble of the leading byte and
    // bytes 1..3 get their MSB set.
    const __m512i nibble_mask = _mm512_set1_epi32(0x0f);
    const __m512i upper_flags = _mm512_set1_epi32(0x80808000);
    const __m512i shifted     = _mm512_srli_epi32(input, 4);

    // Truth table 0xea evaluates (a & b) | c in a single instruction.
    const __m512i char_class =
        _mm512_ternarylogic_epi32(shifted, nibble_mask, upper_flags, 0xea);

    return expanded_utf8_to_utf32(char_class, input);
}
|
|
/* end file src/icelake/icelake_utf8_common.inl.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_macros.inl.cpp
|
|
/* begin file src/icelake/icelake_macros.inl.cpp */
|
|
|
|
/*
|
|
This upcoming macro (SIMDUTF_ICELAKE_TRANSCODE16) takes 16 + 4 bytes (of a UTF-8 string)
|
|
and loads all possible 4-byte substrings into an AVX512 register.
|
|
|
|
For example if we have bytes abcdefgh... we create following 32-bit lanes
|
|
|
|
[abcd|bcde|cdef|defg|efgh|...]
|
|
^ ^
|
|
byte 0 of reg byte 63 of reg
|
|
*/
|
|
/** pshufb
|
|
# lane{0,1,2} have got bytes: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
|
|
# lane3 has got bytes: [ 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
|
|
|
|
expand_ver2 = [
|
|
# lane 0:
|
|
0, 1, 2, 3,
|
|
1, 2, 3, 4,
|
|
2, 3, 4, 5,
|
|
3, 4, 5, 6,
|
|
|
|
# lane 1:
|
|
4, 5, 6, 7,
|
|
5, 6, 7, 8,
|
|
6, 7, 8, 9,
|
|
7, 8, 9, 10,
|
|
|
|
# lane 2:
|
|
8, 9, 10, 11,
|
|
9, 10, 11, 12,
|
|
10, 11, 12, 13,
|
|
11, 12, 13, 14,
|
|
|
|
# lane 3 order: 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18
|
|
12, 13, 14, 15,
|
|
13, 14, 15, 0,
|
|
14, 15, 0, 1,
|
|
15, 0, 1, 2,
|
|
]
|
|
*/
|
|
|
|
/*
    SIMDUTF_ICELAKE_TRANSCODE16(LANE0, LANE1, MASKED)

    Transcodes the characters that start within the 16 bytes held in LANE0
    (LANE1 supplies the 4 look-ahead bytes) and appends them to `output`.

    The macro is not self-contained. The expansion site must provide:
      - UTF32      : boolean, true when `output` receives UTF-32 words
      - output     : output pointer; advanced by the macro
      - byteflip   : shuffle mask, only used on the UTF-16 path
      - big_endian : template argument, only used on the UTF-16 path

    MASKED selects between a store limited to the produced words (true)
    and a full-width 64-byte store (false) on the UTF-32 path; on the
    UTF-16 path it selects utf32_to_utf16_masked vs. utf32_to_utf16.

    Macro parameters are parenthesized so that arbitrary expressions can
    be passed safely.
*/
#define SIMDUTF_ICELAKE_TRANSCODE16(LANE0, LANE1, MASKED) \
        { \
            const __m512i merged = _mm512_mask_mov_epi32((LANE0), 0x1000, (LANE1)); \
            const __m512i expand_ver2 = _mm512_setr_epi64( \
                0x0403020103020100, \
                0x0605040305040302, \
                0x0807060507060504, \
                0x0a09080709080706, \
                0x0c0b0a090b0a0908, \
                0x0e0d0c0b0d0c0b0a, \
                0x000f0e0d0f0e0d0c, \
                0x0201000f01000f0e \
            ); \
            const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2); \
 \
            __mmask16 leading_bytes; \
            const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0); \
            const __m512i t0 = _mm512_and_si512(input, v_0000_00c0); \
            const __m512i v_0000_0080 = _mm512_set1_epi32(0x80); \
            leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080); \
 \
            __m512i char_class; \
            char_class = _mm512_srli_epi32(input, 4); \
            /*  char_class = ((input >> 4) & 0x0f) | 0x80808000 */ \
            const __m512i v_0000_000f = _mm512_set1_epi32(0x0f); \
            const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000); \
            char_class = _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea); \
 \
            const int valid_count = static_cast<int>(count_ones(leading_bytes)); \
            const __m512i utf32 = expanded_utf8_to_utf32(char_class, input); \
 \
            const __m512i out = _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes, utf32); \
 \
            if (UTF32) { \
                if ((MASKED)) { \
                    const __mmask16 valid = uint16_t((1 << valid_count) - 1); \
                    _mm512_mask_storeu_epi32((__m512i*)output, valid, out); \
                } else { \
                    _mm512_storeu_si512((__m512i*)output, out); \
                } \
                output += valid_count; \
            } else { \
                if ((MASKED)) { \
                    output += utf32_to_utf16_masked<big_endian>(byteflip, out, valid_count, reinterpret_cast<char16_t *>(output)); \
                } else { \
                    output += utf32_to_utf16<big_endian>(byteflip, out, valid_count, reinterpret_cast<char16_t *>(output)); \
                } \
            } \
        }
|
|
|
|
/*
    SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(INPUT, VALID_COUNT, MASKED)

    Writes the first VALID_COUNT UTF-32 words of INPUT to `output`, either
    directly (UTF32 is true) or after conversion to UTF-16, and advances
    `output` accordingly.

    The expansion site must provide `UTF32`, `output`, and, for the UTF-16
    path, `byteflip` and `big_endian`.

    With MASKED == false the UTF-32 path stores a full 64-byte vector, so
    the caller must guarantee that much room after `output`.

    Macro parameters are parenthesized so that expressions such as
    `a + b` or `c ? x : y` behave as a single operand.
*/
#define SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(INPUT, VALID_COUNT, MASKED) \
{ \
    if (UTF32) { \
        if ((MASKED)) { \
            const __mmask16 valid_mask = uint16_t((1 << (VALID_COUNT)) - 1); \
            _mm512_mask_storeu_epi32((__m512i*)output, valid_mask, (INPUT)); \
        } else { \
            _mm512_storeu_si512((__m512i*)output, (INPUT)); \
        } \
        output += (VALID_COUNT); \
    } else { \
        if ((MASKED)) { \
            output += utf32_to_utf16_masked<big_endian>(byteflip, (INPUT), (VALID_COUNT), reinterpret_cast<char16_t *>(output)); \
        } else { \
            output += utf32_to_utf16<big_endian>(byteflip, (INPUT), (VALID_COUNT), reinterpret_cast<char16_t *>(output)); \
        } \
    } \
}
|
|
|
|
|
|
/*
    SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)

    Zero-extends the 64 bytes in `utf8` and stores them at `output`:
      - UTF32 true : four stores of 16 32-bit words each
      - UTF32 false: two stores of 32 16-bit words each, byte-swapped with
                     `byteflip` when `big_endian` is true

    `output` is NOT advanced; the caller does that. `big_endian` and
    `byteflip` must be visible at the expansion site.

    The body is wrapped in braces so the macro expands to one compound
    statement (no dangling-else surprises), and parameters are
    parenthesized. It can still be used without a trailing semicolon.
*/
#define SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) \
    { \
        if ((UTF32)) { \
            const __m128i t0 = _mm512_castsi512_si128((utf8)); \
            const __m128i t1 = _mm512_extracti32x4_epi32((utf8), 1); \
            const __m128i t2 = _mm512_extracti32x4_epi32((utf8), 2); \
            const __m128i t3 = _mm512_extracti32x4_epi32((utf8), 3); \
            _mm512_storeu_si512((__m512i*)((output) + 0*16), _mm512_cvtepu8_epi32(t0)); \
            _mm512_storeu_si512((__m512i*)((output) + 1*16), _mm512_cvtepu8_epi32(t1)); \
            _mm512_storeu_si512((__m512i*)((output) + 2*16), _mm512_cvtepu8_epi32(t2)); \
            _mm512_storeu_si512((__m512i*)((output) + 3*16), _mm512_cvtepu8_epi32(t3)); \
        } else { \
            const __m256i h0 = _mm512_castsi512_si256((utf8)); \
            const __m256i h1 = _mm512_extracti64x4_epi64((utf8), 1); \
            if (big_endian) { \
                _mm512_storeu_si512((__m512i*)((output) + 0*16), _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h0), byteflip)); \
                _mm512_storeu_si512((__m512i*)((output) + 2*16), _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h1), byteflip)); \
            } else { \
                _mm512_storeu_si512((__m512i*)((output) + 0*16), _mm512_cvtepu8_epi16(h0)); \
                _mm512_storeu_si512((__m512i*)((output) + 2*16), _mm512_cvtepu8_epi16(h1)); \
            } \
        } \
    }
|
|
/* end file src/icelake/icelake_macros.inl.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_from_valid_utf8.inl.cpp
|
|
/* begin file src/icelake/icelake_from_valid_utf8.inl.cpp */
|
|
// file included directly
|
|
|
|
// File contains conversion procedure from VALID UTF-8 strings.
|
|
|
|
/*
|
|
valid_utf8_to_fixed_length converts a valid UTF-8 string into UTF-32.
|
|
|
|
The `OUTPUT` template type decides what to do with UTF-32: store
|
|
it directly or convert into UTF-16 (with AVX512).
|
|
|
|
Input:
|
|
- str - valid UTF-8 string
|
|
- len - string length
|
|
- out_buffer - output buffer
|
|
|
|
Result:
|
|
- pair.first - the first unprocessed input byte
|
|
- pair.second - the first unprocessed output word
|
|
*/
|
|
template <endianness big_endian, typename OUTPUT>
|
|
std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size_t len, OUTPUT* dwords) {
|
|
constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
|
|
constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
|
|
static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
|
|
static_assert(!(UTF32 and big_endian), "we do not currently support big-endian UTF-32");
|
|
|
|
__m512i byteflip = _mm512_setr_epi64(
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809
|
|
);
|
|
const char* ptr = str;
|
|
const char* end = ptr + len;
|
|
|
|
OUTPUT* output = dwords;
|
|
/**
|
|
* In the main loop, we consume 64 bytes per iteration,
|
|
* but we access 64 + 4 bytes.
|
|
* We check for ptr + 64 + 64 <= end because
|
|
* we want to be do maskless writes without overruns.
|
|
*/
|
|
while (ptr + 64 + 64 <= end) {
|
|
const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
|
|
const __m512i v_80 = _mm512_set1_epi8(char(0x80));
|
|
const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80);
|
|
if(ascii == 0) {
|
|
SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
|
|
output += 64;
|
|
ptr += 64;
|
|
continue;
|
|
}
|
|
|
|
const __m512i lane0 = broadcast_epi128<0>(utf8);
|
|
const __m512i lane1 = broadcast_epi128<1>(utf8);
|
|
int valid_count0;
|
|
__m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
|
|
const __m512i lane2 = broadcast_epi128<2>(utf8);
|
|
int valid_count1;
|
|
__m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
|
|
if(valid_count0 + valid_count1 <= 16) {
|
|
vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
|
|
valid_count0 += valid_count1;
|
|
vec0 = expand_utf8_to_utf32(vec0);
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
|
|
} else {
|
|
vec0 = expand_utf8_to_utf32(vec0);
|
|
vec1 = expand_utf8_to_utf32(vec1);
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, false)
|
|
}
|
|
const __m512i lane3 = broadcast_epi128<3>(utf8);
|
|
int valid_count2;
|
|
__m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
|
|
uint32_t tmp1;
|
|
::memcpy(&tmp1, ptr + 64, sizeof(tmp1));
|
|
const __m512i lane4 = _mm512_set1_epi32(tmp1);
|
|
int valid_count3;
|
|
__m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
|
|
if(valid_count2 + valid_count3 <= 16) {
|
|
vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<<valid_count3)-1)<<valid_count2), vec3);
|
|
valid_count2 += valid_count3;
|
|
vec2 = expand_utf8_to_utf32(vec2);
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
|
|
} else {
|
|
vec2 = expand_utf8_to_utf32(vec2);
|
|
vec3 = expand_utf8_to_utf32(vec3);
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, false)
|
|
}
|
|
ptr += 4*16;
|
|
}
|
|
|
|
if (ptr + 64 <= end) {
|
|
const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
|
|
const __m512i v_80 = _mm512_set1_epi8(char(0x80));
|
|
const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80);
|
|
if(ascii == 0) {
|
|
SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
|
|
output += 64;
|
|
ptr += 64;
|
|
} else {
|
|
const __m512i lane0 = broadcast_epi128<0>(utf8);
|
|
const __m512i lane1 = broadcast_epi128<1>(utf8);
|
|
int valid_count0;
|
|
__m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
|
|
const __m512i lane2 = broadcast_epi128<2>(utf8);
|
|
int valid_count1;
|
|
__m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
|
|
if(valid_count0 + valid_count1 <= 16) {
|
|
vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
|
|
valid_count0 += valid_count1;
|
|
vec0 = expand_utf8_to_utf32(vec0);
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
|
|
} else {
|
|
vec0 = expand_utf8_to_utf32(vec0);
|
|
vec1 = expand_utf8_to_utf32(vec1);
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
|
|
}
|
|
|
|
const __m512i lane3 = broadcast_epi128<3>(utf8);
|
|
SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
|
|
|
|
ptr += 3*16;
|
|
}
|
|
}
|
|
return {ptr, output};
|
|
}
|
|
|
|
|
|
// Pair of (input position, output position) used as a return type by the
// UTF-8 to UTF-16 conversion routines in this file.
using utf8_to_utf16_result = std::pair<const char*, char16_t*>;
|
|
/* end file src/icelake/icelake_from_valid_utf8.inl.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_utf8_validation.inl.cpp
|
|
/* begin file src/icelake/icelake_utf8_validation.inl.cpp */
|
|
// file included directly
|
|
|
|
|
|
simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i prev1) {
    // Classifies every (previous byte, current byte) pair with three
    // 16-entry table lookups: one keyed by the high nibble of `prev1`,
    // one by its low nibble, and one by the high nibble of `input`.
    // A bit survives in the result only if it is set in all three
    // lookups. The caller combines the result with
    // check_multibyte_lengths and treats nonzero bytes as errors.
    const __m512i nibble = _mm512_set1_epi8(0x0f);

    // Table indices (one nibble per byte).
    const __m512i prev_hi_index = _mm512_and_si512(_mm512_srli_epi16(prev1, 4), nibble);
    const __m512i prev_lo_index = _mm512_and_si512(prev1, nibble);
    const __m512i curr_hi_index = _mm512_and_si512(_mm512_srli_epi16(input, 4), nibble);

    // Each table is a 16-byte pattern repeated in all four 128-bit lanes.
    const __m512i prev_hi_table = _mm512_setr_epi64(
        0x0202020202020202, 0x4915012180808080,
        0x0202020202020202, 0x4915012180808080,
        0x0202020202020202, 0x4915012180808080,
        0x0202020202020202, 0x4915012180808080);
    const __m512i prev_lo_table = _mm512_setr_epi64(
        0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb,
        0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb,
        0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb,
        0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb);
    const __m512i curr_hi_table = _mm512_setr_epi64(
        0x0101010101010101, 0x01010101babaaee6,
        0x0101010101010101, 0x01010101babaaee6,
        0x0101010101010101, 0x01010101babaaee6,
        0x0101010101010101, 0x01010101babaaee6);

    const __m512i byte_1_high = _mm512_shuffle_epi8(prev_hi_table, prev_hi_index);
    const __m512i byte_1_low  = _mm512_shuffle_epi8(prev_lo_table, prev_lo_index);
    const __m512i byte_2_high = _mm512_shuffle_epi8(curr_hi_table, curr_hi_index);

    // Truth table 0x80 is the three-way AND: a & b & c.
    return _mm512_ternarylogic_epi64(byte_1_high, byte_1_low, byte_2_high, 0x80);
}
|
|
|
|
simdutf_really_inline __m512i check_multibyte_lengths(const __m512i input,
    const __m512i prev_input, const __m512i sc) {
    // Marks every position that must hold the 3rd or 4th byte of a
    // multi-byte character (bit 7 set) and XORs that mark with `sc`,
    // the result of check_special_cases.
    const __m512i two_back   = prev<2>(input, prev_input);
    const __m512i three_back = prev<3>(input, prev_input);

    // Saturating subtraction: nonzero only for bytes >= 0xe0 (111_____)
    // two positions back ...
    const __m512i after_3byte_lead = _mm512_subs_epu8(two_back, _mm512_set1_epi8(char(0xdf)));
    // ... and only for bytes >= 0xf0 (1111____) three positions back.
    const __m512i after_4byte_lead = _mm512_subs_epu8(three_back, _mm512_set1_epi8(char(0xef)));

    const __m512i any_nonzero = _mm512_or_si512(after_3byte_lead, after_4byte_lead);

    // Adding 0x7f (saturating) moves "nonzero" into bit 7 of each byte.
    const __m512i msb_if_expected =
        _mm512_adds_epu8(_mm512_set1_epi8(char(0x7f)), any_nonzero);

    // Truth table 0x6a evaluates (a & b) ^ c, i.e.
    // (msb_if_expected & 0x80) ^ sc, in a single instruction.
    const __m512i msb_only = _mm512_set1_epi8(char(0x80));
    return _mm512_ternarylogic_epi32(msb_if_expected, msb_only, sc, 0x6a);
}
|
|
//
|
|
// Return nonzero if there are incomplete multibyte characters at the end of the block:
|
|
// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
|
|
//
|
|
simdutf_really_inline __m512i is_incomplete(const __m512i input) {
|
|
// If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
|
|
// ... 1111____ 111_____ 11______
|
|
__m512i max_value = _mm512_setr_epi64(
|
|
0xffffffffffffffff,
|
|
0xffffffffffffffff,
|
|
0xffffffffffffffff,
|
|
0xffffffffffffffff,
|
|
0xffffffffffffffff,
|
|
0xffffffffffffffff,
|
|
0xffffffffffffffff,
|
|
0xbfdfefffffffffff);
|
|
return _mm512_subs_epu8(input, max_value);
|
|
}
|
|
|
|
struct avx512_utf8_checker {
|
|
// If this is nonzero, there has been a UTF-8 error.
|
|
__m512i error{};
|
|
|
|
// The last input we received
|
|
__m512i prev_input_block{};
|
|
// Whether the last input we received was incomplete (used for ASCII fast path)
|
|
__m512i prev_incomplete{};
|
|
|
|
//
|
|
// Check whether the current bytes are valid UTF-8.
|
|
//
|
|
simdutf_really_inline void check_utf8_bytes(const __m512i input, const __m512i prev_input) {
|
|
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
|
|
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
|
|
__m512i prev1 = prev<1>(input, prev_input);
|
|
__m512i sc = check_special_cases(input, prev1);
|
|
this->error = _mm512_or_si512(check_multibyte_lengths(input, prev_input, sc), this->error);
|
|
}
|
|
|
|
// The only problem that can happen at EOF is that a multibyte character is too short
|
|
// or a byte value too large in the last bytes: check_special_cases only checks for bytes
|
|
// too large in the first of two bytes.
|
|
simdutf_really_inline void check_eof() {
|
|
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
|
|
// possibly finish them.
|
|
this->error = _mm512_or_si512(this->error, this->prev_incomplete);
|
|
}
|
|
|
|
// returns true if ASCII.
|
|
simdutf_really_inline bool check_next_input(const __m512i input) {
|
|
const __m512i v_80 = _mm512_set1_epi8(char(0x80));
|
|
const __mmask64 ascii = _mm512_test_epi8_mask(input, v_80);
|
|
if(ascii == 0) {
|
|
this->error = _mm512_or_si512(this->error, this->prev_incomplete);
|
|
return true;
|
|
} else {
|
|
this->check_utf8_bytes(input, this->prev_input_block);
|
|
this->prev_incomplete = is_incomplete(input);
|
|
this->prev_input_block = input;
|
|
return false;
|
|
}
|
|
}
|
|
// do not forget to call check_eof!
|
|
simdutf_really_inline bool errors() const {
|
|
return _mm512_test_epi8_mask(this->error, this->error) != 0;
|
|
}
|
|
|
|
}; // struct avx512_utf8_checker
|
|
/* end file src/icelake/icelake_utf8_validation.inl.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_from_utf8.inl.cpp
|
|
/* begin file src/icelake/icelake_from_utf8.inl.cpp */
|
|
// file included directly
|
|
|
|
// File contains conversion procedure from possibly invalid UTF-8 strings.
|
|
|
|
/**
|
|
* Attempts to convert up to len 1-byte words from in (in UTF-8 format) to
|
|
* out.
|
|
* Returns the position of the input and output after the processing is
|
|
* completed. Upon error, the output is set to null.
|
|
*/
|
|
|
|
template <endianness big_endian>
|
|
utf8_to_utf16_result fast_avx512_convert_utf8_to_utf16(const char *in, size_t len, char16_t *out) {
|
|
const char *const final_in = in + len;
|
|
bool result = true;
|
|
while (result) {
|
|
if (in + 64 <= final_in) {
|
|
result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(in, out, final_in - in);
|
|
} else if(in < final_in) {
|
|
result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(in, out, final_in - in);
|
|
} else { break; }
|
|
}
|
|
if(!result) { out = nullptr; }
|
|
return std::make_pair(in, out);
|
|
}
|
|
|
|
template <endianness big_endian>
|
|
simdutf::result fast_avx512_convert_utf8_to_utf16_with_errors(const char *in, size_t len, char16_t *out) {
|
|
const char *const init_in = in;
|
|
const char16_t *const init_out = out;
|
|
const char *const final_in = in + len;
|
|
bool result = true;
|
|
while (result) {
|
|
if (in + 64 <= final_in) {
|
|
result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(in, out, final_in - in);
|
|
} else if(in < final_in) {
|
|
result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(in, out, final_in - in);
|
|
} else { break; }
|
|
}
|
|
if(!result) {
|
|
// rewind_and_convert_with_errors will seek a potential error from in onward,
|
|
// with the ability to go back up to in - init_in bytes, and read final_in - in bytes forward.
|
|
simdutf::result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<big_endian>(in - init_in, in, final_in - in, out);
|
|
res.count += (in - init_in);
|
|
return res;
|
|
} else {
|
|
return simdutf::result(error_code::SUCCESS,out - init_out);
|
|
}
|
|
}
|
|
|
|
|
|
template <endianness big_endian, typename OUTPUT>
|
|
std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str, size_t len, OUTPUT* dwords) {
|
|
constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
|
|
constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
|
|
static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
|
|
static_assert(!(UTF32 and big_endian), "we do not currently support big-endian UTF-32");
|
|
|
|
const char* ptr = str;
|
|
const char* end = ptr + len;
|
|
__m512i byteflip = _mm512_setr_epi64(
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809
|
|
);
|
|
OUTPUT* output = dwords;
|
|
avx512_utf8_checker checker{};
|
|
/**
|
|
* In the main loop, we consume 64 bytes per iteration,
|
|
* but we access 64 + 4 bytes.
|
|
* We check for ptr + 64 + 64 <= end because
|
|
* we want to be do maskless writes without overruns.
|
|
*/
|
|
while (ptr + 64 + 64 <= end) {
|
|
const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
|
|
if(checker.check_next_input(utf8)) {
|
|
SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
|
|
output += 64;
|
|
ptr += 64;
|
|
continue;
|
|
}
|
|
const __m512i lane0 = broadcast_epi128<0>(utf8);
|
|
const __m512i lane1 = broadcast_epi128<1>(utf8);
|
|
int valid_count0;
|
|
__m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
|
|
const __m512i lane2 = broadcast_epi128<2>(utf8);
|
|
int valid_count1;
|
|
__m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
|
|
if(valid_count0 + valid_count1 <= 16) {
|
|
vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
|
|
valid_count0 += valid_count1;
|
|
vec0 = expand_utf8_to_utf32(vec0);
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
|
|
} else {
|
|
vec0 = expand_utf8_to_utf32(vec0);
|
|
vec1 = expand_utf8_to_utf32(vec1);
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, false)
|
|
}
|
|
const __m512i lane3 = broadcast_epi128<3>(utf8);
|
|
int valid_count2;
|
|
__m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
|
|
uint32_t tmp1;
|
|
::memcpy(&tmp1, ptr + 64, sizeof(tmp1));
|
|
const __m512i lane4 = _mm512_set1_epi32(tmp1);
|
|
int valid_count3;
|
|
__m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
|
|
if(valid_count2 + valid_count3 <= 16) {
|
|
vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<<valid_count3)-1)<<valid_count2), vec3);
|
|
valid_count2 += valid_count3;
|
|
vec2 = expand_utf8_to_utf32(vec2);
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
|
|
} else {
|
|
vec2 = expand_utf8_to_utf32(vec2);
|
|
vec3 = expand_utf8_to_utf32(vec3);
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, false)
|
|
}
|
|
ptr += 4*16;
|
|
}
|
|
const char* validatedptr = ptr; // validated up to ptr
|
|
|
|
// For the final pass, we validate 64 bytes, but we only transcode
|
|
// 3*16 bytes, so we may end up double-validating 16 bytes.
|
|
if (ptr + 64 <= end) {
|
|
const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
|
|
if(checker.check_next_input(utf8)) {
|
|
SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
|
|
output += 64;
|
|
ptr += 64;
|
|
} else {
|
|
const __m512i lane0 = broadcast_epi128<0>(utf8);
|
|
const __m512i lane1 = broadcast_epi128<1>(utf8);
|
|
int valid_count0;
|
|
__m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
|
|
const __m512i lane2 = broadcast_epi128<2>(utf8);
|
|
int valid_count1;
|
|
__m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
|
|
if(valid_count0 + valid_count1 <= 16) {
|
|
vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
|
|
valid_count0 += valid_count1;
|
|
vec0 = expand_utf8_to_utf32(vec0);
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
|
|
} else {
|
|
vec0 = expand_utf8_to_utf32(vec0);
|
|
vec1 = expand_utf8_to_utf32(vec1);
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
|
|
}
|
|
|
|
const __m512i lane3 = broadcast_epi128<3>(utf8);
|
|
SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
|
|
|
|
ptr += 3*16;
|
|
}
|
|
validatedptr += 4*16;
|
|
}
|
|
{
|
|
const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - validatedptr))-1, (const __m512i*)validatedptr);
|
|
checker.check_next_input(utf8);
|
|
}
|
|
checker.check_eof();
|
|
if(checker.errors()) {
|
|
return {ptr, nullptr}; // We found an error.
|
|
}
|
|
return {ptr, output};
|
|
}
|
|
|
|
// Like validating_utf8_to_fixed_length but returns as soon as an error is identified
|
|
template <endianness big_endian, typename OUTPUT>
|
|
std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_constant_checks(const char* str, size_t len, OUTPUT* dwords) {
|
|
constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
|
|
constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
|
|
static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
|
|
static_assert(!(UTF32 and big_endian), "we do not currently support big-endian UTF-32");
|
|
|
|
const char* ptr = str;
|
|
const char* end = ptr + len;
|
|
__m512i byteflip = _mm512_setr_epi64(
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809
|
|
);
|
|
OUTPUT* output = dwords;
|
|
avx512_utf8_checker checker{};
|
|
/**
|
|
* In the main loop, we consume 64 bytes per iteration,
|
|
* but we access 64 + 4 bytes.
|
|
* We check for ptr + 64 + 64 <= end because
|
|
* we want to be do maskless writes without overruns.
|
|
*/
|
|
while (ptr + 64 + 64 <= end) {
|
|
const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
|
|
if(checker.check_next_input(utf8)) {
|
|
SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
|
|
output += 64;
|
|
ptr += 64;
|
|
continue;
|
|
}
|
|
if(checker.errors()) {
|
|
return {ptr, output, false}; // We found an error.
|
|
}
|
|
const __m512i lane0 = broadcast_epi128<0>(utf8);
|
|
const __m512i lane1 = broadcast_epi128<1>(utf8);
|
|
int valid_count0;
|
|
__m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
|
|
const __m512i lane2 = broadcast_epi128<2>(utf8);
|
|
int valid_count1;
|
|
__m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
|
|
if(valid_count0 + valid_count1 <= 16) {
|
|
vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
|
|
valid_count0 += valid_count1;
|
|
vec0 = expand_utf8_to_utf32(vec0);
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
|
|
} else {
|
|
vec0 = expand_utf8_to_utf32(vec0);
|
|
vec1 = expand_utf8_to_utf32(vec1);
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, false)
|
|
}
|
|
const __m512i lane3 = broadcast_epi128<3>(utf8);
|
|
int valid_count2;
|
|
__m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
|
|
uint32_t tmp1;
|
|
::memcpy(&tmp1, ptr + 64, sizeof(tmp1));
|
|
const __m512i lane4 = _mm512_set1_epi32(tmp1);
|
|
int valid_count3;
|
|
__m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
|
|
if(valid_count2 + valid_count3 <= 16) {
|
|
vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<<valid_count3)-1)<<valid_count2), vec3);
|
|
valid_count2 += valid_count3;
|
|
vec2 = expand_utf8_to_utf32(vec2);
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
|
|
} else {
|
|
vec2 = expand_utf8_to_utf32(vec2);
|
|
vec3 = expand_utf8_to_utf32(vec3);
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, false)
|
|
}
|
|
ptr += 4*16;
|
|
}
|
|
const char* validatedptr = ptr; // validated up to ptr
|
|
|
|
// For the final pass, we validate 64 bytes, but we only transcode
|
|
// 3*16 bytes, so we may end up double-validating 16 bytes.
|
|
if (ptr + 64 <= end) {
|
|
const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
|
|
if(checker.check_next_input(utf8)) {
|
|
SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
|
|
output += 64;
|
|
ptr += 64;
|
|
} else if(checker.errors()) {
|
|
return {ptr, output, false}; // We found an error.
|
|
} else {
|
|
const __m512i lane0 = broadcast_epi128<0>(utf8);
|
|
const __m512i lane1 = broadcast_epi128<1>(utf8);
|
|
int valid_count0;
|
|
__m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
|
|
const __m512i lane2 = broadcast_epi128<2>(utf8);
|
|
int valid_count1;
|
|
__m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
|
|
if(valid_count0 + valid_count1 <= 16) {
|
|
vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
|
|
valid_count0 += valid_count1;
|
|
vec0 = expand_utf8_to_utf32(vec0);
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
|
|
} else {
|
|
vec0 = expand_utf8_to_utf32(vec0);
|
|
vec1 = expand_utf8_to_utf32(vec1);
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
|
|
SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
|
|
}
|
|
|
|
const __m512i lane3 = broadcast_epi128<3>(utf8);
|
|
SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
|
|
|
|
ptr += 3*16;
|
|
}
|
|
validatedptr += 4*16;
|
|
}
|
|
{
|
|
const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - validatedptr))-1, (const __m512i*)validatedptr);
|
|
checker.check_next_input(utf8);
|
|
}
|
|
checker.check_eof();
|
|
if(checker.errors()) {
|
|
return {ptr, output, false}; // We found an error.
|
|
}
|
|
return {ptr, output, true};
|
|
}
|
|
/* end file src/icelake/icelake_from_utf8.inl.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_convert_utf16_to_utf32.inl.cpp
|
|
/* begin file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */
|
|
// file included directly
|
|
|
|
/*
|
|
Returns a pair: the first unprocessed byte from buf and utf32_output
|
|
A scalar routine should carry on the conversion of the tail.
|
|
*/
|
|
template <endianness big_endian>
|
|
std::tuple<const char16_t*, char32_t*, bool> convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) {
|
|
const char16_t* end = buf + len;
|
|
const __m512i v_fc00 = _mm512_set1_epi16((uint16_t)0xfc00);
|
|
const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
|
|
const __m512i v_dc00 = _mm512_set1_epi16((uint16_t)0xdc00);
|
|
__mmask32 carry{0};
|
|
const __m512i byteflip = _mm512_setr_epi64(
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809
|
|
);
|
|
while (buf + 32 <= end) {
|
|
// Always safe because buf + 32 <= end so that end - buf >= 32 bytes:
|
|
__m512i in = _mm512_loadu_si512((__m512i*)buf);
|
|
if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); }
|
|
|
|
// H - bitmask for high surrogates
|
|
const __mmask32 H = _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_d800);
|
|
// H - bitmask for low surrogates
|
|
const __mmask32 L = _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_dc00);
|
|
|
|
if ((H|L)) {
|
|
// surrogate pair(s) in a register
|
|
const __mmask32 V = (L ^ (carry | (H << 1))); // A high surrogate must be followed by low one and a low one must be preceded by a high one.
|
|
// If valid, V should be equal to 0
|
|
|
|
if(V == 0) {
|
|
// valid case
|
|
/*
|
|
Input surrogate pair:
|
|
|1101.11aa.aaaa.aaaa|1101.10bb.bbbb.bbbb|
|
|
low surrogate high surrogate
|
|
*/
|
|
/* 1. Expand all words to 32-bit words
|
|
in |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
|
|
*/
|
|
const __m512i first = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
|
|
const __m512i second = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in,1));
|
|
|
|
/* 2. Shift by one 16-bit word to align low surrogates with high surrogates
|
|
in |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
|
|
shifted |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
|
|
*/
|
|
const __m512i shifted_first = _mm512_alignr_epi32(second, first, 1);
|
|
const __m512i shifted_second = _mm512_alignr_epi32(_mm512_setzero_si512(), second, 1);
|
|
|
|
/* 3. Align all high surrogates in first and second by shifting to the left by 10 bits
|
|
|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
|
|
*/
|
|
const __m512i aligned_first = _mm512_mask_slli_epi32(first, (__mmask16)H, first, 10);
|
|
const __m512i aligned_second = _mm512_mask_slli_epi32(second, (__mmask16)(H>>16), second, 10);
|
|
|
|
/* 4. Remove surrogate prefixes and add offset 0x10000 by adding in, shifted and constant
|
|
in |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
|
|
shifted |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
|
|
constant|1111.1100.1010.0000.0010.0100.0000.0000|1111.1100.1010.0000.0010.0100.0000.0000|
|
|
*/
|
|
const __m512i constant = _mm512_set1_epi32((uint32_t)0xfca02400);
|
|
const __m512i added_first = _mm512_mask_add_epi32(aligned_first, (__mmask16)H, aligned_first, shifted_first);
|
|
const __m512i utf32_first = _mm512_mask_add_epi32(added_first, (__mmask16)H, added_first, constant);
|
|
|
|
const __m512i added_second = _mm512_mask_add_epi32(aligned_second, (__mmask16)(H>>16), aligned_second, shifted_second);
|
|
const __m512i utf32_second = _mm512_mask_add_epi32(added_second, (__mmask16)(H>>16), added_second, constant);
|
|
|
|
// 5. Store all valid UTF-32 words (low surrogate positions and 32nd word are invalid)
|
|
const __mmask32 valid = ~L & 0x7fffffff;
|
|
// We deliberately do a _mm512_maskz_compress_epi32 followed by storeu_epi32
|
|
// to ease performance portability to Zen 4.
|
|
const __m512i compressed_first = _mm512_maskz_compress_epi32((__mmask16)(valid), utf32_first);
|
|
const size_t howmany1 = count_ones((uint16_t)(valid));
|
|
_mm512_storeu_si512((__m512i *) utf32_output, compressed_first);
|
|
utf32_output += howmany1;
|
|
const __m512i compressed_second = _mm512_maskz_compress_epi32((__mmask16)(valid >> 16), utf32_second);
|
|
const size_t howmany2 = count_ones((uint16_t)(valid >> 16));
|
|
// The following could be unsafe in some cases?
|
|
//_mm512_storeu_epi32((__m512i *) utf32_output, compressed_second);
|
|
_mm512_mask_storeu_epi32((__m512i *) utf32_output, __mmask16((1<<howmany2)-1), compressed_second);
|
|
utf32_output += howmany2;
|
|
// Only process 31 words, but keep track if the 31st word is a high surrogate as a carry
|
|
buf += 31;
|
|
carry = (H >> 30) & 0x1;
|
|
} else {
|
|
// invalid case
|
|
return std::make_tuple(buf+carry, utf32_output, false);
|
|
}
|
|
} else {
|
|
// no surrogates
|
|
// extend all thirty-two 16-bit words to thirty-two 32-bit words
|
|
_mm512_storeu_si512((__m512i *)(utf32_output), _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in)));
|
|
_mm512_storeu_si512((__m512i *)(utf32_output) + 1, _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in,1)));
|
|
utf32_output += 32;
|
|
buf += 32;
|
|
carry = 0;
|
|
}
|
|
} // while
|
|
return std::make_tuple(buf+carry, utf32_output, true);
|
|
}
|
|
/* end file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_convert_utf32_to_utf8.inl.cpp
|
|
/* begin file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */
|
|
// file included directly
|
|
|
|
// Todo: currently, this is just the haswell code, optimize for icelake kernel.
|
|
std::pair<const char32_t*, char*> avx512_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) {
|
|
const char32_t* end = buf + len;
|
|
const __m256i v_0000 = _mm256_setzero_si256();
|
|
const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
|
|
const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
|
|
const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
|
|
const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
|
|
const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
|
|
__m256i running_max = _mm256_setzero_si256();
|
|
__m256i forbidden_bytemask = _mm256_setzero_si256();
|
|
|
|
const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
|
|
|
|
while (buf + 16 + safety_margin <= end) {
|
|
__m256i in = _mm256_loadu_si256((__m256i*)buf);
|
|
__m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
|
|
running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
|
|
|
|
// Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
|
|
__m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
|
|
in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
|
|
|
|
// Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
|
|
|
|
if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
|
|
// 1. pack the bytes
|
|
const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
|
|
// 2. store (16 bytes)
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
|
|
// 3. adjust pointers
|
|
buf += 16;
|
|
utf8_output += 16;
|
|
continue; // we are done for this round!
|
|
}
|
|
// no bits set above 7th bit
|
|
const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
|
|
const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
|
|
|
|
// no bits set above 11th bit
|
|
const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
|
|
const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
|
|
if (one_or_two_bytes_bitmask == 0xffffffff) {
|
|
// 1. prepare 2-byte values
|
|
// input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
|
|
// expected output : [110a|aaaa|10bb|bbbb] x 8
|
|
const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
|
|
const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
|
|
|
|
// t0 = [000a|aaaa|bbbb|bb00]
|
|
const __m256i t0 = _mm256_slli_epi16(in_16, 2);
|
|
// t1 = [000a|aaaa|0000|0000]
|
|
const __m256i t1 = _mm256_and_si256(t0, v_1f00);
|
|
// t2 = [0000|0000|00bb|bbbb]
|
|
const __m256i t2 = _mm256_and_si256(in_16, v_003f);
|
|
// t3 = [000a|aaaa|00bb|bbbb]
|
|
const __m256i t3 = _mm256_or_si256(t1, t2);
|
|
// t4 = [110a|aaaa|10bb|bbbb]
|
|
const __m256i t4 = _mm256_or_si256(t3, v_c080);
|
|
|
|
// 2. merge ASCII and 2-byte codewords
|
|
const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
|
|
|
|
// 3. prepare bitmask for 8-bit lookup
|
|
const uint32_t M0 = one_byte_bitmask & 0x55555555;
|
|
const uint32_t M1 = M0 >> 7;
|
|
const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
|
|
// 4. pack the bytes
|
|
|
|
const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
|
|
const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
|
|
|
|
const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
|
|
const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
|
|
|
|
const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
|
|
// 5. store bytes
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
|
|
utf8_output += row[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
|
|
utf8_output += row_2[0];
|
|
|
|
// 6. adjust pointers
|
|
buf += 16;
|
|
continue;
|
|
}
|
|
// Must check for overflow in packing
|
|
const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
|
|
const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
|
|
if (saturation_bitmask == 0xffffffff) {
|
|
// case: words from register produce either 1, 2 or 3 UTF-8 bytes
|
|
const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
|
|
forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
|
|
|
|
const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
|
|
0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
|
|
|
|
/* In this branch we handle three cases:
|
|
1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte
|
|
2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes
|
|
3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
|
|
|
|
We expand the input word (16-bit) into two words (32-bit), thus
|
|
we have room for four bytes. However, we need five distinct bit
|
|
layouts. Note that the last byte in cases #2 and #3 is the same.
|
|
|
|
We precompute byte 1 for case #1 and the common byte for cases #2 & #3
|
|
in register t2.
|
|
|
|
We precompute byte 1 for case #3 and -- **conditionally** -- precompute
|
|
either byte 1 for case #2 or byte 2 for case #3. Note that they
|
|
differ by exactly one bit.
|
|
|
|
Finally from these two words we build proper UTF-8 sequence, taking
|
|
into account the case (i.e, the number of bytes to write).
|
|
*/
|
|
/**
|
|
* Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
|
|
* t2 => [0ccc|cccc] [10cc|cccc]
|
|
* s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
|
|
*/
|
|
#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
|
|
// [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
|
|
const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
|
|
// [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
|
|
const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
|
|
// [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
|
|
const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
|
|
|
|
// [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
|
|
const __m256i s0 = _mm256_srli_epi16(in_16, 4);
|
|
// [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
|
|
const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
|
|
// [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
|
|
const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
|
|
// [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
|
|
const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
|
|
const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
|
|
const __m256i s4 = _mm256_xor_si256(s3, m0);
|
|
#undef simdutf_vec
|
|
|
|
// 4. expand words 16-bit => 32-bit
|
|
const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
|
|
const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
|
|
|
|
// 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
|
|
const uint32_t mask = (one_byte_bitmask & 0x55555555) |
|
|
(one_or_two_bytes_bitmask & 0xaaaaaaaa);
|
|
// Due to the wider registers, the following path is less likely to be useful.
|
|
/*if(mask == 0) {
|
|
// We only have three-byte words. Use fast path.
|
|
const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
|
|
const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
|
|
const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
|
|
utf8_output += 12;
|
|
buf += 16;
|
|
continue;
|
|
}*/
|
|
const uint8_t mask0 = uint8_t(mask);
|
|
const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
|
|
const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
|
|
const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
|
|
|
|
const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
|
|
const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
|
|
const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
|
|
const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
|
|
|
|
const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
|
|
const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
|
|
const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
|
|
const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
|
|
|
|
|
|
const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
|
|
const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
|
|
const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
|
|
const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
|
|
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_0);
|
|
utf8_output += row0[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_1);
|
|
utf8_output += row1[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_2);
|
|
utf8_output += row2[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_3);
|
|
utf8_output += row3[0];
|
|
buf += 16;
|
|
} else {
|
|
// case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// may require large, non-trivial tables?
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint32_t word = buf[k];
|
|
if((word & 0xFFFFFF80)==0) { // 1-byte (ASCII)
|
|
*utf8_output++ = char(word);
|
|
} else if((word & 0xFFFFF800)==0) { // 2-byte
|
|
*utf8_output++ = char((word>>6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else if((word & 0xFFFF0000 )==0) { // 3-byte
|
|
if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); }
|
|
*utf8_output++ = char((word>>12) | 0b11100000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else { // 4-byte
|
|
if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); }
|
|
*utf8_output++ = char((word>>18) | 0b11110000);
|
|
*utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
|
|
// check for invalid input
|
|
const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
|
|
if(static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
|
|
return std::make_pair(nullptr, utf8_output);
|
|
}
|
|
|
|
if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); }
|
|
|
|
return std::make_pair(buf, utf8_output);
|
|
}
|
|
|
|
// Todo: currently, this is just the haswell code, optimize for icelake kernel.
|
|
std::pair<result, char*> avx512_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
|
|
const char32_t* end = buf + len;
|
|
const char32_t* start = buf;
|
|
|
|
const __m256i v_0000 = _mm256_setzero_si256();
|
|
const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
|
|
const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
|
|
const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
|
|
const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
|
|
const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
|
|
const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
|
|
|
|
const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
|
|
|
|
while (buf + 16 + safety_margin <= end) {
|
|
__m256i in = _mm256_loadu_si256((__m256i*)buf);
|
|
__m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
|
|
// Check for too large input
|
|
const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
|
|
if(static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
|
|
return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
|
|
}
|
|
|
|
// Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
|
|
__m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
|
|
in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
|
|
|
|
// Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
|
|
|
|
if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
|
|
// 1. pack the bytes
|
|
const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
|
|
// 2. store (16 bytes)
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
|
|
// 3. adjust pointers
|
|
buf += 16;
|
|
utf8_output += 16;
|
|
continue; // we are done for this round!
|
|
}
|
|
// no bits set above 7th bit
|
|
const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
|
|
const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
|
|
|
|
// no bits set above 11th bit
|
|
const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
|
|
const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
|
|
if (one_or_two_bytes_bitmask == 0xffffffff) {
|
|
// 1. prepare 2-byte values
|
|
// input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
|
|
// expected output : [110a|aaaa|10bb|bbbb] x 8
|
|
const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
|
|
const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
|
|
|
|
// t0 = [000a|aaaa|bbbb|bb00]
|
|
const __m256i t0 = _mm256_slli_epi16(in_16, 2);
|
|
// t1 = [000a|aaaa|0000|0000]
|
|
const __m256i t1 = _mm256_and_si256(t0, v_1f00);
|
|
// t2 = [0000|0000|00bb|bbbb]
|
|
const __m256i t2 = _mm256_and_si256(in_16, v_003f);
|
|
// t3 = [000a|aaaa|00bb|bbbb]
|
|
const __m256i t3 = _mm256_or_si256(t1, t2);
|
|
// t4 = [110a|aaaa|10bb|bbbb]
|
|
const __m256i t4 = _mm256_or_si256(t3, v_c080);
|
|
|
|
// 2. merge ASCII and 2-byte codewords
|
|
const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
|
|
|
|
// 3. prepare bitmask for 8-bit lookup
|
|
const uint32_t M0 = one_byte_bitmask & 0x55555555;
|
|
const uint32_t M1 = M0 >> 7;
|
|
const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
|
|
// 4. pack the bytes
|
|
|
|
const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
|
|
const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
|
|
|
|
const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
|
|
const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
|
|
|
|
const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
|
|
// 5. store bytes
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
|
|
utf8_output += row[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
|
|
utf8_output += row_2[0];
|
|
|
|
// 6. adjust pointers
|
|
buf += 16;
|
|
continue;
|
|
}
|
|
// Must check for overflow in packing
|
|
const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
|
|
const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
|
|
if (saturation_bitmask == 0xffffffff) {
|
|
// case: words from register produce either 1, 2 or 3 UTF-8 bytes
|
|
|
|
// Check for illegal surrogate words
|
|
const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
|
|
const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
|
|
if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
|
|
return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
|
|
}
|
|
|
|
const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
|
|
0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
|
|
|
|
/* In this branch we handle three cases:
|
|
1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte
|
|
2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes
|
|
3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
|
|
|
|
We expand the input word (16-bit) into two words (32-bit), thus
|
|
we have room for four bytes. However, we need five distinct bit
|
|
layouts. Note that the last byte in cases #2 and #3 is the same.
|
|
|
|
We precompute byte 1 for case #1 and the common byte for cases #2 & #3
|
|
in register t2.
|
|
|
|
We precompute byte 1 for case #3 and -- **conditionally** -- precompute
|
|
either byte 1 for case #2 or byte 2 for case #3. Note that they
|
|
differ by exactly one bit.
|
|
|
|
Finally from these two words we build proper UTF-8 sequence, taking
|
|
into account the case (i.e, the number of bytes to write).
|
|
*/
|
|
/**
|
|
* Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
|
|
* t2 => [0ccc|cccc] [10cc|cccc]
|
|
* s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
|
|
*/
|
|
#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
|
|
// [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
|
|
const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
|
|
// [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
|
|
const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
|
|
// [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
|
|
const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
|
|
|
|
// [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
|
|
const __m256i s0 = _mm256_srli_epi16(in_16, 4);
|
|
// [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
|
|
const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
|
|
// [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
|
|
const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
|
|
// [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
|
|
const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
|
|
const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
|
|
const __m256i s4 = _mm256_xor_si256(s3, m0);
|
|
#undef simdutf_vec
|
|
|
|
// 4. expand words 16-bit => 32-bit
|
|
const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
|
|
const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
|
|
|
|
// 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
|
|
const uint32_t mask = (one_byte_bitmask & 0x55555555) |
|
|
(one_or_two_bytes_bitmask & 0xaaaaaaaa);
|
|
// Due to the wider registers, the following path is less likely to be useful.
|
|
/*if(mask == 0) {
|
|
// We only have three-byte words. Use fast path.
|
|
const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
|
|
const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
|
|
const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
|
|
utf8_output += 12;
|
|
buf += 16;
|
|
continue;
|
|
}*/
|
|
const uint8_t mask0 = uint8_t(mask);
|
|
const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
|
|
const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
|
|
const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
|
|
|
|
const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
|
|
const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
|
|
const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
|
|
const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
|
|
|
|
const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
|
|
const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
|
|
const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
|
|
const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
|
|
|
|
|
|
const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
|
|
const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
|
|
const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
|
|
const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
|
|
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_0);
|
|
utf8_output += row0[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_1);
|
|
utf8_output += row1[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_2);
|
|
utf8_output += row2[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_3);
|
|
utf8_output += row3[0];
|
|
buf += 16;
|
|
} else {
|
|
// case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// may require large, non-trivial tables?
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint32_t word = buf[k];
|
|
if((word & 0xFFFFFF80)==0) { // 1-byte (ASCII)
|
|
*utf8_output++ = char(word);
|
|
} else if((word & 0xFFFFF800)==0) { // 2-byte
|
|
*utf8_output++ = char((word>>6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else if((word & 0xFFFF0000 )==0) { // 3-byte
|
|
if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); }
|
|
*utf8_output++ = char((word>>12) | 0b11100000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else { // 4-byte
|
|
if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output); }
|
|
*utf8_output++ = char((word>>18) | 0b11110000);
|
|
*utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
|
|
return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
|
|
}
|
|
/* end file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_convert_utf32_to_utf16.inl.cpp
|
|
/* begin file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */
|
|
// file included directly
|
|
|
|
// Todo: currently, this is just the haswell code, optimize for icelake kernel.
|
|
template <endianness big_endian>
|
|
std::pair<const char32_t*, char16_t*> avx512_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) {
|
|
const char32_t* end = buf + len;
|
|
|
|
const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
|
|
__m256i forbidden_bytemask = _mm256_setzero_si256();
|
|
|
|
|
|
while (buf + 8 + safety_margin <= end) {
|
|
__m256i in = _mm256_loadu_si256((__m256i*)buf);
|
|
|
|
const __m256i v_00000000 = _mm256_setzero_si256();
|
|
const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
|
|
|
|
// no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
|
|
const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
|
|
const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
|
|
|
|
if (saturation_bitmask == 0xffffffff) {
|
|
const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
|
|
const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
|
|
forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800));
|
|
|
|
__m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
|
|
if (big_endian) {
|
|
const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
|
|
}
|
|
_mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
|
|
utf16_output += 8;
|
|
buf += 8;
|
|
} else {
|
|
size_t forward = 7;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint32_t word = buf[k];
|
|
if((word & 0xFFFF0000)==0) {
|
|
// will not generate a surrogate pair
|
|
if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); }
|
|
*utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
|
|
} else {
|
|
// will generate a surrogate pair
|
|
if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); }
|
|
word -= 0x10000;
|
|
uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
|
|
uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
|
|
if (big_endian) {
|
|
high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
|
|
low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
|
|
}
|
|
*utf16_output++ = char16_t(high_surrogate);
|
|
*utf16_output++ = char16_t(low_surrogate);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
}
|
|
|
|
// check for invalid input
|
|
if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); }
|
|
|
|
return std::make_pair(buf, utf16_output);
|
|
}
|
|
|
|
// Todo: currently, this is just the haswell code, optimize for icelake kernel.
|
|
template <endianness big_endian>
|
|
std::pair<result, char16_t*> avx512_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) {
|
|
const char32_t* start = buf;
|
|
const char32_t* end = buf + len;
|
|
|
|
const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
|
|
|
|
while (buf + 8 + safety_margin <= end) {
|
|
__m256i in = _mm256_loadu_si256((__m256i*)buf);
|
|
|
|
const __m256i v_00000000 = _mm256_setzero_si256();
|
|
const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
|
|
|
|
// no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
|
|
const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
|
|
const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
|
|
|
|
if (saturation_bitmask == 0xffffffff) {
|
|
const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
|
|
const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
|
|
const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
|
|
if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
|
|
return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output);
|
|
}
|
|
|
|
__m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
|
|
if (big_endian) {
|
|
const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
|
|
}
|
|
_mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
|
|
utf16_output += 8;
|
|
buf += 8;
|
|
} else {
|
|
size_t forward = 7;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint32_t word = buf[k];
|
|
if((word & 0xFFFF0000)==0) {
|
|
// will not generate a surrogate pair
|
|
if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); }
|
|
*utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
|
|
} else {
|
|
// will generate a surrogate pair
|
|
if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); }
|
|
word -= 0x10000;
|
|
uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
|
|
uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
|
|
if (big_endian) {
|
|
high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
|
|
low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
|
|
}
|
|
*utf16_output++ = char16_t(high_surrogate);
|
|
*utf16_output++ = char16_t(low_surrogate);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
}
|
|
|
|
return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
|
|
}
|
|
/* end file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_ascii_validation.inl.cpp
|
|
/* begin file src/icelake/icelake_ascii_validation.inl.cpp */
|
|
// file included directly
|
|
|
|
/**
 * Returns true when none of the `len` bytes at `buf` has its top bit set.
 * Full 64-byte blocks are OR-accumulated; a final partial block is read
 * with a masked load so nothing past `buf + len` is touched.
 */
bool validate_ascii(const char* buf, size_t len) {
  const char* const stop = buf + len;
  const __m512i high_bit = _mm512_set1_epi8((uint8_t)0x80);
  __m512i seen = _mm512_setzero_si512();

  while (buf + 64 <= stop) {
    const __m512i chunk = _mm512_loadu_si512((const __m512i*)buf);
    seen = _mm512_ternarylogic_epi32(seen, chunk, high_bit, 0xf8); // seen | (chunk & high_bit)
    buf += 64;
  }

  if (buf < stop) {
    // 1..63 bytes left: enable exactly that many low lanes.
    const __m512i chunk = _mm512_maskz_loadu_epi8((uint64_t(1) << (stop - buf)) - 1, (const __m512i*)buf);
    seen = _mm512_ternarylogic_epi32(seen, chunk, high_bit, 0xf8); // seen | (chunk & high_bit)
  }

  return _mm512_test_epi8_mask(seen, seen) == 0;
}
|
|
/* end file src/icelake/icelake_ascii_validation.inl.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_utf32_validation.inl.cpp
|
|
/* begin file src/icelake/icelake_utf32_validation.inl.cpp */
|
|
// file included directly
|
|
|
|
/**
 * Validates the leading full 16-word blocks of a UTF-32 buffer.
 *
 * Tracks the running maximum of the raw words and of the words shifted by
 * 0xffff2000 (which moves the surrogate range 0xD800..0xDFFF to the top of
 * the unsigned range), then checks both maxima once after the loop.
 *
 * @param buf UTF-32 input
 * @param len number of char32_t in buf
 * @return nullptr if a processed block holds a value above 0x10FFFF or a
 *         surrogate; otherwise a pointer just past the last processed block
 *         (the caller validates the remaining < 16 words). For an input
 *         shorter than 16 words this is `buf` itself, so a null `buf` with
 *         `len == 0` yields nullptr without reading memory.
 */
const char32_t* validate_utf32(const char32_t* buf, size_t len) {
  const __m512i offset = _mm512_set1_epi32((uint32_t)0xffff2000);
  __m512i currentmax = _mm512_setzero_si512();
  __m512i currentoffsetmax = _mm512_setzero_si512();

  // Count-based loop: no pointer is ever compared against a null sentinel,
  // and no load happens unless 16 words are really available.
  for (size_t remaining = len; remaining >= 16; remaining -= 16) {
    const __m512i utf32 = _mm512_loadu_si512((const __m512i*)buf);
    buf += 16;
    currentoffsetmax = _mm512_max_epu32(_mm512_add_epi32(utf32, offset), currentoffsetmax);
    currentmax = _mm512_max_epu32(utf32, currentmax);
  }

  const __m512i standardmax = _mm512_set1_epi32((uint32_t)0x10ffff);
  const __m512i standardoffsetmax = _mm512_set1_epi32((uint32_t)0xfffff7ff);

  // max(x, limit) ^ limit is non-zero exactly when some lane exceeds limit.
  __m512i is_zero = _mm512_xor_si512(_mm512_max_epu32(currentmax, standardmax), standardmax);
  if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) {
    return nullptr;
  }
  is_zero = _mm512_xor_si512(_mm512_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
  if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) {
    return nullptr;
  }

  return buf;
}
|
|
/* end file src/icelake/icelake_utf32_validation.inl.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_convert_utf16_to_utf8.inl.cpp
|
|
/* begin file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */
|
|
// file included directly
|
|
|
|
/**
|
|
* This function converts the input (inbuf, inlen), assumed to be valid
|
|
* UTF16 (little endian) into UTF-8 (to outbuf). The number of words written
|
|
* is written to 'outlen' and the function reports the number of input word
|
|
* consumed.
|
|
*/
|
|
template <endianness big_endian>
|
|
size_t utf16_to_utf8_avx512i(const char16_t *inbuf, size_t inlen,
|
|
unsigned char *outbuf, size_t *outlen) {
|
|
__m512i in;
|
|
__mmask32 inmask = _cvtu32_mask32(0x7fffffff);
|
|
__m512i byteflip = _mm512_setr_epi64(
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809
|
|
);
|
|
const char16_t * const inbuf_orig = inbuf;
|
|
const unsigned char * const outbuf_orig = outbuf;
|
|
size_t adjust = 0;
|
|
int carry = 0;
|
|
|
|
while (inlen >= 32) {
|
|
in = _mm512_loadu_si512(inbuf);
|
|
if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); }
|
|
inlen -= 31;
|
|
lastiteration:
|
|
inbuf += 31;
|
|
|
|
failiteration:
|
|
const __mmask32 is234byte = _mm512_mask_cmp_epu16_mask(
|
|
inmask, in, _mm512_set1_epi16(0x0080), _MM_CMPINT_NLT);
|
|
|
|
if (_ktestz_mask32_u8(inmask, is234byte)) {
|
|
// fast path for ASCII only
|
|
_mm512_mask_cvtepi16_storeu_epi8(outbuf, inmask, in);
|
|
outbuf += 31;
|
|
carry = 0;
|
|
|
|
if (inlen < 32) {
|
|
goto tail;
|
|
} else {
|
|
continue;
|
|
}
|
|
}
|
|
|
|
const __mmask32 is12byte =
|
|
_mm512_cmp_epu16_mask(in, _mm512_set1_epi16(0x0800), _MM_CMPINT_LT);
|
|
|
|
if (_ktestc_mask32_u8(is12byte, inmask)) {
|
|
// fast path for 1 and 2 byte only
|
|
|
|
const __m512i twobytes = _mm512_ternarylogic_epi32(
|
|
_mm512_slli_epi16(in, 8), _mm512_srli_epi16(in, 6),
|
|
_mm512_set1_epi16(0x3f3f), 0xa8); // (A|B)&C
|
|
in = _mm512_mask_add_epi16(in, is234byte, twobytes,
|
|
_mm512_set1_epi16(int16_t(0x80c0)));
|
|
const __m512i cmpmask =
|
|
_mm512_mask_blend_epi16(inmask, _mm512_set1_epi16(int16_t(0xffff)),
|
|
_mm512_set1_epi16(0x0800));
|
|
const __mmask64 smoosh = _mm512_cmp_epu8_mask(in, cmpmask, _MM_CMPINT_NLT);
|
|
const __m512i out = _mm512_maskz_compress_epi8(smoosh, in);
|
|
_mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(_cvtmask64_u64(smoosh), _cvtmask64_u64(smoosh))),
|
|
out);
|
|
outbuf += 31 + _mm_popcnt_u32(_cvtmask32_u32(is234byte));
|
|
carry = 0;
|
|
|
|
if (inlen < 32) {
|
|
goto tail;
|
|
} else {
|
|
continue;
|
|
}
|
|
}
|
|
__m512i lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
|
|
__m512i hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1));
|
|
|
|
|
|
__m512i taglo = _mm512_set1_epi32(0x8080e000);
|
|
__m512i taghi = taglo;
|
|
|
|
const __m512i fc00masked = _mm512_and_epi32(in, _mm512_set1_epi16(int16_t(0xfc00)));
|
|
const __mmask32 hisurr = _mm512_mask_cmp_epu16_mask(
|
|
inmask, fc00masked, _mm512_set1_epi16(int16_t(0xd800)), _MM_CMPINT_EQ);
|
|
const __mmask32 losurr = _mm512_cmp_epu16_mask(
|
|
fc00masked, _mm512_set1_epi16(int16_t(0xdc00)), _MM_CMPINT_EQ);
|
|
|
|
int carryout = 0;
|
|
if (!_kortestz_mask32_u8(hisurr, losurr)) {
|
|
// handle surrogates
|
|
|
|
__m512i los = _mm512_alignr_epi32(hi, lo, 1);
|
|
__m512i his = _mm512_alignr_epi32(lo, hi, 1);
|
|
|
|
const __mmask32 hisurrhi = _kshiftri_mask32(hisurr, 16);
|
|
taglo =
|
|
_mm512_mask_mov_epi32(taglo,__mmask16(hisurr), _mm512_set1_epi32(0x808080f0));
|
|
taghi =
|
|
_mm512_mask_mov_epi32(taghi, __mmask16(hisurrhi), _mm512_set1_epi32(0x808080f0));
|
|
|
|
lo = _mm512_mask_slli_epi32(lo, __mmask16(hisurr), lo, 10);
|
|
hi = _mm512_mask_slli_epi32(hi, __mmask16(hisurrhi), hi, 10);
|
|
los = _mm512_add_epi32(los, _mm512_set1_epi32(0xfca02400));
|
|
his = _mm512_add_epi32(his, _mm512_set1_epi32(0xfca02400));
|
|
lo = _mm512_mask_add_epi32(lo, __mmask16(hisurr), lo, los);
|
|
hi = _mm512_mask_add_epi32(hi, __mmask16(hisurrhi), hi, his);
|
|
|
|
carryout = _cvtu32_mask32(_kshiftri_mask32(hisurr, 30));
|
|
|
|
const uint32_t h = _cvtmask32_u32(hisurr);
|
|
const uint32_t l = _cvtmask32_u32(losurr);
|
|
// check for mismatched surrogates
|
|
if ((h + h + carry) ^ l) {
|
|
const uint32_t lonohi = l & ~(h + h + carry);
|
|
const uint32_t hinolo = h & ~(l >> 1);
|
|
inlen = _tzcnt_u32(hinolo | lonohi);
|
|
inmask = __mmask32(0x7fffffff & ((1 << inlen) - 1));
|
|
in = _mm512_maskz_mov_epi16(inmask, in);
|
|
adjust = (int)inlen - 31;
|
|
inlen = 0;
|
|
goto failiteration;
|
|
}
|
|
}
|
|
|
|
hi = _mm512_maskz_mov_epi32(_cvtu32_mask16(0x7fff),hi);
|
|
carry = carryout;
|
|
|
|
__m512i mslo =
|
|
_mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), lo);
|
|
|
|
__m512i mshi =
|
|
_mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), hi);
|
|
|
|
const __mmask32 outmask = __mmask32(_kandn_mask64(losurr, inmask));
|
|
const __mmask64 outmhi = _kshiftri_mask64(outmask, 16);
|
|
|
|
const __mmask32 is1byte = __mmask32(_knot_mask64(is234byte));
|
|
const __mmask64 is1bhi = _kshiftri_mask64(is1byte, 16);
|
|
const __mmask64 is12bhi = _kshiftri_mask64(is12byte, 16);
|
|
|
|
taglo =
|
|
_mm512_mask_mov_epi32(taglo, __mmask16(is12byte), _mm512_set1_epi32(0x80c00000));
|
|
taghi =
|
|
_mm512_mask_mov_epi32(taghi, __mmask16(is12bhi), _mm512_set1_epi32(0x80c00000));
|
|
__m512i magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff),
|
|
_mm512_set1_epi32(0x00010101));
|
|
__m512i magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff),
|
|
_mm512_set1_epi32(0x00010101));
|
|
|
|
|
|
magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff),
|
|
_mm512_set1_epi32(0x00010101));
|
|
magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff),
|
|
_mm512_set1_epi32(0x00010101));
|
|
|
|
mslo = _mm512_ternarylogic_epi32(mslo, _mm512_set1_epi32(0x3f3f3f3f), taglo,
|
|
0xea); // A&B|C
|
|
mshi = _mm512_ternarylogic_epi32(mshi, _mm512_set1_epi32(0x3f3f3f3f), taghi,
|
|
0xea);
|
|
mslo = _mm512_mask_slli_epi32(mslo, __mmask16(is1byte), lo, 24);
|
|
|
|
mshi = _mm512_mask_slli_epi32(mshi, __mmask16(is1bhi), hi, 24);
|
|
|
|
const __mmask64 wantlo = _mm512_cmp_epu8_mask(mslo, magiclo, _MM_CMPINT_NLT);
|
|
const __mmask64 wanthi = _mm512_cmp_epu8_mask(mshi, magichi, _MM_CMPINT_NLT);
|
|
const __m512i outlo = _mm512_maskz_compress_epi8(wantlo, mslo);
|
|
const __m512i outhi = _mm512_maskz_compress_epi8(wanthi, mshi);
|
|
const uint64_t wantlo_uint64 = _cvtmask64_u64(wantlo);
|
|
const uint64_t wanthi_uint64 = _cvtmask64_u64(wanthi);
|
|
|
|
uint64_t advlo = _mm_popcnt_u64(wantlo_uint64);
|
|
uint64_t advhi = _mm_popcnt_u64(wanthi_uint64);
|
|
|
|
_mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(wantlo_uint64, wantlo_uint64)), outlo);
|
|
_mm512_mask_storeu_epi8(outbuf + advlo, _cvtu64_mask64(_pext_u64(wanthi_uint64, wanthi_uint64)), outhi);
|
|
outbuf += advlo + advhi;
|
|
}
|
|
outbuf -= adjust;
|
|
|
|
tail:
|
|
if (inlen != 0) {
|
|
// We must have inlen < 31.
|
|
inmask = _cvtu32_mask32((1 << inlen) - 1);
|
|
in = _mm512_maskz_loadu_epi16(inmask, inbuf);
|
|
if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); }
|
|
adjust = inlen - 31;
|
|
inlen = 0;
|
|
goto lastiteration;
|
|
}
|
|
*outlen = (outbuf - outbuf_orig) + adjust;
|
|
return ((inbuf - inbuf_orig) + adjust);
|
|
}
|
|
/* end file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */
|
|
|
|
} // namespace
|
|
} // namespace icelake
|
|
} // namespace simdutf
|
|
|
|
namespace simdutf {
|
|
namespace icelake {
|
|
|
|
|
|
/**
 * Guesses which encodings `input` could be.
 *
 * A byte-order mark, when present, decides the answer. Otherwise:
 *  - odd length: UTF-8 or unspecified;
 *  - even length: the input is scanned in 64-byte blocks. The first block
 *    containing a 16-bit surrogate settles the answer immediately (UTF-16LE,
 *    UTF-32LE or unspecified). If no block contains one, the result is the
 *    bitwise OR of every encoding_type the whole input validates as.
 *
 * @param input  bytes to inspect
 * @param length number of bytes
 * @return an encoding_type value or an OR-combination of them
 */
simdutf_warn_unused int
implementation::detect_encodings(const char *input,
                                 size_t length) const noexcept {
  // If there is a BOM, then we trust it.
  auto bom_encoding = simdutf::BOM::check_bom(input, length);
  if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }

  if (length % 2 != 0) {
    // An odd number of bytes can only be UTF-8.
    if (implementation::validate_utf8(input, length)) {
      return simdutf::encoding_type::UTF8;
    }
    return simdutf::encoding_type::unspecified;
  }

  const char *buf = input;
  const char *start = buf;
  const char *end = input + length;

  int out = 0;

  avx512_utf8_checker checker{};
  __m512i currentmax = _mm512_setzero_si512();
  while (buf + 64 <= end) {
    __m512i in = _mm512_loadu_si512((__m512i *)buf);
    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
    __mmask32 surrogates =
        _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
    if (surrogates) {
      // A surrogate rules out UTF-8; every path below returns.

      // Can still be either UTF-16LE or UTF-32 depending on the positions
      // of the surrogates. To be valid UTF-32, a surrogate cannot be in the
      // two most significant bytes of any 32-bit word. On the other hand, to
      // be valid UTF-16LE, at least one surrogate must be in the two most
      // significant bytes of a 32-bit word since they always come in pairs in
      // UTF-16LE. Note that we always proceed in multiple of 4 before this
      // point so there is no offset in 32-bit words.

      if ((surrogates & 0xaaaaaaaa) != 0) {
        // Not UTF-32: decide between UTF-16LE and unspecified.
        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(
            diff, _mm512_set1_epi16(uint16_t(0x0400)));
        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
        // high must be followed by low
        if ((highsurrogates << 1) != lowsurrogates) {
          return simdutf::encoding_type::unspecified;
        }

        bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
        if (ends_with_high) {
          buf +=
              31 *
              sizeof(char16_t); // advance only by 31 words so that we start
                                // with the high surrogate on the next round.
        } else {
          buf += 32 * sizeof(char16_t);
        }
        if (validate_utf16le(reinterpret_cast<const char16_t *>(buf),
                             (end - buf) / sizeof(char16_t))) {
          return simdutf::encoding_type::UTF16_LE;
        }
        return simdutf::encoding_type::unspecified;
      }

      // Not UTF-16: check for UTF-32 from the current block onward.
      if (length % 4 == 0) {
        const char32_t *input32 = reinterpret_cast<const char32_t *>(buf);
        const char32_t *end32 =
            reinterpret_cast<const char32_t *>(start) + length / 4;
        if (validate_utf32(input32, end32 - input32)) {
          return simdutf::encoding_type::UTF32_LE;
        }
      }
      return simdutf::encoding_type::unspecified;
    }
    // If no surrogate, validate under other encodings as well

    // UTF-32 validation
    currentmax = _mm512_max_epu32(in, currentmax);

    // UTF-8 validation
    checker.check_next_input(in);

    buf += 64;
  }

  // No full block contained a surrogate; fewer than 64 bytes remain at buf.
  // Check which encodings are possible.
  const size_t current_length = static_cast<size_t>(buf - start);

  // UTF-8: feed the remaining bytes, then close the stream.
  if (current_length != length) {
    const __m512i utf8 = _mm512_maskz_loadu_epi8(
        (1ULL << (length - current_length)) - 1, (const __m512i *)buf);
    checker.check_next_input(utf8);
  }
  checker.check_eof();
  if (!checker.errors()) {
    out |= simdutf::encoding_type::UTF8;
  }

  // UTF-16LE: only the remaining tail still needs checking.
  if (scalar::utf16::validate<endianness::LITTLE>(
          reinterpret_cast<const char16_t *>(buf),
          (length - current_length) / 2)) {
    out |= simdutf::encoding_type::UTF16_LE;
  }

  // UTF-32LE: fold the tail into the running maximum and range-check it.
  // NOTE(review): this tail is only checked against 0x10ffff, not against
  // the surrogate range - confirm whether that is intended.
  if (length % 4 == 0) {
    currentmax = _mm512_max_epu32(
        _mm512_maskz_loadu_epi8(
            (1ULL << (length - current_length)) - 1,
            (const __m512i *)buf),
        currentmax);
    __mmask16 outside_range = _mm512_cmp_epu32_mask(currentmax, _mm512_set1_epi32(0x10ffff),
                                                    _MM_CMPINT_GT);
    if (outside_range == 0) {
      out |= simdutf::encoding_type::UTF32_LE;
    }
  }

  return out;
}
|
|
|
|
/**
 * Returns true when the `len` bytes at `buf` pass the avx512_utf8_checker.
 * Full 64-byte blocks are fed first; the remaining 0..63 bytes are always
 * fed through a masked load (an empty mask when nothing is left) before
 * the end-of-input check.
 */
simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  avx512_utf8_checker checker{};
  const char* cursor = buf;
  const char* const stop = buf + len;

  while (cursor + 64 <= stop) {
    const __m512i block = _mm512_loadu_si512((const __m512i*)cursor);
    checker.check_next_input(block);
    cursor += 64;
  }

  const __m512i last_block = _mm512_maskz_loadu_epi8((1ULL<<(stop - cursor))-1, (const __m512i*)cursor);
  checker.check_next_input(last_block);
  checker.check_eof();

  const bool failed = checker.errors();
  return !failed;
}
|
|
|
|
/**
 * UTF-8 validation that also locates the first error.
 *
 * The vector checker only says that *some* error exists up to the current
 * block; the exact code and position come from the scalar
 * rewind_and_validate_with_errors, restarted one byte before the current
 * block (the error may have started in the previous block).
 *
 * @return result(SUCCESS, len) for valid input, otherwise the scalar
 *         routine's result with its count offset by the restart position.
 */
simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
  avx512_utf8_checker checker{};
  const char* cursor = buf;
  const char* const stop = buf + len;
  size_t count{0}; // bytes consumed by full blocks so far

  while (cursor + 64 <= stop) {
    const __m512i block = _mm512_loadu_si512((const __m512i*)cursor);
    checker.check_next_input(block);
    if (checker.errors()) {
      if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
      result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(buf), reinterpret_cast<const char*>(buf + count), len - count);
      res.count += count;
      return res;
    }
    cursor += 64;
    count += 64;
  }

  // Remaining 0..63 bytes.
  const __m512i last_block = _mm512_maskz_loadu_epi8((1ULL<<(stop - cursor))-1, (const __m512i*)cursor);
  checker.check_next_input(last_block);
  if (!checker.errors()) {
    return result(error_code::SUCCESS, len);
  }

  if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
  result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(buf), reinterpret_cast<const char*>(buf + count), len - count);
  res.count += count;
  return res;
}
|
|
|
|
/** ASCII validation entry point; defers to the icelake kernel of the same name. */
simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
  const bool all_ascii = icelake::validate_ascii(buf, len);
  return all_ascii;
}
|
|
|
|
/**
 * ASCII validation that reports the first offending byte.
 *
 * @return result(TOO_LARGE, index) for the first byte >= 0x80, or
 *         result(SUCCESS, len) when every byte is ASCII.
 */
simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
  const char* const origin = buf;
  const char* const stop = buf + len;
  const __m512i high_bit = _mm512_set1_epi8((uint8_t)0x80);

  while (buf + 64 <= stop) {
    const __m512i block = _mm512_loadu_si512((const __m512i*)buf);
    const __mmask64 offenders = _mm512_cmp_epu8_mask(block, high_bit, _MM_CMPINT_NLT);
    if (offenders != 0) {
      return result(error_code::TOO_LARGE, buf - origin + _tzcnt_u64(offenders));
    }
    buf += 64;
  }

  // Remaining 0..63 bytes, read through a masked load.
  const __m512i block = _mm512_maskz_loadu_epi8((1ULL<<(stop - buf))-1, (const __m512i*)buf);
  const __mmask64 offenders = _mm512_cmp_epu8_mask(block, high_bit, _MM_CMPINT_NLT);
  if (offenders != 0) {
    return result(error_code::TOO_LARGE, buf - origin + _tzcnt_u64(offenders));
  }

  return result(error_code::SUCCESS, len);
}
|
|
|
|
/**
 * Validates little-endian UTF-16: every high surrogate (0xD800..0xDBFF) must
 * be immediately followed by a low surrogate (0xDC00..0xDFFF) and every low
 * surrogate must be preceded by a high one.
 *
 * @param buf UTF-16LE input
 * @param len number of char16_t in buf
 * @return true when the surrogate pairing is consistent
 */
simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
  const char16_t *end = buf + len;

  for(;buf + 32 <= end; ) {
    __m512i in = _mm512_loadu_si512((__m512i*)buf);
    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
    __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
    if(surrogates) {
      __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
      __mmask32 lowsurrogates = surrogates ^ highsurrogates;
      // high must be followed by low
      if ((highsurrogates << 1) != lowsurrogates) {
         return false;
      }
      bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
      if(ends_with_high) {
        buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
      } else {
        buf += 32;
      }
    } else {
      buf += 32;
    }
  }
  if(buf < end) {
    // 1..31 code units remain. The lane mask is built with an unsigned
    // shift: (1 << 31) - 1 would overflow a signed int when 31 remain.
    const __mmask32 tail_mask = __mmask32((uint32_t(1) << (end - buf)) - 1);
    __m512i in = _mm512_maskz_loadu_epi16(tail_mask,(__m512i*)buf);
    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
    __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
    if(surrogates) {
      __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
      __mmask32 lowsurrogates = surrogates ^ highsurrogates;
      // high must be followed by low
      if ((highsurrogates << 1) != lowsurrogates) {
         return false;
      }
    }
  }
  return true;
}
|
|
|
|
/**
 * Validates big-endian UTF-16. Each loaded block is byte-swapped first, then
 * checked like the little-endian variant: every high surrogate must be
 * immediately followed by a low surrogate, and vice versa.
 *
 * @param buf UTF-16BE input
 * @param len number of char16_t in buf
 * @return true when the surrogate pairing is consistent
 */
simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
  const char16_t *end = buf + len;
  // Shuffle pattern that swaps the two bytes of every 16-bit lane.
  const __m512i byteflip = _mm512_setr_epi64(
            0x0607040502030001,
            0x0e0f0c0d0a0b0809,
            0x0607040502030001,
            0x0e0f0c0d0a0b0809,
            0x0607040502030001,
            0x0e0f0c0d0a0b0809,
            0x0607040502030001,
            0x0e0f0c0d0a0b0809
        );
  for(;buf + 32 <= end; ) {
    __m512i in = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)buf), byteflip);
    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
    __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
    if(surrogates) {
      __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
      __mmask32 lowsurrogates = surrogates ^ highsurrogates;
      // high must be followed by low
      if ((highsurrogates << 1) != lowsurrogates) {
         return false;
      }
      bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
      if(ends_with_high) {
        buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
      } else {
        buf += 32;
      }
    } else {
      buf += 32;
    }
  }
  if(buf < end) {
    // 1..31 code units remain. The lane mask is built with an unsigned
    // shift: (1 << 31) - 1 would overflow a signed int when 31 remain.
    const __mmask32 tail_mask = __mmask32((uint32_t(1) << (end - buf)) - 1);
    __m512i in = _mm512_shuffle_epi8(_mm512_maskz_loadu_epi16(tail_mask,(__m512i*)buf), byteflip);
    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
    __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
    if(surrogates) {
      __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
      __mmask32 lowsurrogates = surrogates ^ highsurrogates;
      // high must be followed by low
      if ((highsurrogates << 1) != lowsurrogates) {
         return false;
      }
    }
  }
  return true;
}
|
|
|
|
/**
 * Validates little-endian UTF-16 and locates the first unpaired surrogate.
 *
 * @param buf UTF-16LE input
 * @param len number of char16_t in buf
 * @return result(SURROGATE, index) for the first low surrogate lacking a
 *         preceding high surrogate or high surrogate lacking a following low
 *         one (whichever comes first in the failing block), otherwise
 *         result(SUCCESS, len).
 */
simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
  const char16_t *start_buf = buf;
  const char16_t *end = buf + len;
  for(;buf + 32 <= end; ) {
    __m512i in = _mm512_loadu_si512((__m512i*)buf);
    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
    __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
    if(surrogates) {
      __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
      __mmask32 lowsurrogates = surrogates ^ highsurrogates;
      // high must be followed by low
      if ((highsurrogates << 1) != lowsurrogates) {
        uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1));
        uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1));
        return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
      }
      bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
      if(ends_with_high) {
        buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
      } else {
        buf += 32;
      }
    } else {
      buf += 32;
    }
  }
  if(buf < end) {
    // 1..31 code units remain. The lane mask is built with an unsigned
    // shift: (1 << 31) - 1 would overflow a signed int when 31 remain.
    const __mmask32 tail_mask = __mmask32((uint32_t(1) << (end - buf)) - 1);
    __m512i in = _mm512_maskz_loadu_epi16(tail_mask,(__m512i*)buf);
    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
    __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
    if(surrogates) {
      __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
      __mmask32 lowsurrogates = surrogates ^ highsurrogates;
      // high must be followed by low
      if ((highsurrogates << 1) != lowsurrogates) {
        uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1));
        uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1));
        return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
      }
    }
  }
  return result(error_code::SUCCESS, len);
}
|
|
|
|
/**
 * Validates big-endian UTF-16 and locates the first unpaired surrogate.
 * Each loaded block is byte-swapped before the surrogate checks.
 *
 * @param buf UTF-16BE input
 * @param len number of char16_t in buf
 * @return result(SURROGATE, index) for the first low surrogate lacking a
 *         preceding high surrogate or high surrogate lacking a following low
 *         one (whichever comes first in the failing block), otherwise
 *         result(SUCCESS, len).
 */
simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
  const char16_t *start_buf = buf;
  const char16_t *end = buf + len;
  // Shuffle pattern that swaps the two bytes of every 16-bit lane.
  const __m512i byteflip = _mm512_setr_epi64(
            0x0607040502030001,
            0x0e0f0c0d0a0b0809,
            0x0607040502030001,
            0x0e0f0c0d0a0b0809,
            0x0607040502030001,
            0x0e0f0c0d0a0b0809,
            0x0607040502030001,
            0x0e0f0c0d0a0b0809
        );
  for(;buf + 32 <= end; ) {
    __m512i in = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)buf), byteflip);
    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
    __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
    if(surrogates) {
      __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
      __mmask32 lowsurrogates = surrogates ^ highsurrogates;
      // high must be followed by low
      if ((highsurrogates << 1) != lowsurrogates) {
        uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1));
        uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1));
        return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
      }
      bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
      if(ends_with_high) {
        buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
      } else {
        buf += 32;
      }
    } else {
      buf += 32;
    }
  }
  if(buf < end) {
    // 1..31 code units remain. The lane mask is built with an unsigned
    // shift: (1 << 31) - 1 would overflow a signed int when 31 remain.
    const __mmask32 tail_mask = __mmask32((uint32_t(1) << (end - buf)) - 1);
    __m512i in = _mm512_shuffle_epi8(_mm512_maskz_loadu_epi16(tail_mask,(__m512i*)buf), byteflip);
    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
    __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
    if(surrogates) {
      __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
      __mmask32 lowsurrogates = surrogates ^ highsurrogates;
      // high must be followed by low
      if ((highsurrogates << 1) != lowsurrogates) {
        uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1));
        uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1));
        return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
      }
    }
  }
  return result(error_code::SUCCESS, len);
}
|
|
|
|
/**
 * Validates UTF-32: the icelake kernel checks the leading full blocks and
 * the scalar validator checks whatever it left over.
 *
 * @param buf UTF-32 input (may be null when len is 0)
 * @param len number of char32_t in buf
 * @return true when the input is valid; empty input is valid.
 */
simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
  // Empty input is trivially valid. Handling it here keeps a null `buf`
  // away from the kernel, whose nullptr return value means "invalid".
  if (len == 0) {
    return true;
  }
  const char32_t * tail = icelake::validate_utf32(buf, len);
  if (tail == nullptr) {
    return false;
  }
  return scalar::utf32::validate(tail, len - (tail - buf));
}
|
|
|
|
/**
 * Validates UTF-32 and locates the first invalid code point.
 *
 * A word is invalid when it exceeds 0x10FFFF (TOO_LARGE) or lies in
 * 0xD800..0xDFFF (SURROGATE). Adding 0xffff2000 maps exactly the surrogate
 * range onto values above 0xfffff7ff, so one unsigned compare detects it.
 *
 * @param buf UTF-32 input (may be null when len is 0)
 * @param len number of char32_t in buf
 * @return result(code, index) for the lowest-index invalid word, or
 *         result(SUCCESS, len).
 */
simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
  const char32_t* const buf_orig = buf;
  const char32_t* const buf_end = buf + len;

  // Count-based loop: never compares against a null sentinel and never
  // loads unless 16 words are really available (safe for null/empty input).
  while (size_t(buf_end - buf) >= 16) {
    __m512i utf32 = _mm512_loadu_si512((const __m512i*)buf);
    __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff),
                                                    _MM_CMPINT_GT);
    __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
    __mmask16 surrogate_range = _mm512_cmp_epu32_mask(utf32_off, _mm512_set1_epi32(0xfffff7ff),
                                                      _MM_CMPINT_GT);
    // The two masks are disjoint; report whichever error comes first so
    // that the position is the first invalid word of the block.
    const uint32_t invalid = uint32_t(outside_range) | uint32_t(surrogate_range);
    if (invalid) {
      const uint32_t pos = _tzcnt_u32(invalid);
      const bool too_large = ((uint32_t(outside_range) >> pos) & 1) != 0;
      return result(too_large ? error_code::TOO_LARGE : error_code::SURROGATE, buf - buf_orig + pos);
    }
    buf += 16;
  }

  if(buf < buf_end) {
    // 1..15 words remain; lanes beyond them are loaded as zero (valid).
    __m512i utf32 = _mm512_maskz_loadu_epi32(__mmask16((1<<(buf_end - buf))-1),(const __m512i*)buf);
    __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff),
                                                    _MM_CMPINT_GT);
    __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
    __mmask16 surrogate_range = _mm512_cmp_epu32_mask(utf32_off, _mm512_set1_epi32(0xfffff7ff),
                                                      _MM_CMPINT_GT);
    const uint32_t invalid = uint32_t(outside_range) | uint32_t(surrogate_range);
    if (invalid) {
      const uint32_t pos = _tzcnt_u32(invalid);
      const bool too_large = ((uint32_t(outside_range) >> pos) & 1) != 0;
      return result(too_large ? error_code::TOO_LARGE : error_code::SURROGATE, buf - buf_orig + pos);
    }
  }

  return result(error_code::SUCCESS, len);
}
|
|
|
|
/**
 * UTF-8 to UTF-16LE conversion with validation.
 *
 * @return number of char16_t written, or 0 when the converter reports
 *         failure through a null output pointer.
 */
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
  const utf8_to_utf16_result converted = fast_avx512_convert_utf8_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
  const bool failed = (converted.second == nullptr);
  if (failed) {
    return 0;
  }
  const auto written = converted.second - utf16_output;
  return written;
}
|
|
|
|
/**
 * UTF-8 to UTF-16BE conversion with validation.
 *
 * @return number of char16_t written, or 0 when the converter reports
 *         failure through a null output pointer.
 */
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
  const utf8_to_utf16_result converted = fast_avx512_convert_utf8_to_utf16<endianness::BIG>(buf, len, utf16_output);
  const bool failed = (converted.second == nullptr);
  if (failed) {
    return 0;
  }
  const auto written = converted.second - utf16_output;
  return written;
}
|
|
|
|
/** UTF-8 to UTF-16LE conversion with error reporting; forwards to the AVX-512 converter. */
simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
  const result outcome = fast_avx512_convert_utf8_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
  return outcome;
}
|
|
|
|
/** UTF-8 to UTF-16BE conversion with error reporting; forwards to the AVX-512 converter. */
simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
  const result outcome = fast_avx512_convert_utf8_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
  return outcome;
}
|
|
|
|
/**
 * Converts UTF-8 that is already known to be valid into UTF-16LE.
 * The vector kernel handles the bulk; the scalar converter finishes
 * whatever input it left.
 *
 * @return number of char16_t written, or 0 if the scalar stage reports 0.
 */
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
  utf8_to_utf16_result ret = icelake::valid_utf8_to_fixed_length<endianness::LITTLE, char16_t>(buf, len, utf16_output);
  const size_t vector_units = ret.second - utf16_output;
  const char* const stop = buf + len;

  // Note: the AVX512 procedure looks up 4 bytes forward and correctly
  // converts multi-byte chars even if their continuation bytes lie
  // outside its window. That means any continuation bytes at the
  // beginning of the remainder were already consumed and must be skipped.
  auto resume = ret.first;
  while (resume != stop && ((uint8_t(*resume) & 0xc0) == 0x80)) {
    resume += 1;
  }

  if (resume == stop) {
    return vector_units;
  }

  const size_t scalar_units = scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(
      resume, len - (resume - buf), ret.second);
  if (scalar_units == 0) { return 0; }
  return vector_units + scalar_units;
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
  // Vectorized pass; the returned pair holds the input position reached and
  // the output position reached.
  const utf8_to_utf16_result simd_part = icelake::valid_utf8_to_fixed_length<endianness::BIG, char16_t>(buf, len, utf16_output);
  const char* cursor = simd_part.first;
  const auto out_cursor = simd_part.second;
  const char* const input_end = buf + len;
  const size_t simd_units = size_t(out_cursor - utf16_output);

  // Note: the AVX512 procedure looks up 4 bytes forward, and correctly
  // converts multi-byte chars even if their continuation bytes lie outside
  // the 16-byte window. It means we have to skip continuation bytes (10xxxxxx)
  // at the resume position, as they were already consumed.
  while (cursor != input_end && (uint8_t(*cursor) & 0xc0) == 0x80) {
    ++cursor;
  }
  if (cursor == input_end) {
    return simd_units;
  }

  // Scalar routine converts whatever is left; a zero from it makes the whole
  // call return zero.
  const size_t tail_units = scalar::utf8_to_utf16::convert_valid<endianness::BIG>(
      cursor, size_t(input_end - cursor), out_cursor);
  return (tail_units == 0) ? 0 : simd_units + tail_units;
}
|
|
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_out) const noexcept {
  // The kernel is instantiated for uint32_t output, so view the buffer as such.
  uint32_t* const out_begin = reinterpret_cast<uint32_t*>(utf32_out);
  const utf8_to_utf32_result simd_part = icelake::validating_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(buf, len, out_begin);
  // A null output pointer from the kernel is reported as 0.
  if (simd_part.second == nullptr) {
    return 0;
  }

  const size_t simd_units = size_t(simd_part.second - out_begin);
  const char* const input_end = buf + len;
  const char* cursor = simd_part.first;

  // Note: the AVX512 procedure looks up 4 bytes forward, and correctly
  // converts multi-byte chars even if their continuation bytes lie outside
  // the 16-byte window. It means we have to skip continuation bytes (10xxxxxx)
  // at the resume position, as they were already consumed.
  while (cursor != input_end && (uint8_t(*cursor) & 0xc0) == 0x80) {
    ++cursor;
  }
  if (cursor == input_end) {
    return simd_units;
  }

  // Scalar routine handles the remainder, writing right after the SIMD
  // output; a zero from it makes the whole call return zero.
  const size_t tail_units = scalar::utf8_to_utf32::convert(
      cursor, size_t(input_end - cursor), utf32_out + simd_units);
  return (tail_units == 0) ? 0 : simd_units + tail_units;
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32) const noexcept {
  uint32_t* const out_begin = reinterpret_cast<uint32_t*>(utf32);
  // Tuple layout as used below: <0> input position reached, <1> output
  // position reached, <2> flag that is false when the scalar error search
  // must be run.
  auto simd_ret = icelake::validating_utf8_to_fixed_length_with_constant_checks<endianness::LITTLE, uint32_t>(buf, len, out_begin);
  auto cursor = std::get<0>(simd_ret);

  if (!std::get<2>(simd_ret)) {
    // rewind_and_convert_with_errors will seek a potential error from cursor
    // onward, with the ability to go back up to cursor - buf bytes, and read
    // len - (cursor - buf) bytes forward.
    result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
        cursor - buf, cursor, len - (cursor - buf), reinterpret_cast<char32_t*>(std::get<1>(simd_ret)));
    // Rebase the count from the scalar start position onto buf.
    res.count += (cursor - buf);
    return res;
  }

  const size_t simd_units = std::get<1>(simd_ret) - out_begin;
  const char* const input_end = buf + len;

  // Note: the AVX512 procedure looks up 4 bytes forward, and correctly
  // converts multi-byte chars even if their continuation bytes lie outside
  // the 16-byte window. It means we have to skip continuation bytes (10xxxxxx)
  // at the resume position, as they were already consumed.
  while (cursor != input_end && (uint8_t(*cursor) & 0xc0) == 0x80) {
    cursor += 1;
  }
  if (cursor == input_end) {
    return {simdutf::SUCCESS, simd_units};
  }

  auto tail = scalar::utf8_to_utf32::convert_with_errors(
      cursor, len - (cursor - buf), reinterpret_cast<char32_t*>(out_begin) + simd_units);
  if (tail.error != simdutf::SUCCESS) {
    // On error the count is an input position: add the input consumed so far.
    tail.count += (cursor - buf);
  } else {
    // On success the count is an output length: add the SIMD-produced units.
    tail.count += simd_units;
  }
  return tail;
}
|
|
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_out) const noexcept {
  // The kernel is instantiated for uint32_t output, so view the buffer as such.
  uint32_t* const out_begin = reinterpret_cast<uint32_t*>(utf32_out);
  const utf8_to_utf32_result simd_part = icelake::valid_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(buf, len, out_begin);
  const size_t simd_units = size_t(simd_part.second - out_begin);
  const char* const input_end = buf + len;
  const char* cursor = simd_part.first;

  // Note: the AVX512 procedure looks up 4 bytes forward, and correctly
  // converts multi-byte chars even if their continuation bytes lie outside
  // the 16-byte window. It means we have to skip continuation bytes (10xxxxxx)
  // at the resume position, as they were already consumed.
  while (cursor != input_end && (uint8_t(*cursor) & 0xc0) == 0x80) {
    ++cursor;
  }
  if (cursor == input_end) {
    return simd_units;
  }

  // Scalar routine handles the remainder, writing right after the SIMD
  // output; a zero from it makes the whole call return zero.
  const size_t tail_units = scalar::utf8_to_utf32::convert_valid(
      cursor, size_t(input_end - cursor), utf32_out + simd_units);
  return (tail_units == 0) ? 0 : simd_units + tail_units;
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  // The kernel returns the number of input units it consumed and stores the
  // output length through the trailing pointer argument.
  size_t written;
  const size_t consumed = utf16_to_utf8_avx512i<endianness::LITTLE>(buf, len, (unsigned char*)utf8_output, &written);
  // Anything short of full consumption is reported as 0.
  return (consumed == len) ? written : 0;
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  // The kernel returns the number of input units it consumed and stores the
  // output length through the trailing pointer argument.
  size_t written;
  const size_t consumed = utf16_to_utf8_avx512i<endianness::BIG>(buf, len, (unsigned char*)utf8_output, &written);
  // Anything short of full consumption is reported as 0.
  return (consumed == len) ? written : 0;
}
|
|
|
|
// Converts little-endian UTF-16 to UTF-8 and returns a result object.
// If the AVX-512 kernel consumes all `len` units the result is
// {SUCCESS, number of UTF-8 bytes written}. Otherwise the scalar routine is
// run on the unconsumed input and its result is returned with `count`
// rebased onto `buf`.
simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  size_t outlen;
  const size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(buf, len, (unsigned char*)utf8_output, &outlen);
  if (inlen != len) {
    // The kernel stopped early. The unconsumed input is the `len - inlen`
    // units starting at buf + inlen. (The length must be measured in input
    // units; `outlen` counts output bytes and is unrelated to how much input
    // remains.) Output continues right after the bytes already written.
    result res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf + inlen, len - inlen, utf8_output + outlen);
    // The scalar count is relative to buf + inlen; make it relative to buf.
    res.count += inlen;
    return res;
  }
  return {simdutf::SUCCESS, outlen};
}
|
|
|
|
// Converts big-endian UTF-16 to UTF-8 and returns a result object.
// If the AVX-512 kernel consumes all `len` units the result is
// {SUCCESS, number of UTF-8 bytes written}. Otherwise the scalar routine is
// run on the unconsumed input and its result is returned with `count`
// rebased onto `buf`.
simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  size_t outlen;
  const size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(buf, len, (unsigned char*)utf8_output, &outlen);
  if (inlen != len) {
    // The kernel stopped early. The unconsumed input is the `len - inlen`
    // units starting at buf + inlen. (The length must be measured in input
    // units; `outlen` counts output bytes and is unrelated to how much input
    // remains.) Output continues right after the bytes already written.
    result res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf + inlen, len - inlen, utf8_output + outlen);
    // The scalar count is relative to buf + inlen; make it relative to buf.
    res.count += inlen;
    return res;
  }
  return {simdutf::SUCCESS, outlen};
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  // No dedicated "valid input" path here: reuse the general converter.
  const size_t written = convert_utf16le_to_utf8(buf, len, utf8_output);
  return written;
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  // No dedicated "valid input" path here: reuse the general converter.
  const size_t written = convert_utf16be_to_utf8(buf, len, utf8_output);
  return written;
}
|
|
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
  // Pair layout: first = input position reached, second = output position reached.
  const std::pair<const char32_t*, char*> simd_part = avx512_convert_utf32_to_utf8(buf, len, utf8_output);
  const char32_t* const next_in = simd_part.first;
  char* const next_out = simd_part.second;
  // A null input pointer from the kernel is reported as 0.
  if (next_in == nullptr) {
    return 0;
  }

  const size_t simd_bytes = size_t(next_out - utf8_output);
  if (next_in == buf + len) {
    return simd_bytes;
  }

  // Scalar routine converts the remaining code points; a zero from it makes
  // the whole call return zero.
  const size_t tail_bytes = scalar::utf32_to_utf8::convert(
      next_in, len - (next_in - buf), next_out);
  return (tail_bytes == 0) ? 0 : simd_bytes + tail_bytes;
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
  // The kernel's result.count is always the position in the input buffer, not
  // the number of bytes written, even if it finished.
  std::pair<result, char*> simd_part = icelake::avx512_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
  result status = simd_part.first;
  char* out_cursor = simd_part.second;
  const size_t consumed = status.count;

  if (consumed != len) {
    // Let the scalar routine process everything the kernel left behind.
    result tail = scalar::utf32_to_utf8::convert_with_errors(
        buf + consumed, len - consumed, out_cursor);
    if (tail.error) {
      // Rebase the scalar count onto buf.
      tail.count += consumed;
      return tail;
    }
    out_cursor += tail.count;
  }

  // Set count to the number of 8-bit words written.
  status.count = out_cursor - utf8_output;
  return status;
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
  // No dedicated "valid input" path here: reuse the general converter.
  const size_t written = convert_utf32_to_utf8(buf, len, utf8_output);
  return written;
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  // Pair layout: first = input position reached, second = output position reached.
  const std::pair<const char32_t*, char16_t*> simd_part = avx512_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
  const char32_t* const next_in = simd_part.first;
  char16_t* const next_out = simd_part.second;
  // A null input pointer from the kernel is reported as 0.
  if (next_in == nullptr) {
    return 0;
  }

  const size_t simd_units = size_t(next_out - utf16_output);
  if (next_in == buf + len) {
    return simd_units;
  }

  // Scalar routine converts the remaining code points; a zero from it makes
  // the whole call return zero.
  const size_t tail_units = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
      next_in, len - (next_in - buf), next_out);
  return (tail_units == 0) ? 0 : simd_units + tail_units;
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  // Pair layout: first = input position reached, second = output position reached.
  const std::pair<const char32_t*, char16_t*> simd_part = avx512_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
  const char32_t* const next_in = simd_part.first;
  char16_t* const next_out = simd_part.second;
  // A null input pointer from the kernel is reported as 0.
  if (next_in == nullptr) {
    return 0;
  }

  const size_t simd_units = size_t(next_out - utf16_output);
  if (next_in == buf + len) {
    return simd_units;
  }

  // Scalar routine converts the remaining code points; a zero from it makes
  // the whole call return zero.
  const size_t tail_units = scalar::utf32_to_utf16::convert<endianness::BIG>(
      next_in, len - (next_in - buf), next_out);
  return (tail_units == 0) ? 0 : simd_units + tail_units;
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  // The kernel's result.count is always the position in the input buffer, not
  // the number of words written, even if it finished.
  std::pair<result, char16_t*> simd_part = avx512_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
  result status = simd_part.first;
  char16_t* out_cursor = simd_part.second;
  const size_t consumed = status.count;

  if (consumed != len) {
    // Let the scalar routine process everything the kernel left behind.
    result tail = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
        buf + consumed, len - consumed, out_cursor);
    if (tail.error) {
      // Rebase the scalar count onto buf.
      tail.count += consumed;
      return tail;
    }
    out_cursor += tail.count;
  }

  // Set count to the number of 16-bit words written.
  status.count = out_cursor - utf16_output;
  return status;
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  // The kernel's result.count is always the position in the input buffer, not
  // the number of words written, even if it finished.
  std::pair<result, char16_t*> simd_part = avx512_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
  result status = simd_part.first;
  char16_t* out_cursor = simd_part.second;
  const size_t consumed = status.count;

  if (consumed != len) {
    // Let the scalar routine process everything the kernel left behind.
    result tail = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
        buf + consumed, len - consumed, out_cursor);
    if (tail.error) {
      // Rebase the scalar count onto buf.
      tail.count += consumed;
      return tail;
    }
    out_cursor += tail.count;
  }

  // Set count to the number of 16-bit words written.
  status.count = out_cursor - utf16_output;
  return status;
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  // No dedicated "valid input" path here: reuse the general converter.
  const size_t written = convert_utf32_to_utf16le(buf, len, utf16_output);
  return written;
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  // No dedicated "valid input" path here: reuse the general converter.
  const size_t written = convert_utf32_to_utf16be(buf, len, utf16_output);
  return written;
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  // Tuple layout: <0> input position reached, <1> output position reached,
  // <2> flag that is false when the kernel reports failure.
  const std::tuple<const char16_t*, char32_t*, bool> simd_ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
  if (!std::get<2>(simd_ret)) {
    return 0;
  }
  const char16_t* const next_in = std::get<0>(simd_ret);
  char32_t* const next_out = std::get<1>(simd_ret);

  const size_t simd_units = size_t(next_out - utf32_output);
  if (next_in == buf + len) {
    return simd_units;
  }

  // Scalar routine converts the remaining units; a zero from it makes the
  // whole call return zero.
  const size_t tail_units = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
      next_in, len - (next_in - buf), next_out);
  return (tail_units == 0) ? 0 : simd_units + tail_units;
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  // Tuple layout: <0> input position reached, <1> output position reached,
  // <2> flag that is false when the kernel reports failure.
  const std::tuple<const char16_t*, char32_t*, bool> simd_ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
  if (!std::get<2>(simd_ret)) {
    return 0;
  }
  const char16_t* const next_in = std::get<0>(simd_ret);
  char32_t* const next_out = std::get<1>(simd_ret);

  const size_t simd_units = size_t(next_out - utf32_output);
  if (next_in == buf + len) {
    return simd_units;
  }

  // Scalar routine converts the remaining units; a zero from it makes the
  // whole call return zero.
  const size_t tail_units = scalar::utf16_to_utf32::convert<endianness::BIG>(
      next_in, len - (next_in - buf), next_out);
  return (tail_units == 0) ? 0 : simd_units + tail_units;
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  // Tuple layout: <0> input position reached, <1> output position reached,
  // <2> flag that is false when the kernel reports failure.
  const std::tuple<const char16_t*, char32_t*, bool> simd_ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
  const char16_t* const next_in = std::get<0>(simd_ret);
  char32_t* const next_out = std::get<1>(simd_ret);
  const bool simd_ok = std::get<2>(simd_ret);
  const size_t consumed = size_t(next_in - buf);
  const size_t produced = size_t(next_out - utf32_output);

  // Kernel succeeded and nothing is left over: done.
  if (simd_ok && consumed == len) {
    return simdutf::result(simdutf::SUCCESS, produced);
  }

  // Either the kernel flagged a failure or it left a tail; in both cases the
  // scalar routine is run from where the kernel stopped.
  result tail = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
      next_in, len - consumed, next_out);
  if (!simd_ok || tail.error) {
    // Count is rebased by the input consumed so far.
    tail.count += consumed;
  } else {
    // Clean tail after a clean kernel run: count becomes the total output length.
    tail.count += produced;
  }
  return tail;
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  // Tuple layout: <0> input position reached, <1> output position reached,
  // <2> flag that is false when the kernel reports failure.
  const std::tuple<const char16_t*, char32_t*, bool> simd_ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
  const char16_t* const next_in = std::get<0>(simd_ret);
  char32_t* const next_out = std::get<1>(simd_ret);
  const bool simd_ok = std::get<2>(simd_ret);
  const size_t consumed = size_t(next_in - buf);
  const size_t produced = size_t(next_out - utf32_output);

  // Kernel succeeded and nothing is left over: done.
  if (simd_ok && consumed == len) {
    return simdutf::result(simdutf::SUCCESS, produced);
  }

  // Either the kernel flagged a failure or it left a tail; in both cases the
  // scalar routine is run from where the kernel stopped.
  result tail = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
      next_in, len - consumed, next_out);
  if (!simd_ok || tail.error) {
    // Count is rebased by the input consumed so far.
    tail.count += consumed;
  } else {
    // Clean tail after a clean kernel run: count becomes the total output length.
    tail.count += produced;
  }
  return tail;
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  // Tuple layout: <0> input position reached, <1> output position reached,
  // <2> flag that is false when the kernel reports failure.
  const std::tuple<const char16_t*, char32_t*, bool> simd_ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
  if (!std::get<2>(simd_ret)) {
    return 0;
  }
  const char16_t* const next_in = std::get<0>(simd_ret);
  char32_t* const next_out = std::get<1>(simd_ret);

  const size_t simd_units = size_t(next_out - utf32_output);
  if (next_in == buf + len) {
    return simd_units;
  }

  // Scalar routine converts the remaining units; a zero from it makes the
  // whole call return zero.
  const size_t tail_units = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
      next_in, len - (next_in - buf), next_out);
  return (tail_units == 0) ? 0 : simd_units + tail_units;
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  // Tuple layout: <0> input position reached, <1> output position reached,
  // <2> flag that is false when the kernel reports failure.
  const std::tuple<const char16_t*, char32_t*, bool> simd_ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
  if (!std::get<2>(simd_ret)) {
    return 0;
  }
  const char16_t* const next_in = std::get<0>(simd_ret);
  char32_t* const next_out = std::get<1>(simd_ret);

  const size_t simd_units = size_t(next_out - utf32_output);
  if (next_in == buf + len) {
    return simd_units;
  }

  // Scalar routine converts the remaining units; a zero from it makes the
  // whole call return zero.
  const size_t tail_units = scalar::utf16_to_utf32::convert<endianness::BIG>(
      next_in, len - (next_in - buf), next_out);
  return (tail_units == 0) ? 0 : simd_units + tail_units;
}
|
|
|
|
void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
|
|
size_t pos = 0;
|
|
const __m512i byteflip = _mm512_setr_epi64(
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809,
|
|
0x0607040502030001,
|
|
0x0e0f0c0d0a0b0809
|
|
);
|
|
while (pos + 32 <= length) {
|
|
__m512i utf16 = _mm512_loadu_si512((const __m512i*)(input + pos));
|
|
utf16 = _mm512_shuffle_epi8(utf16, byteflip);
|
|
_mm512_storeu_si512(output + pos, utf16);
|
|
pos += 32;
|
|
}
|
|
if(pos < length) {
|
|
__mmask32 m((1<< (length - pos))-1);
|
|
__m512i utf16 = _mm512_maskz_loadu_epi16(m, (const __m512i*)(input + pos));
|
|
utf16 = _mm512_shuffle_epi8(utf16, byteflip);
|
|
_mm512_mask_storeu_epi16(output + pos, m, utf16);
|
|
}
|
|
}
|
|
|
|
|
|
// Counts the 16-bit units of little-endian UTF-16 input that lie outside the
// range 0xDC00..0xDFFF (the second half of a surrogate pair), 32 units per
// AVX-512 iteration; the scalar routine counts the remaining tail.
simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
  const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
  const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);

  size_t count{0};
  size_t pos = 0;

  // Index-based bound: the loop is entered only when a full 32-unit vector is
  // available, so a null `input` with a short length never reaches the load
  // (a `ptr <= end` test with a null `end` would be true for a null `ptr`).
  for (; pos + 32 <= length; pos += 32) {
    const __m512i utf16 = _mm512_loadu_si512((const __m512i*)(input + pos));
    // One bit per unit that is above 0xDFFF or below 0xDC00.
    const uint64_t not_trailing_surrogate = static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low));
    count += count_ones(not_trailing_surrogate);
  }

  return count + scalar::utf16::count_code_points<endianness::LITTLE>(input + pos, length - pos);
}
|
|
|
|
// Counts the 16-bit units of big-endian UTF-16 input that, after byte
// swapping, lie outside the range 0xDC00..0xDFFF (the second half of a
// surrogate pair), 32 units per AVX-512 iteration; the scalar routine counts
// the remaining tail.
simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
  const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
  const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);

  size_t count{0};
  size_t pos = 0;
  // Shuffle control that exchanges the bytes of each 16-bit lane.
  const __m512i byteflip = _mm512_setr_epi64(
            0x0607040502030001,
            0x0e0f0c0d0a0b0809,
            0x0607040502030001,
            0x0e0f0c0d0a0b0809,
            0x0607040502030001,
            0x0e0f0c0d0a0b0809,
            0x0607040502030001,
            0x0e0f0c0d0a0b0809
        );

  // Index-based bound: the loop is entered only when a full 32-unit vector is
  // available, so a null `input` with a short length never reaches the load
  // (a `ptr <= end` test with a null `end` would be true for a null `ptr`).
  for (; pos + 32 <= length; pos += 32) {
    const __m512i utf16 = _mm512_shuffle_epi8(_mm512_loadu_si512((const __m512i*)(input + pos)), byteflip);
    // One bit per unit that is above 0xDFFF or below 0xDC00.
    const uint64_t not_trailing_surrogate = static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low));
    count += count_ones(not_trailing_surrogate);
  }

  return count + scalar::utf16::count_code_points<endianness::BIG>(input + pos, length - pos);
}
|
|
|
|
|
|
// Counts the bytes of `input` that are not UTF-8 continuation bytes
// (10xxxxxx), 64 bytes per AVX-512 iteration; the scalar routine counts the
// remaining tail.
simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
  // 0xBF is the largest continuation byte; as a signed value every
  // continuation byte (0x80..0xBF) compares <= it.
  const __m512i continuation = _mm512_set1_epi8(char(0b10111111));

  size_t count{0};
  size_t pos = 0;

  // Index-based bound: the loop is entered only when a full 64-byte vector is
  // available, so a null `input` with a short length never reaches the load
  // (a `ptr <= end` test with a null `end` would be true for a null `ptr`).
  for (; pos + 64 <= length; pos += 64) {
    const __m512i utf8 = _mm512_loadu_si512((const __m512i*)(input + pos));
    const uint64_t continuation_bitmask = static_cast<uint64_t>(_mm512_cmple_epi8_mask(utf8, continuation));
    count += 64 - count_ones(continuation_bitmask);
  }

  return count + scalar::utf8::count_code_points(input + pos, length - pos);
}
|
|
|
|
|
|
// Computes a UTF-8 byte length for little-endian UTF-16 input by classifying
// each 16-bit unit: <= 0x7F adds 1, <= 0x7FF adds 2, 0xD800..0xDFFF adds 2 per
// unit, everything else adds 3. Processes 32 units per AVX-512 iteration; the
// scalar routine handles the remaining tail.
simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
  const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f);
  const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff);
  const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff);
  const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);

  size_t count{0};
  size_t pos = 0;

  // Index-based bound: the loop is entered only when a full 32-unit vector is
  // available, so a null `input` with a short length never reaches the load
  // (a `ptr <= end` test with a null `end` would be true for a null `ptr`).
  for (; pos + 32 <= length; pos += 32) {
    const __m512i utf16 = _mm512_loadu_si512((const __m512i*)(input + pos));
    __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f);
    // Only lanes not already classified as ASCII are tested against 0x7FF.
    __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff);
    __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
    // Remaining lanes inside 0xD800..0xDFFF.
    __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800);

    size_t ascii_count = count_ones(ascii_bitmask);
    size_t two_bytes_count = count_ones(two_bytes_bitmask);
    size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
    size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count;

    count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 2*surrogate_bytes_count;
  }

  return count + scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input + pos, length - pos);
}
|
|
|
|
// Computes a UTF-8 byte length for big-endian UTF-16 input. Each vector is
// byte-swapped, then every 16-bit unit is classified: <= 0x7F adds 1,
// <= 0x7FF adds 2, 0xD800..0xDFFF adds 2 per unit, everything else adds 3.
// Processes 32 units per AVX-512 iteration; the scalar routine handles the
// remaining tail.
simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
  const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f);
  const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff);
  const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff);
  const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);

  size_t count{0};
  size_t pos = 0;
  // Shuffle control that exchanges the bytes of each 16-bit lane.
  const __m512i byteflip = _mm512_setr_epi64(
            0x0607040502030001,
            0x0e0f0c0d0a0b0809,
            0x0607040502030001,
            0x0e0f0c0d0a0b0809,
            0x0607040502030001,
            0x0e0f0c0d0a0b0809,
            0x0607040502030001,
            0x0e0f0c0d0a0b0809
        );

  // Index-based bound: the loop is entered only when a full 32-unit vector is
  // available, so a null `input` with a short length never reaches the load
  // (a `ptr <= end` test with a null `end` would be true for a null `ptr`).
  for (; pos + 32 <= length; pos += 32) {
    __m512i utf16 = _mm512_loadu_si512((const __m512i*)(input + pos));
    utf16 = _mm512_shuffle_epi8(utf16, byteflip);
    __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f);
    // Only lanes not already classified as ASCII are tested against 0x7FF.
    __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff);
    __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
    // Remaining lanes inside 0xD800..0xDFFF.
    __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800);

    size_t ascii_count = count_ones(ascii_bitmask);
    size_t two_bytes_count = count_ones(two_bytes_bitmask);
    size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
    size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count;
    count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 2*surrogate_bytes_count;
  }

  return count + scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input + pos, length - pos);
}
|
|
|
|
simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
  // The UTF-32 length is taken to be whatever count_utf16le reports.
  const size_t code_points = implementation::count_utf16le(input, length);
  return code_points;
}
|
|
|
|
simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
  // The UTF-32 length is taken to be whatever count_utf16be reports.
  const size_t code_points = implementation::count_utf16be(input, length);
  return code_points;
}
|
|
|
|
// Computes a UTF-16 unit count for UTF-8 input: one unit for every byte that
// is not a continuation byte (10xxxxxx), plus one extra unit for every byte
// >= 0xF0 (240). Processes 64 bytes per AVX-512 iteration; the scalar routine
// handles the remaining tail.
simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
  size_t pos = 0;
  size_t count = 0;
  // Hoisted loop invariants. Continuation bytes are 0x80..0xBF, i.e. signed
  // values <= -65; this is the same bound count_utf8 uses. (A bound of -64
  // would also classify the byte 0xC0 as a continuation byte.)
  const __m512i max_continuation = _mm512_set1_epi8(-65);
  const __m512i min_4byte_lead = _mm512_set1_epi8(int8_t(240));
  // This algorithm could no doubt be improved!
  for(;pos + 64 <= length; pos += 64) {
    __m512i utf8 = _mm512_loadu_si512((const __m512i*)(input+pos));
    uint64_t utf8_continuation_mask = _mm512_cmple_epi8_mask(utf8, max_continuation);
    // We count one word for anything that is not a continuation (so
    // leading bytes).
    count += 64 - count_ones(utf8_continuation_mask);
    // Bytes >= 0xF0 (unsigned compare) contribute one additional word.
    uint64_t utf8_4byte = _mm512_cmpge_epu8_mask(utf8, min_4byte_lead);
    count += count_ones(utf8_4byte);
  }
  return count + scalar::utf8::utf16_length_from_utf8(input + pos, length - pos);
}
|
|
|
|
// Computes a UTF-8 byte length for UTF-32 input by classifying each 32-bit
// value: <= 0x7F adds 1, <= 0x7FF adds 2, <= 0xFFFF adds 3, anything larger
// adds 4. Processes 16 values per AVX-512 iteration; the scalar routine
// handles the remaining tail.
simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
  const __m512i v_0000_007f = _mm512_set1_epi32((uint32_t)0x7f);
  const __m512i v_0000_07ff = _mm512_set1_epi32((uint32_t)0x7ff);
  const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);

  size_t count{0};
  size_t pos = 0;

  // Index-based bound: the loop is entered only when a full 16-value vector is
  // available, so a null `input` with a short length never reaches the load
  // (a `ptr <= end` test with a null `end` would be true for a null `ptr`).
  for (; pos + 16 <= length; pos += 16) {
    const __m512i utf32 = _mm512_loadu_si512((const __m512i*)(input + pos));
    __mmask16 ascii_bitmask = _mm512_cmple_epu32_mask(utf32, v_0000_007f);
    // Each later class is tested only on lanes not claimed by an earlier one.
    __mmask16 two_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(ascii_bitmask), utf32, v_0000_07ff);
    __mmask16 three_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(_mm512_kor(ascii_bitmask, two_bytes_bitmask)), utf32, v_0000_ffff);

    size_t ascii_count = count_ones(ascii_bitmask);
    size_t two_bytes_count = count_ones(two_bytes_bitmask);
    size_t three_bytes_count = count_ones(three_bytes_bitmask);
    size_t four_bytes_count = 16 - ascii_count - two_bytes_count - three_bytes_count;
    count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 4*four_bytes_count;
  }

  return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
}
|
|
|
|
// Computes a UTF-16 unit count for UTF-32 input: one unit per value, plus one
// extra for every value above 0xFFFF. Processes 16 values per AVX-512
// iteration; the scalar routine handles the remaining tail.
simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
  const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);

  size_t count{0};
  size_t pos = 0;

  // Index-based bound: the loop is entered only when a full 16-value vector is
  // available, so a null `input` with a short length never reaches the load
  // (a `ptr <= end` test with a null `end` would be true for a null `ptr`).
  for (; pos + 16 <= length; pos += 16) {
    const __m512i utf32 = _mm512_loadu_si512((const __m512i*)(input + pos));
    // Lanes holding values that do not fit in a single 16-bit unit.
    __mmask16 surrogates_bitmask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff);

    count += 16 + count_ones(surrogates_bitmask);
  }

  return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
}
|
|
|
|
simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
  // The UTF-32 length is taken to be whatever count_utf8 reports.
  const size_t code_points = implementation::count_utf8(input, length);
  return code_points;
}
|
|
|
|
} // namespace icelake
|
|
} // namespace simdutf
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/end.h
|
|
/* begin file src/simdutf/icelake/end.h */
|
|
#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
|
|
// nothing needed.
|
|
#else
|
|
SIMDUTF_UNTARGET_REGION
|
|
#endif
|
|
|
|
|
|
#if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
|
|
SIMDUTF_POP_DISABLE_WARNINGS
|
|
#endif // end of workaround
|
|
/* end file src/simdutf/icelake/end.h */
|
|
/* end file src/icelake/implementation.cpp */
|
|
#endif
|
|
#if SIMDUTF_IMPLEMENTATION_HASWELL
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/implementation.cpp
|
|
/* begin file src/haswell/implementation.cpp */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/begin.h
|
|
/* begin file src/simdutf/haswell/begin.h */
|
|
// redefining SIMDUTF_IMPLEMENTATION to "haswell"
|
|
// #define SIMDUTF_IMPLEMENTATION haswell
|
|
|
|
#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
|
|
// nothing needed.
|
|
#else
|
|
SIMDUTF_TARGET_HASWELL
|
|
#endif
|
|
|
|
#if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
|
|
SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
|
|
#endif // end of workaround
|
|
/* end file src/simdutf/haswell/begin.h */
|
|
namespace simdutf {
|
|
namespace haswell {
|
|
namespace {
|
|
#ifndef SIMDUTF_HASWELL_H
|
|
#error "haswell.h must be included"
|
|
#endif
|
|
using namespace simd;
|
|
|
|
|
|
simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
  // OR-reduce the 64-byte block to a single vector, then ask that vector
  // whether it is ASCII.
  const auto folded = input.reduce_or();
  return folded.is_ascii();
}
|
|
|
|
simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  // Saturating subtraction leaves a lane non-zero only when the byte exceeds
  // the subtracted threshold:
  //   prev1 - 0xBF > 0  only for 11______
  //   prev2 - 0xDF > 0  only for 111_____
  //   prev3 - 0xEF > 0  only for 1111____
  simd8<uint8_t> after_lead_1 = prev1.saturating_sub(0xc0u - 1);
  simd8<uint8_t> after_lead_2 = prev2.saturating_sub(0xe0u - 1);
  simd8<uint8_t> after_lead_3 = prev3.saturating_sub(0xf0u - 1);
  const auto any_lead = after_lead_1 | after_lead_2 | after_lead_3;
  // Caller requires a bool (all 1's). All values resulting from the
  // subtraction will be <= 64, so signed comparison is fine.
  return simd8<int8_t>(any_lead) > int8_t(0);
}
|
|
|
|
simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  // Saturating subtraction leaves a lane non-zero only when the byte exceeds
  // the subtracted threshold:
  //   prev2 - 0xDF > 0  only for 111_____
  //   prev3 - 0xEF > 0  only for 1111____
  simd8<uint8_t> after_lead_2 = prev2.saturating_sub(0xe0u - 1);
  simd8<uint8_t> after_lead_3 = prev3.saturating_sub(0xf0u - 1);
  const auto any_lead = after_lead_2 | after_lead_3;
  // Caller requires a bool (all 1's). All values resulting from the
  // subtraction will be <= 64, so signed comparison is fine.
  return simd8<int8_t>(any_lead) > int8_t(0);
}
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_detect_encodings.cpp
|
|
/* begin file src/haswell/avx2_detect_encodings.cpp */
|
|
template<class checker>
|
|
// len is known to be a multiple of 2 when this is called
|
|
int avx2_detect_encodings(const char * buf, size_t len) {
|
|
const char* start = buf;
|
|
const char* end = buf + len;
|
|
|
|
bool is_utf8 = true;
|
|
bool is_utf16 = true;
|
|
bool is_utf32 = true;
|
|
|
|
int out = 0;
|
|
|
|
const auto v_d8 = simd8<uint8_t>::splat(0xd8);
|
|
const auto v_f8 = simd8<uint8_t>::splat(0xf8);
|
|
|
|
__m256i currentmax = _mm256_setzero_si256();
|
|
|
|
checker check{};
|
|
|
|
while(buf + 64 <= end) {
|
|
__m256i in = _mm256_loadu_si256((__m256i*)buf);
|
|
__m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
|
|
|
|
const auto u0 = simd16<uint16_t>(in);
|
|
const auto u1 = simd16<uint16_t>(nextin);
|
|
|
|
const auto v0 = u0.shr<8>();
|
|
const auto v1 = u1.shr<8>();
|
|
|
|
const auto in16 = simd16<uint16_t>::pack(v0, v1);
|
|
|
|
const auto surrogates_wordmask0 = (in16 & v_f8) == v_d8;
|
|
uint32_t surrogates_bitmask0 = surrogates_wordmask0.to_bitmask();
|
|
|
|
// Check for surrogates
|
|
if (surrogates_bitmask0 != 0x0) {
|
|
// Cannot be UTF8
|
|
is_utf8 = false;
|
|
// Can still be either UTF-16LE or UTF-32 depending on the positions of the surrogates
|
|
// To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word.
|
|
// On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant
|
|
// bytes of a 32-bit word since they always come in pairs in UTF-16LE.
|
|
// Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit words.
|
|
|
|
if ((surrogates_bitmask0 & 0xaaaaaaaa) != 0) {
|
|
is_utf32 = false;
|
|
// Code from avx2_validate_utf16le.cpp
|
|
const char16_t * input = reinterpret_cast<const char16_t*>(buf);
|
|
const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len/2;
|
|
|
|
const auto v_fc = simd8<uint8_t>::splat(0xfc);
|
|
const auto v_dc = simd8<uint8_t>::splat(0xdc);
|
|
|
|
const uint32_t V0 = ~surrogates_bitmask0;
|
|
|
|
const auto vH0 = (in16 & v_fc) == v_dc;
|
|
const uint32_t H0 = vH0.to_bitmask();
|
|
|
|
const uint32_t L0 = ~H0 & surrogates_bitmask0;
|
|
|
|
const uint32_t a0 = L0 & (H0 >> 1);
|
|
const uint32_t b0 = a0 << 1;
|
|
const uint32_t c0 = V0 | a0 | b0;
|
|
|
|
if (c0 == 0xffffffff) {
|
|
input += simd16<uint16_t>::ELEMENTS * 2;
|
|
} else if (c0 == 0x7fffffff) {
|
|
input += simd16<uint16_t>::ELEMENTS * 2 - 1;
|
|
} else {
|
|
return simdutf::encoding_type::unspecified;
|
|
}
|
|
|
|
while (input + simd16<uint16_t>::ELEMENTS * 2 < end16) {
|
|
const auto in0 = simd16<uint16_t>(input);
|
|
const auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
|
|
|
|
const auto t0 = in0.shr<8>();
|
|
const auto t1 = in1.shr<8>();
|
|
|
|
const auto in_16 = simd16<uint16_t>::pack(t0, t1);
|
|
|
|
const auto surrogates_wordmask = (in_16 & v_f8) == v_d8;
|
|
const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
|
|
if (surrogates_bitmask == 0x0) {
|
|
input += simd16<uint16_t>::ELEMENTS * 2;
|
|
} else {
|
|
const uint32_t V = ~surrogates_bitmask;
|
|
|
|
const auto vH = (in_16 & v_fc) == v_dc;
|
|
const uint32_t H = vH.to_bitmask();
|
|
|
|
const uint32_t L = ~H & surrogates_bitmask;
|
|
|
|
const uint32_t a = L & (H >> 1);
|
|
|
|
const uint32_t b = a << 1;
|
|
|
|
const uint32_t c = V | a | b;
|
|
|
|
if (c == 0xffffffff) {
|
|
input += simd16<uint16_t>::ELEMENTS * 2;
|
|
} else if (c == 0x7fffffff) {
|
|
input += simd16<uint16_t>::ELEMENTS * 2 - 1;
|
|
} else {
|
|
return simdutf::encoding_type::unspecified;
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
is_utf16 = false;
|
|
// Check for UTF-32
|
|
if (len % 4 == 0) {
|
|
const char32_t * input = reinterpret_cast<const char32_t*>(buf);
|
|
const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len/4;
|
|
|
|
// Must start checking for surrogates
|
|
__m256i currentoffsetmax = _mm256_setzero_si256();
|
|
const __m256i offset = _mm256_set1_epi32(0xffff2000);
|
|
const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff);
|
|
|
|
currentmax = _mm256_max_epu32(in, currentmax);
|
|
currentmax = _mm256_max_epu32(nextin, currentmax);
|
|
|
|
currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax);
|
|
currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(nextin, offset), currentoffsetmax);
|
|
|
|
while (input + 8 < end32) {
|
|
const __m256i in32 = _mm256_loadu_si256((__m256i *)input);
|
|
currentmax = _mm256_max_epu32(in32,currentmax);
|
|
currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in32, offset), currentoffsetmax);
|
|
input += 8;
|
|
}
|
|
|
|
__m256i forbidden_words = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
|
|
if(_mm256_testz_si256(forbidden_words, forbidden_words) == 0) {
|
|
return simdutf::encoding_type::unspecified;
|
|
}
|
|
} else {
|
|
return simdutf::encoding_type::unspecified;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
// If no surrogate, validate under other encodings as well
|
|
|
|
// UTF-32 validation
|
|
currentmax = _mm256_max_epu32(in, currentmax);
|
|
currentmax = _mm256_max_epu32(nextin, currentmax);
|
|
|
|
// UTF-8 validation
|
|
// Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h
|
|
simd::simd8x64<uint8_t> in8(in, nextin);
|
|
check.check_next_input(in8);
|
|
|
|
buf += 64;
|
|
}
|
|
|
|
// Check which encodings are possible
|
|
|
|
if (is_utf8) {
|
|
if (static_cast<size_t>(buf - start) != len) {
|
|
uint8_t block[64]{};
|
|
std::memset(block, 0x20, 64);
|
|
std::memcpy(block, buf, len - (buf - start));
|
|
simd::simd8x64<uint8_t> in(block);
|
|
check.check_next_input(in);
|
|
}
|
|
if (!check.errors()) {
|
|
out |= simdutf::encoding_type::UTF8;
|
|
}
|
|
}
|
|
|
|
if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start))/2)) {
|
|
out |= simdutf::encoding_type::UTF16_LE;
|
|
}
|
|
|
|
if (is_utf32 && (len % 4 == 0)) {
|
|
const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
|
|
__m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
|
|
if (_mm256_testz_si256(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start))/4)) {
|
|
out |= simdutf::encoding_type::UTF32_LE;
|
|
}
|
|
}
|
|
|
|
return out;
|
|
}
|
|
/* end file src/haswell/avx2_detect_encodings.cpp */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_validate_utf16.cpp
|
|
/* begin file src/haswell/avx2_validate_utf16.cpp */
|
|
/*
|
|
In UTF-16 words in range 0xD800 to 0xDFFF have special meaning.
|
|
|
|
In a vectorized algorithm we want to examine the most significant
|
|
nibble in order to select a fast path. If none of highest nibbles
|
|
are 0xD (13), then we are sure that the UTF-16 chunk in a vector
|
|
register is valid.
|
|
|
|
Let us analyze what we need to check if the nibble is 0xD. The
|
|
value of the preceding nibble determines what we have:
|
|
|
|
0xd000 .. 0xd7ff - a valid word
|
|
0xd800 .. 0xdbff - low surrogate
|
|
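0xdc00 .. 0xdfff - high surrogate
(Naming note: the Unicode Standard calls 0xd800 .. 0xdbff "high" surrogates and 0xdc00 .. 0xdfff "low" surrogates; this file uses the opposite labels throughout.)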
|
|
|
|
Other constraints we have to consider:
|
|
- there must not be two consecutive low surrogates (0xd800 .. 0xdbff)
|
|
- there must not be two consecutive high surrogates (0xdc00 .. 0xdfff)
|
|
- there must not be sole low surrogate nor high surrogate
|
|
|
|
We're going to build three bitmasks based on the 3rd nibble:
|
|
- V = valid word,
|
|
- L = low surrogate (0xd800 .. 0xdbff)
|
|
- H = high surrogate (0xdc00 .. 0xdfff)
|
|
|
|
0 1 2 3 4 5 6 7 <--- word index
|
|
[ V | L | H | L | H | V | V | L ]
|
|
1 0 0 0 0 1 1 0 - V = valid masks
|
|
0 1 0 1 0 0 0 1 - L = low surrogate
|
|
0 0 1 0 1 0 0 0 - H high surrogate
|
|
|
|
|
|
1 0 0 0 0 1 1 0 V = valid masks
|
|
0 1 0 1 0 0 0 0 a = L & (H >> 1)
|
|
0 0 1 0 1 0 0 0 b = a << 1
|
|
1 1 1 1 1 1 1 0 c = V | a | b
|
|
^
|
|
the last bit can be zero, we just consume 7 words
|
|
and recheck this word in the next iteration
|
|
*/
|
|
|
|
/* Returns:
|
|
- pointer to the last unprocessed character (a scalar fallback should check the rest);
|
|
- nullptr if an error was detected.
|
|
*/
|
|
template <endianness big_endian>
|
|
const char16_t* avx2_validate_utf16(const char16_t* input, size_t size) {
|
|
const char16_t* end = input + size;
|
|
|
|
const auto v_d8 = simd8<uint8_t>::splat(0xd8);
|
|
const auto v_f8 = simd8<uint8_t>::splat(0xf8);
|
|
const auto v_fc = simd8<uint8_t>::splat(0xfc);
|
|
const auto v_dc = simd8<uint8_t>::splat(0xdc);
|
|
|
|
while (input + simd16<uint16_t>::ELEMENTS * 2 < end) {
|
|
// 0. Load data: since the validation takes into account only higher
|
|
// byte of each word, we compress the two vectors into one which
|
|
// consists only the higher bytes.
|
|
auto in0 = simd16<uint16_t>(input);
|
|
auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
|
|
|
|
if (big_endian) {
|
|
in0 = in0.swap_bytes();
|
|
in1 = in1.swap_bytes();
|
|
}
|
|
|
|
const auto t0 = in0.shr<8>();
|
|
const auto t1 = in1.shr<8>();
|
|
|
|
const auto in = simd16<uint16_t>::pack(t0, t1);
|
|
|
|
// 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
|
|
const auto surrogates_wordmask = (in & v_f8) == v_d8;
|
|
const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
|
|
if (surrogates_bitmask == 0x0) {
|
|
input += simd16<uint16_t>::ELEMENTS * 2;
|
|
} else {
|
|
// 2. We have some surrogates that have to be distinguished:
|
|
// - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
|
|
// - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
|
|
//
|
|
// Fact: high surrogate has 11th bit set (3rd bit in the higher word)
|
|
|
|
// V - non-surrogate words
|
|
// V = not surrogates_wordmask
|
|
const uint32_t V = ~surrogates_bitmask;
|
|
|
|
// H - word-mask for high surrogates: the six highest bits are 0b1101'11
|
|
const auto vH = (in & v_fc) == v_dc;
|
|
const uint32_t H = vH.to_bitmask();
|
|
|
|
// L - word mask for low surrogates
|
|
// L = not H and surrogates_wordmask
|
|
const uint32_t L = ~H & surrogates_bitmask;
|
|
|
|
const uint32_t a = L & (H >> 1); // A low surrogate must be followed by high one.
|
|
// (A low surrogate placed in the 7th register's word
|
|
// is an exception we handle.)
|
|
const uint32_t b = a << 1; // Just mark that the opposite fact is hold,
|
|
// thanks to that we have only two masks for valid case.
|
|
const uint32_t c = V | a | b; // Combine all the masks into the final one.
|
|
|
|
if (c == 0xffffffff) {
|
|
// The whole input register contains valid UTF-16, i.e.,
|
|
// either single words or proper surrogate pairs.
|
|
input += simd16<uint16_t>::ELEMENTS * 2;
|
|
} else if (c == 0x7fffffff) {
|
|
// The 31 lower words of the input register contains valid UTF-16.
|
|
// The 31 word may be either a low or high surrogate. It the next
|
|
// iteration we 1) check if the low surrogate is followed by a high
|
|
// one, 2) reject sole high surrogate.
|
|
input += simd16<uint16_t>::ELEMENTS * 2 - 1;
|
|
} else {
|
|
return nullptr;
|
|
}
|
|
}
|
|
}
|
|
|
|
return input;
|
|
}
|
|
|
|
|
|
template <endianness big_endian>
|
|
const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size) {
|
|
const char16_t* start = input;
|
|
const char16_t* end = input + size;
|
|
|
|
const auto v_d8 = simd8<uint8_t>::splat(0xd8);
|
|
const auto v_f8 = simd8<uint8_t>::splat(0xf8);
|
|
const auto v_fc = simd8<uint8_t>::splat(0xfc);
|
|
const auto v_dc = simd8<uint8_t>::splat(0xdc);
|
|
|
|
while (input + simd16<uint16_t>::ELEMENTS * 2 < end) {
|
|
// 0. Load data: since the validation takes into account only higher
|
|
// byte of each word, we compress the two vectors into one which
|
|
// consists only the higher bytes.
|
|
auto in0 = simd16<uint16_t>(input);
|
|
auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
|
|
|
|
if (big_endian) {
|
|
in0 = in0.swap_bytes();
|
|
in1 = in1.swap_bytes();
|
|
}
|
|
|
|
const auto t0 = in0.shr<8>();
|
|
const auto t1 = in1.shr<8>();
|
|
|
|
const auto in = simd16<uint16_t>::pack(t0, t1);
|
|
|
|
// 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
|
|
const auto surrogates_wordmask = (in & v_f8) == v_d8;
|
|
const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
|
|
if (surrogates_bitmask == 0x0) {
|
|
input += simd16<uint16_t>::ELEMENTS * 2;
|
|
} else {
|
|
// 2. We have some surrogates that have to be distinguished:
|
|
// - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
|
|
// - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
|
|
//
|
|
// Fact: high surrogate has 11th bit set (3rd bit in the higher word)
|
|
|
|
// V - non-surrogate words
|
|
// V = not surrogates_wordmask
|
|
const uint32_t V = ~surrogates_bitmask;
|
|
|
|
// H - word-mask for high surrogates: the six highest bits are 0b1101'11
|
|
const auto vH = (in & v_fc) == v_dc;
|
|
const uint32_t H = vH.to_bitmask();
|
|
|
|
// L - word mask for low surrogates
|
|
// L = not H and surrogates_wordmask
|
|
const uint32_t L = ~H & surrogates_bitmask;
|
|
|
|
const uint32_t a = L & (H >> 1); // A low surrogate must be followed by high one.
|
|
// (A low surrogate placed in the 7th register's word
|
|
// is an exception we handle.)
|
|
const uint32_t b = a << 1; // Just mark that the opposite fact is hold,
|
|
// thanks to that we have only two masks for valid case.
|
|
const uint32_t c = V | a | b; // Combine all the masks into the final one.
|
|
|
|
if (c == 0xffffffff) {
|
|
// The whole input register contains valid UTF-16, i.e.,
|
|
// either single words or proper surrogate pairs.
|
|
input += simd16<uint16_t>::ELEMENTS * 2;
|
|
} else if (c == 0x7fffffff) {
|
|
// The 31 lower words of the input register contains valid UTF-16.
|
|
// The 31 word may be either a low or high surrogate. It the next
|
|
// iteration we 1) check if the low surrogate is followed by a high
|
|
// one, 2) reject sole high surrogate.
|
|
input += simd16<uint16_t>::ELEMENTS * 2 - 1;
|
|
} else {
|
|
return result(error_code::SURROGATE, input - start);
|
|
}
|
|
}
|
|
}
|
|
|
|
return result(error_code::SUCCESS, input - start);
|
|
}
|
|
/* end file src/haswell/avx2_validate_utf16.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_validate_utf32le.cpp
|
|
/* begin file src/haswell/avx2_validate_utf32le.cpp */
|
|
/*
    Validates UTF-32LE eight words at a time by tracking lane-wise maxima and
    testing them once after the loop.

    Returns:
    - pointer to the first unprocessed character (a scalar fallback should
      check the rest);
    - nullptr if an error was detected.
*/
const char32_t* avx2_validate_utf32le(const char32_t* input, size_t size) {
  const char32_t* const stop = input + size;

  const __m256i max_code_point = _mm256_set1_epi32(0x10ffff);
  // Adding the bias moves 0xD800..0xDFFF to 0xFFFFF800..0xFFFFFFFF, so any
  // surrogate pushes the biased maximum above 0xFFFFF7FF.
  const __m256i surrogate_bias = _mm256_set1_epi32(0xffff2000);
  const __m256i biased_limit = _mm256_set1_epi32(0xfffff7ff);
  __m256i max_word = _mm256_setzero_si256();
  __m256i max_biased = _mm256_setzero_si256();

  for (; input + 8 < stop; input += 8) {
    const __m256i words = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(input));
    max_word = _mm256_max_epu32(words, max_word);
    max_biased = _mm256_max_epu32(_mm256_add_epi32(words, surrogate_bias), max_biased);
  }

  // max(x, limit) ^ limit is all-zero only if no lane of x exceeds limit.
  const __m256i too_large = _mm256_xor_si256(_mm256_max_epu32(max_word, max_code_point), max_code_point);
  if (_mm256_testz_si256(too_large, too_large) == 0) {
    return nullptr;
  }

  const __m256i surrogate_hit = _mm256_xor_si256(_mm256_max_epu32(max_biased, biased_limit), biased_limit);
  if (_mm256_testz_si256(surrogate_hit, surrogate_hit) == 0) {
    return nullptr;
  }

  return input;
}
|
|
|
|
|
|
// Validates UTF-32LE eight words at a time, testing the running maxima after
// every block so that a failure can be located.
//
// Returns result(error_code::TOO_LARGE, offset) or
// result(error_code::SURROGATE, offset) when a block fails, where offset is
// the index (in words) at which that block starts; otherwise
// result(error_code::SUCCESS, offset) where offset is the index of the first
// word this routine did not process.
const result avx2_validate_utf32le_with_errors(const char32_t* input, size_t size) {
  const char32_t* const start = input;
  const char32_t* const stop = input + size;

  const __m256i max_code_point = _mm256_set1_epi32(0x10ffff);
  // Adding the bias moves 0xD800..0xDFFF to 0xFFFFF800..0xFFFFFFFF, so any
  // surrogate pushes the biased maximum above 0xFFFFF7FF.
  const __m256i surrogate_bias = _mm256_set1_epi32(0xffff2000);
  const __m256i biased_limit = _mm256_set1_epi32(0xfffff7ff);
  __m256i max_word = _mm256_setzero_si256();
  __m256i max_biased = _mm256_setzero_si256();

  for (; input + 8 < stop; input += 8) {
    const __m256i words = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(input));
    max_word = _mm256_max_epu32(words, max_word);
    max_biased = _mm256_max_epu32(_mm256_add_epi32(words, surrogate_bias), max_biased);

    // max(x, limit) ^ limit is all-zero only if no lane of x exceeds limit.
    const __m256i too_large = _mm256_xor_si256(_mm256_max_epu32(max_word, max_code_point), max_code_point);
    if (_mm256_testz_si256(too_large, too_large) == 0) {
      return result(error_code::TOO_LARGE, input - start);
    }

    const __m256i surrogate_hit = _mm256_xor_si256(_mm256_max_epu32(max_biased, biased_limit), biased_limit);
    if (_mm256_testz_si256(surrogate_hit, surrogate_hit) == 0) {
      return result(error_code::SURROGATE, input - start);
    }
  }

  return result(error_code::SUCCESS, input - start);
}
|
|
/* end file src/haswell/avx2_validate_utf32le.cpp */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf8_to_utf16.cpp
|
|
/* begin file src/haswell/avx2_convert_utf8_to_utf16.cpp */
|
|
// depends on "tables/utf8_to_utf16_tables.h"
|
|
|
|
|
|
// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the
|
|
// end of the code points. Only the least significant 12 bits of the mask
|
|
// are accessed.
|
|
// It returns how many bytes were consumed (up to 12).
|
|
template <endianness big_endian>
|
|
size_t convert_masked_utf8_to_utf16(const char *input,
|
|
uint64_t utf8_end_of_code_point_mask,
|
|
char16_t *&utf16_output) {
|
|
// we use an approach where we try to process up to 12 input bytes.
|
|
// Why 12 input bytes and not 16? Because we are concerned with the size of
|
|
// the lookup tables. Also 12 is nicely divisible by two and three.
|
|
//
|
|
//
|
|
// Optimization note: our main path below is load-latency dependent. Thus it is maybe
|
|
// beneficial to have fast paths that depend on branch prediction but have less latency.
|
|
// This results in more instructions but, potentially, also higher speeds.
|
|
//
|
|
// We first try a few fast paths.
|
|
const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
const __m128i in = _mm_loadu_si128((__m128i *)input);
|
|
const uint16_t input_utf8_end_of_code_point_mask =
|
|
utf8_end_of_code_point_mask & 0xfff;
|
|
if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
|
|
// We process the data in chunks of 16 bytes.
|
|
__m256i ascii = _mm256_cvtepu8_epi16(in);
|
|
if (big_endian) {
|
|
const __m256i swap256 = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
|
|
17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
|
|
ascii = _mm256_shuffle_epi8(ascii, swap256);
|
|
}
|
|
_mm256_storeu_si256(reinterpret_cast<__m256i *>(utf16_output), ascii);
|
|
utf16_output += 16; // We wrote 16 16-bit characters.
|
|
return 16; // We consumed 16 bytes.
|
|
}
|
|
if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
|
|
// We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
|
|
// There is probably a more efficient sequence, but the following might do.
|
|
const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
const __m128i perm = _mm_shuffle_epi8(in, sh);
|
|
const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
|
|
const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
|
|
__m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
|
|
if (big_endian) composed = _mm_shuffle_epi8(composed, swap);
|
|
_mm_storeu_si128((__m128i *)utf16_output, composed);
|
|
utf16_output += 8; // We wrote 16 bytes, 8 code points.
|
|
return 16;
|
|
}
|
|
if(input_utf8_end_of_code_point_mask == 0x924) {
|
|
// We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
|
|
// There is probably a more efficient sequence, but the following might do.
|
|
const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
|
|
const __m128i perm = _mm_shuffle_epi8(in, sh);
|
|
const __m128i ascii =
|
|
_mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
|
|
const __m128i middlebyte =
|
|
_mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
|
|
const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
|
|
const __m128i highbyte =
|
|
_mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
|
|
const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
|
|
const __m128i composed =
|
|
_mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
|
|
__m128i composed_repacked = _mm_packus_epi32(composed, composed);
|
|
if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
|
|
_mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
|
|
utf16_output += 4;
|
|
return 12;
|
|
}
|
|
|
|
const uint8_t idx =
|
|
simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
|
|
const uint8_t consumed =
|
|
simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
|
|
if (idx < 64) {
|
|
// SIX (6) input code-words
|
|
// this is a relatively easy scenario
|
|
// we process SIX (6) input code-words. The max length in bytes of six code
|
|
// words spanning between 1 and 2 bytes each is 12 bytes. On processors
|
|
// where pdep/pext is fast, we might be able to use a small lookup table.
|
|
const __m128i sh =
|
|
_mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
|
|
const __m128i perm = _mm_shuffle_epi8(in, sh);
|
|
const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
|
|
const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
|
|
__m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
|
|
if (big_endian) composed = _mm_shuffle_epi8(composed, swap);
|
|
_mm_storeu_si128((__m128i *)utf16_output, composed);
|
|
utf16_output += 6; // We wrote 12 bytes, 6 code points. There is a potential overflow of 4 bytes.
|
|
} else if (idx < 145) {
|
|
// FOUR (4) input code-words
|
|
const __m128i sh =
|
|
_mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
|
|
const __m128i perm = _mm_shuffle_epi8(in, sh);
|
|
const __m128i ascii =
|
|
_mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
|
|
const __m128i middlebyte =
|
|
_mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
|
|
const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
|
|
const __m128i highbyte =
|
|
_mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
|
|
const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
|
|
const __m128i composed =
|
|
_mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
|
|
__m128i composed_repacked = _mm_packus_epi32(composed, composed);
|
|
if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
|
|
_mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
|
|
utf16_output += 4; // Here we overflow by 8 bytes.
|
|
} else if (idx < 209) {
|
|
// TWO (2) input code-words
|
|
//////////////
|
|
// There might be garbage inputs where a leading byte mascarades as a four-byte
|
|
// leading byte (by being followed by 3 continuation byte), but is not greater than
|
|
// 0xf0. This could trigger a buffer overflow if we only counted leading
|
|
// bytes of the form 0xf0 as generating surrogate pairs, without further UTF-8 validation.
|
|
// Thus we must be careful to ensure that only leading bytes at least as large as 0xf0 generate surrogate pairs.
|
|
// We do as at the cost of an extra mask.
|
|
/////////////
|
|
const __m128i sh =
|
|
_mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
|
|
const __m128i perm = _mm_shuffle_epi8(in, sh);
|
|
const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
|
|
const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
|
|
const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
|
|
__m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
|
|
// correct for spurious high bit
|
|
const __m128i correct =
|
|
_mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
|
|
middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
|
|
const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
|
|
// We deliberately carry the leading four bits in highbyte if they are present,
|
|
// we remove them later when computing hightenbits.
|
|
const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000));
|
|
const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
|
|
// When we need to generate a surrogate pair (leading byte > 0xF0), then
|
|
// the corresponding 32-bit value in 'composed' will be greater than
|
|
// > (0xff00000>>6) or > 0x3c00000. This can be used later to identify the
|
|
// location of the surrogate pairs.
|
|
const __m128i composed =
|
|
_mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
|
|
_mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
|
|
const __m128i composedminus =
|
|
_mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
|
|
const __m128i lowtenbits =
|
|
_mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
|
|
// Notice the 0x3ff mask:
|
|
const __m128i hightenbits = _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff));
|
|
const __m128i lowtenbitsadd =
|
|
_mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
|
|
const __m128i hightenbitsadd =
|
|
_mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
|
|
const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
|
|
__m128i surrogates =
|
|
_mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
|
|
uint32_t basic_buffer[4];
|
|
uint32_t basic_buffer_swap[4];
|
|
if (big_endian) {
|
|
_mm_storeu_si128((__m128i *)basic_buffer_swap, _mm_shuffle_epi8(composed, swap));
|
|
surrogates = _mm_shuffle_epi8(surrogates, swap);
|
|
}
|
|
_mm_storeu_si128((__m128i *)basic_buffer, composed);
|
|
uint32_t surrogate_buffer[4];
|
|
_mm_storeu_si128((__m128i *)surrogate_buffer, surrogates);
|
|
for (size_t i = 0; i < 3; i++) {
|
|
if(basic_buffer[i] > 0x3c00000) {
|
|
utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
|
|
utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
|
|
utf16_output += 2;
|
|
} else {
|
|
utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
|
|
utf16_output++;
|
|
}
|
|
}
|
|
} else {
|
|
// here we know that there is an error but we do not handle errors
|
|
}
|
|
return consumed;
|
|
}
|
|
/* end file src/haswell/avx2_convert_utf8_to_utf16.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf8_to_utf32.cpp
|
|
/* begin file src/haswell/avx2_convert_utf8_to_utf32.cpp */
|
|
// depends on "tables/utf8_to_utf16_tables.h"
|
|
|
|
|
|
// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
|
|
// end of the code points. Only the least significant 12 bits of the mask
|
|
// are accessed.
|
|
// It returns how many bytes were consumed (up to 12).
|
|
size_t convert_masked_utf8_to_utf32(const char *input,
|
|
uint64_t utf8_end_of_code_point_mask,
|
|
char32_t *&utf32_output) {
|
|
// we use an approach where we try to process up to 12 input bytes.
|
|
// Why 12 input bytes and not 16? Because we are concerned with the size of
|
|
// the lookup tables. Also 12 is nicely divisible by two and three.
|
|
//
|
|
//
|
|
// Optimization note: our main path below is load-latency dependent. Thus it is maybe
|
|
// beneficial to have fast paths that depend on branch prediction but have less latency.
|
|
// This results in more instructions but, potentially, also higher speeds.
|
|
//
|
|
// We first try a few fast paths.
|
|
const __m128i in = _mm_loadu_si128((__m128i *)input);
|
|
const uint16_t input_utf8_end_of_code_point_mask =
|
|
utf8_end_of_code_point_mask & 0xfff;
|
|
if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
|
|
// We process the data in chunks of 16 bytes.
|
|
_mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu8_epi32(in));
|
|
_mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output+8), _mm256_cvtepu8_epi32(_mm_srli_si128(in,8)));
|
|
utf32_output += 16; // We wrote 16 32-bit characters.
|
|
return 16; // We consumed 16 bytes.
|
|
}
|
|
if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
|
|
// We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words.
|
|
// There is probably a more efficient sequence, but the following might do.
|
|
const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
const __m128i perm = _mm_shuffle_epi8(in, sh);
|
|
const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
|
|
const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
|
|
const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
|
|
_mm256_storeu_si256((__m256i *)utf32_output, _mm256_cvtepu16_epi32(composed));
|
|
utf32_output += 8; // We wrote 16 bytes, 8 code points.
|
|
return 16;
|
|
}
|
|
if(input_utf8_end_of_code_point_mask == 0x924) {
|
|
// We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words.
|
|
// There is probably a more efficient sequence, but the following might do.
|
|
const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
|
|
const __m128i perm = _mm_shuffle_epi8(in, sh);
|
|
const __m128i ascii =
|
|
_mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
|
|
const __m128i middlebyte =
|
|
_mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
|
|
const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
|
|
const __m128i highbyte =
|
|
_mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
|
|
const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
|
|
const __m128i composed =
|
|
_mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
|
|
_mm_storeu_si128((__m128i *)utf32_output, composed);
|
|
utf32_output += 4;
|
|
return 12;
|
|
}
|
|
/// We do not have a fast path available, so we fallback.
|
|
|
|
const uint8_t idx =
|
|
tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
|
|
const uint8_t consumed =
|
|
tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
|
|
if (idx < 64) {
|
|
// SIX (6) input code-words
|
|
// this is a relatively easy scenario
|
|
// we process SIX (6) input code-words. The max length in bytes of six code
|
|
// words spanning between 1 and 2 bytes each is 12 bytes. On processors
|
|
// where pdep/pext is fast, we might be able to use a small lookup table.
|
|
const __m128i sh =
|
|
_mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
|
|
const __m128i perm = _mm_shuffle_epi8(in, sh);
|
|
const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
|
|
const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
|
|
const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
|
|
_mm256_storeu_si256((__m256i *)utf32_output, _mm256_cvtepu16_epi32(composed));
|
|
utf32_output += 6; // We wrote 24 bytes, 6 code points. There is a potential
|
|
// overflow of 32 - 24 = 8 bytes.
|
|
} else if (idx < 145) {
|
|
// FOUR (4) input code-words
|
|
const __m128i sh =
|
|
_mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
|
|
const __m128i perm = _mm_shuffle_epi8(in, sh);
|
|
const __m128i ascii =
|
|
_mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
|
|
const __m128i middlebyte =
|
|
_mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
|
|
const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
|
|
const __m128i highbyte =
|
|
_mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
|
|
const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
|
|
const __m128i composed =
|
|
_mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
|
|
_mm_storeu_si128((__m128i *)utf32_output, composed);
|
|
utf32_output += 4;
|
|
} else if (idx < 209) {
|
|
// TWO (2) input code-words
|
|
const __m128i sh =
|
|
_mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
|
|
const __m128i perm = _mm_shuffle_epi8(in, sh);
|
|
const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
|
|
const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
|
|
const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
|
|
__m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
|
|
// correct for spurious high bit
|
|
const __m128i correct =
|
|
_mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
|
|
middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
|
|
const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
|
|
const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
|
|
const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
|
|
const __m128i composed =
|
|
_mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
|
|
_mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
|
|
_mm_storeu_si128((__m128i *)utf32_output, composed);
|
|
utf32_output += 3; // We wrote 3 * 4 bytes, there is a potential overflow of 4 bytes.
|
|
} else {
|
|
// here we know that there is an error but we do not handle errors
|
|
}
|
|
return consumed;
|
|
}
|
|
/* end file src/haswell/avx2_convert_utf8_to_utf32.cpp */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf16_to_utf8.cpp
|
|
/* begin file src/haswell/avx2_convert_utf16_to_utf8.cpp */
|
|
/*
|
|
The description below is written in terms of a single SSE register, i.e., one that
|
|
holds eight 16-bit words; the AVX2 code in this file loads sixteen 16-bit words per iteration.
|
|
|
|
We consider three cases:
|
|
1. an input register contains no surrogates and each value
|
|
is in range 0x0000 .. 0x07ff.
|
|
2. an input register contains no surrogates and values are
|
|
in range 0x0000 .. 0xffff.
|
|
3. an input register contains surrogates --- i.e. codepoints
|
|
can have 16 or 32 bits.
|
|
|
|
Ad 1.
|
|
|
|
When values are less than 0x0800, it means that a 16-bit word
|
|
can be converted into: 1) single UTF8 byte (when it's an ASCII
|
|
char) or 2) two UTF8 bytes.
|
|
|
|
For this case we do only some shuffle to obtain these 2-byte
|
|
codes and finally compress the whole SSE register with a single
|
|
shuffle.
|
|
|
|
We need 256-entry lookup table to get a compression pattern
|
|
and the number of output bytes in the compressed vector register.
|
|
Each entry occupies 17 bytes.
|
|
|
|
Ad 2.
|
|
|
|
When values fit in 16-bit words, but are above 0x07ff, then
|
|
a single word may produce one, two or three UTF8 bytes.
|
|
|
|
We prepare data for all these three cases in two registers.
|
|
The first register contains lower two UTF8 bytes (used in all
|
|
cases), while the second one contains just the third byte for
|
|
the three-UTF8-bytes case.
|
|
|
|
Finally these two registers are interleaved forming eight-element
|
|
array of 32-bit values. The array spans two SSE registers.
|
|
The bytes from the registers are compressed using two shuffles.
|
|
|
|
We need 256-entry lookup table to get a compression pattern
|
|
and the number of output bytes in the compressed vector register.
|
|
Each entry occupies 17 bytes.
|
|
|
|
|
|
To summarize:
|
|
- We need two 256-entry tables that have 8704 bytes in total.
|
|
*/
|
|
|
|
|
|
/*
|
|
Returns a pair: the first unprocessed byte from buf and utf8_output
|
|
A scalar routing should carry on the conversion of the tail.
|
|
*/
|
|
template <endianness big_endian>
|
|
std::pair<const char16_t*, char*> avx2_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) {
|
|
const char16_t* end = buf + len;
|
|
const __m256i v_0000 = _mm256_setzero_si256();
|
|
const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
|
|
const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
|
|
const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
|
|
const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
|
|
|
|
while (buf + 16 + safety_margin <= end) {
|
|
__m256i in = _mm256_loadu_si256((__m256i*)buf);
|
|
if (big_endian) {
|
|
const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
|
|
17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
|
|
in = _mm256_shuffle_epi8(in, swap);
|
|
}
|
|
// a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
|
|
const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
|
|
if(_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
|
|
// 1. pack the bytes
|
|
const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
|
|
// 2. store (16 bytes)
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
|
|
// 3. adjust pointers
|
|
buf += 16;
|
|
utf8_output += 16;
|
|
continue; // we are done for this round!
|
|
}
|
|
// no bits set above 7th bit
|
|
const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
|
|
const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
|
|
|
|
// no bits set above 11th bit
|
|
const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
|
|
const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
|
|
if (one_or_two_bytes_bitmask == 0xffffffff) {
|
|
|
|
// 1. prepare 2-byte values
|
|
// input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
|
|
// expected output : [110a|aaaa|10bb|bbbb] x 8
|
|
const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
|
|
const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
|
|
|
|
// t0 = [000a|aaaa|bbbb|bb00]
|
|
const __m256i t0 = _mm256_slli_epi16(in, 2);
|
|
// t1 = [000a|aaaa|0000|0000]
|
|
const __m256i t1 = _mm256_and_si256(t0, v_1f00);
|
|
// t2 = [0000|0000|00bb|bbbb]
|
|
const __m256i t2 = _mm256_and_si256(in, v_003f);
|
|
// t3 = [000a|aaaa|00bb|bbbb]
|
|
const __m256i t3 = _mm256_or_si256(t1, t2);
|
|
// t4 = [110a|aaaa|10bb|bbbb]
|
|
const __m256i t4 = _mm256_or_si256(t3, v_c080);
|
|
|
|
// 2. merge ASCII and 2-byte codewords
|
|
const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
|
|
|
|
// 3. prepare bitmask for 8-bit lookup
|
|
const uint32_t M0 = one_byte_bitmask & 0x55555555;
|
|
const uint32_t M1 = M0 >> 7;
|
|
const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
|
|
// 4. pack the bytes
|
|
|
|
const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
|
|
const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
|
|
|
|
const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
|
|
const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
|
|
|
|
const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
|
|
// 5. store bytes
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
|
|
utf8_output += row[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
|
|
utf8_output += row_2[0];
|
|
|
|
// 6. adjust pointers
|
|
buf += 16;
|
|
continue;
|
|
}
|
|
// 1. Check if there are any surrogate word in the input chunk.
|
|
// We have also deal with situation when there is a surrogate word
|
|
// at the end of a chunk.
|
|
const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
|
|
|
|
// bitmask = 0x0000 if there are no surrogates
|
|
// = 0xc000 if the last word is a surrogate
|
|
const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
|
|
// It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
|
|
// it is likely an uncommon occurrence.
|
|
if (surrogates_bitmask == 0x00000000) {
|
|
// case: words from register produce either 1, 2 or 3 UTF-8 bytes
|
|
const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
|
|
0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
|
|
|
|
/* In this branch we handle three cases:
|
|
1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte
|
|
2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes
|
|
3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
|
|
|
|
We expand the input word (16-bit) into two words (32-bit), thus
|
|
we have room for four bytes. However, we need five distinct bit
|
|
layouts. Note that the last byte in cases #2 and #3 is the same.
|
|
|
|
We precompute byte 1 for case #1 and the common byte for cases #2 & #3
|
|
in register t2.
|
|
|
|
We precompute byte 1 for case #3 and -- **conditionally** -- precompute
|
|
either byte 1 for case #2 or byte 2 for case #3. Note that they
|
|
differ by exactly one bit.
|
|
|
|
Finally from these two words we build proper UTF-8 sequence, taking
|
|
into account the case (i.e, the number of bytes to write).
|
|
*/
|
|
/**
|
|
* Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
|
|
* t2 => [0ccc|cccc] [10cc|cccc]
|
|
* s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
|
|
*/
|
|
#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
|
|
// [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
|
|
const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
|
|
// [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
|
|
const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
|
|
// [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
|
|
const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
|
|
|
|
// [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
|
|
const __m256i s0 = _mm256_srli_epi16(in, 4);
|
|
// [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
|
|
const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
|
|
// [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
|
|
const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
|
|
// [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
|
|
const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
|
|
const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
|
|
const __m256i s4 = _mm256_xor_si256(s3, m0);
|
|
#undef simdutf_vec
|
|
|
|
// 4. expand words 16-bit => 32-bit
|
|
const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
|
|
const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
|
|
|
|
// 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
|
|
const uint32_t mask = (one_byte_bitmask & 0x55555555) |
|
|
(one_or_two_bytes_bitmask & 0xaaaaaaaa);
|
|
// Due to the wider registers, the following path is less likely to be useful.
|
|
/*if(mask == 0) {
|
|
// We only have three-byte words. Use fast path.
|
|
const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
|
|
const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
|
|
const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
|
|
utf8_output += 12;
|
|
buf += 16;
|
|
continue;
|
|
}*/
|
|
const uint8_t mask0 = uint8_t(mask);
|
|
const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
|
|
const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
|
|
const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
|
|
|
|
const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
|
|
const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
|
|
const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
|
|
const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
|
|
|
|
const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
|
|
const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
|
|
const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
|
|
const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
|
|
|
|
|
|
const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
|
|
const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
|
|
const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
|
|
const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
|
|
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_0);
|
|
utf8_output += row0[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_1);
|
|
utf8_output += row1[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_2);
|
|
utf8_output += row2[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_3);
|
|
utf8_output += row3[0];
|
|
buf += 16;
|
|
// surrogate pair(s) in a register
|
|
} else {
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// in the presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
|
|
if((word & 0xFF80)==0) {
|
|
*utf8_output++ = char(word);
|
|
} else if((word & 0xF800)==0) {
|
|
*utf8_output++ = char((word>>6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else if((word &0xF800 ) != 0xD800) {
|
|
*utf8_output++ = char((word>>12) | 0b11100000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else {
|
|
// must be a surrogate pair
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
|
|
k++;
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, utf8_output); }
|
|
uint32_t value = (diff << 10) + diff2 + 0x10000;
|
|
*utf8_output++ = char((value>>18) | 0b11110000);
|
|
*utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((value & 0b111111) | 0b10000000);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
return std::make_pair(buf, utf8_output);
|
|
}
|
|
|
|
|
|
/*
|
|
Returns a pair: a result struct and utf8_output.
|
|
If there is an error, the count field of the result is the position of the error.
|
|
Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
|
|
A scalar routing should carry on the conversion of the tail if needed.
|
|
*/
|
|
template <endianness big_endian>
|
|
std::pair<result, char*> avx2_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) {
|
|
const char16_t* start = buf;
|
|
const char16_t* end = buf + len;
|
|
|
|
const __m256i v_0000 = _mm256_setzero_si256();
|
|
const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
|
|
const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
|
|
const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
|
|
const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
|
|
|
|
while (buf + 16 + safety_margin <= end) {
|
|
__m256i in = _mm256_loadu_si256((__m256i*)buf);
|
|
if (big_endian) {
|
|
const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
|
|
17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
|
|
in = _mm256_shuffle_epi8(in, swap);
|
|
}
|
|
// a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
|
|
const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
|
|
if(_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
|
|
// 1. pack the bytes
|
|
const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
|
|
// 2. store (16 bytes)
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
|
|
// 3. adjust pointers
|
|
buf += 16;
|
|
utf8_output += 16;
|
|
continue; // we are done for this round!
|
|
}
|
|
// no bits set above 7th bit
|
|
const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
|
|
const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
|
|
|
|
// no bits set above 11th bit
|
|
const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
|
|
const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
|
|
if (one_or_two_bytes_bitmask == 0xffffffff) {
|
|
|
|
// 1. prepare 2-byte values
|
|
// input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
|
|
// expected output : [110a|aaaa|10bb|bbbb] x 8
|
|
const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
|
|
const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
|
|
|
|
// t0 = [000a|aaaa|bbbb|bb00]
|
|
const __m256i t0 = _mm256_slli_epi16(in, 2);
|
|
// t1 = [000a|aaaa|0000|0000]
|
|
const __m256i t1 = _mm256_and_si256(t0, v_1f00);
|
|
// t2 = [0000|0000|00bb|bbbb]
|
|
const __m256i t2 = _mm256_and_si256(in, v_003f);
|
|
// t3 = [000a|aaaa|00bb|bbbb]
|
|
const __m256i t3 = _mm256_or_si256(t1, t2);
|
|
// t4 = [110a|aaaa|10bb|bbbb]
|
|
const __m256i t4 = _mm256_or_si256(t3, v_c080);
|
|
|
|
// 2. merge ASCII and 2-byte codewords
|
|
const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
|
|
|
|
// 3. prepare bitmask for 8-bit lookup
|
|
const uint32_t M0 = one_byte_bitmask & 0x55555555;
|
|
const uint32_t M1 = M0 >> 7;
|
|
const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
|
|
// 4. pack the bytes
|
|
|
|
const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
|
|
const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
|
|
|
|
const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
|
|
const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
|
|
|
|
const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
|
|
// 5. store bytes
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
|
|
utf8_output += row[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
|
|
utf8_output += row_2[0];
|
|
|
|
// 6. adjust pointers
|
|
buf += 16;
|
|
continue;
|
|
}
|
|
// 1. Check if there are any surrogate word in the input chunk.
|
|
// We have also deal with situation when there is a surrogate word
|
|
// at the end of a chunk.
|
|
const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
|
|
|
|
// bitmask = 0x0000 if there are no surrogates
|
|
// = 0xc000 if the last word is a surrogate
|
|
const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
|
|
// It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
|
|
// it is likely an uncommon occurrence.
|
|
if (surrogates_bitmask == 0x00000000) {
|
|
// case: words from register produce either 1, 2 or 3 UTF-8 bytes
|
|
const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
|
|
0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
|
|
|
|
/* In this branch we handle three cases:
|
|
1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte
|
|
2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes
|
|
3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
|
|
|
|
We expand the input word (16-bit) into two words (32-bit), thus
|
|
we have room for four bytes. However, we need five distinct bit
|
|
layouts. Note that the last byte in cases #2 and #3 is the same.
|
|
|
|
We precompute byte 1 for case #1 and the common byte for cases #2 & #3
|
|
in register t2.
|
|
|
|
We precompute byte 1 for case #3 and -- **conditionally** -- precompute
|
|
either byte 1 for case #2 or byte 2 for case #3. Note that they
|
|
differ by exactly one bit.
|
|
|
|
Finally from these two words we build proper UTF-8 sequence, taking
|
|
into account the case (i.e, the number of bytes to write).
|
|
*/
|
|
/**
|
|
* Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
|
|
* t2 => [0ccc|cccc] [10cc|cccc]
|
|
* s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
|
|
*/
|
|
#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
|
|
// [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
|
|
const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
|
|
// [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
|
|
const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
|
|
// [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
|
|
const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
|
|
|
|
// [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
|
|
const __m256i s0 = _mm256_srli_epi16(in, 4);
|
|
// [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
|
|
const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
|
|
// [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
|
|
const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
|
|
// [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
|
|
const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
|
|
const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
|
|
const __m256i s4 = _mm256_xor_si256(s3, m0);
|
|
#undef simdutf_vec
|
|
|
|
// 4. expand words 16-bit => 32-bit
|
|
const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
|
|
const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
|
|
|
|
// 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
|
|
const uint32_t mask = (one_byte_bitmask & 0x55555555) |
|
|
(one_or_two_bytes_bitmask & 0xaaaaaaaa);
|
|
// Due to the wider registers, the following path is less likely to be useful.
|
|
/*if(mask == 0) {
|
|
// We only have three-byte words. Use fast path.
|
|
const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
|
|
const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
|
|
const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
|
|
utf8_output += 12;
|
|
buf += 16;
|
|
continue;
|
|
}*/
|
|
const uint8_t mask0 = uint8_t(mask);
|
|
const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
|
|
const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
|
|
const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
|
|
|
|
const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
|
|
const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
|
|
const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
|
|
const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
|
|
|
|
const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
|
|
const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
|
|
const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
|
|
const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
|
|
|
|
|
|
const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
|
|
const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
|
|
const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
|
|
const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
|
|
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_0);
|
|
utf8_output += row0[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_1);
|
|
utf8_output += row1[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_2);
|
|
utf8_output += row2[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_3);
|
|
utf8_output += row3[0];
|
|
buf += 16;
|
|
// surrogate pair(s) in a register
|
|
} else {
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// in the presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
|
|
if((word & 0xFF80)==0) {
|
|
*utf8_output++ = char(word);
|
|
} else if((word & 0xF800)==0) {
|
|
*utf8_output++ = char((word>>6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else if((word &0xF800 ) != 0xD800) {
|
|
*utf8_output++ = char((word>>12) | 0b11100000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else {
|
|
// must be a surrogate pair
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
|
|
k++;
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf8_output); }
|
|
uint32_t value = (diff << 10) + diff2 + 0x10000;
|
|
*utf8_output++ = char((value>>18) | 0b11110000);
|
|
*utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((value & 0b111111) | 0b10000000);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
|
|
}
|
|
/* end file src/haswell/avx2_convert_utf16_to_utf8.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf16_to_utf32.cpp
|
|
/* begin file src/haswell/avx2_convert_utf16_to_utf32.cpp */
|
|
/*
|
|
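NOTE: the description below mirrors the UTF-16 to UTF-8 converter: it speaks of a single SSE register holding eight 16-bit words and of UTF-8 bytes.
|
|
The UTF-16 to UTF-32 code in this file loads sixteen 16-bit words per iteration and only distinguishes registers with and without surrogates.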
|
|
|
|
We consider three cases:
|
|
1. an input register contains no surrogates and each value
|
|
is in range 0x0000 .. 0x07ff.
|
|
2. an input register contains no surrogates and values are
|
|
in range 0x0000 .. 0xffff.
|
|
3. an input register contains surrogates --- i.e. codepoints
|
|
can have 16 or 32 bits.
|
|
|
|
Ad 1.
|
|
|
|
When values are less than 0x0800, it means that a 16-bit word
|
|
can be converted into: 1) single UTF8 byte (when it's an ASCII
|
|
char) or 2) two UTF8 bytes.
|
|
|
|
For this case we do only some shuffle to obtain these 2-byte
|
|
codes and finally compress the whole SSE register with a single
|
|
shuffle.
|
|
|
|
We need 256-entry lookup table to get a compression pattern
|
|
and the number of output bytes in the compressed vector register.
|
|
Each entry occupies 17 bytes.
|
|
|
|
Ad 2.
|
|
|
|
When values fit in 16-bit words, but are above 0x07ff, then
|
|
a single word may produce one, two or three UTF8 bytes.
|
|
|
|
We prepare data for all these three cases in two registers.
|
|
The first register contains lower two UTF8 bytes (used in all
|
|
cases), while the second one contains just the third byte for
|
|
the three-UTF8-bytes case.
|
|
|
|
Finally these two registers are interleaved forming eight-element
|
|
array of 32-bit values. The array spans two SSE registers.
|
|
The bytes from the registers are compressed using two shuffles.
|
|
|
|
We need 256-entry lookup table to get a compression pattern
|
|
and the number of output bytes in the compressed vector register.
|
|
Each entry occupies 17 bytes.
|
|
|
|
|
|
To summarize:
|
|
- We need two 256-entry tables that have 8704 bytes in total.
|
|
*/
|
|
|
|
|
|
/*
|
|
Returns a pair: the first unprocessed byte from buf and utf32_output
|
|
A scalar routing should carry on the conversion of the tail.
|
|
*/
|
|
template <endianness big_endian>
|
|
std::pair<const char16_t*, char32_t*> avx2_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) {
|
|
const char16_t* end = buf + len;
|
|
const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
|
|
const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
|
|
|
|
while (buf + 16 <= end) {
|
|
__m256i in = _mm256_loadu_si256((__m256i*)buf);
|
|
if (big_endian) {
|
|
const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
|
|
17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
|
|
in = _mm256_shuffle_epi8(in, swap);
|
|
}
|
|
|
|
// 1. Check if there are any surrogate word in the input chunk.
|
|
// We have also deal with situation when there is a surrogate word
|
|
// at the end of a chunk.
|
|
const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
|
|
|
|
// bitmask = 0x0000 if there are no surrogates
|
|
// = 0xc000 if the last word is a surrogate
|
|
const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
|
|
// It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
|
|
// it is likely an uncommon occurrence.
|
|
if (surrogates_bitmask == 0x00000000) {
|
|
// case: we extend all sixteen 16-bit words to sixteen 32-bit words
|
|
_mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
|
|
_mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in,1)));
|
|
utf32_output += 16;
|
|
buf += 16;
|
|
// surrogate pair(s) in a register
|
|
} else {
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// in the presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
|
|
if((word &0xF800 ) != 0xD800) {
|
|
// No surrogate pair
|
|
*utf32_output++ = char32_t(word);
|
|
} else {
|
|
// must be a surrogate pair
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
|
|
k++;
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, utf32_output); }
|
|
uint32_t value = (diff << 10) + diff2 + 0x10000;
|
|
*utf32_output++ = char32_t(value);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
return std::make_pair(buf, utf32_output);
|
|
}
|
|
|
|
|
|
/*
|
|
Returns a pair: a result struct and utf32_output.
|
|
If there is an error, the count field of the result is the position of the error.
|
|
Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
|
|
A scalar routine should carry on the conversion of the tail if needed.
|
|
*/
|
|
template <endianness big_endian>
|
|
std::pair<result, char32_t*> avx2_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) {
|
|
const char16_t* start = buf;
|
|
const char16_t* end = buf + len;
|
|
const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
|
|
const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
|
|
|
|
while (buf + 16 <= end) {
|
|
__m256i in = _mm256_loadu_si256((__m256i*)buf);
|
|
if (big_endian) {
|
|
const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
|
|
17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
|
|
in = _mm256_shuffle_epi8(in, swap);
|
|
}
|
|
|
|
// 1. Check if there are any surrogate word in the input chunk.
|
|
// We have also deal with situation when there is a surrogate word
|
|
// at the end of a chunk.
|
|
const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
|
|
|
|
// bitmask = 0x0000 if there are no surrogates
|
|
// = 0xc000 if the last word is a surrogate
|
|
const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
|
|
// It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
|
|
// it is likely an uncommon occurrence.
|
|
if (surrogates_bitmask == 0x00000000) {
|
|
// case: we extend all sixteen 16-bit words to sixteen 32-bit words
|
|
_mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
|
|
_mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in,1)));
|
|
utf32_output += 16;
|
|
buf += 16;
|
|
// surrogate pair(s) in a register
|
|
} else {
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// in the presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
|
|
if((word &0xF800 ) != 0xD800) {
|
|
// No surrogate pair
|
|
*utf32_output++ = char32_t(word);
|
|
} else {
|
|
// must be a surrogate pair
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
|
|
k++;
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf32_output); }
|
|
uint32_t value = (diff << 10) + diff2 + 0x10000;
|
|
*utf32_output++ = char32_t(value);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
|
|
}
|
|
/* end file src/haswell/avx2_convert_utf16_to_utf32.cpp */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf32_to_utf8.cpp
|
|
/* begin file src/haswell/avx2_convert_utf32_to_utf8.cpp */
|
|
std::pair<const char32_t*, char*> avx2_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) {
|
|
const char32_t* end = buf + len;
|
|
const __m256i v_0000 = _mm256_setzero_si256();
|
|
const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
|
|
const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
|
|
const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
|
|
const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
|
|
const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
|
|
__m256i running_max = _mm256_setzero_si256();
|
|
__m256i forbidden_bytemask = _mm256_setzero_si256();
|
|
|
|
const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
|
|
|
|
while (buf + 16 + safety_margin <= end) {
|
|
__m256i in = _mm256_loadu_si256((__m256i*)buf);
|
|
__m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
|
|
running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
|
|
|
|
// Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
|
|
__m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
|
|
in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
|
|
|
|
// Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
|
|
|
|
if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
|
|
// 1. pack the bytes
|
|
const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
|
|
// 2. store (16 bytes)
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
|
|
// 3. adjust pointers
|
|
buf += 16;
|
|
utf8_output += 16;
|
|
continue; // we are done for this round!
|
|
}
|
|
// no bits set above 7th bit
|
|
const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
|
|
const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
|
|
|
|
// no bits set above 11th bit
|
|
const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
|
|
const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
|
|
if (one_or_two_bytes_bitmask == 0xffffffff) {
|
|
// 1. prepare 2-byte values
|
|
// input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
|
|
// expected output : [110a|aaaa|10bb|bbbb] x 8
|
|
const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
|
|
const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
|
|
|
|
// t0 = [000a|aaaa|bbbb|bb00]
|
|
const __m256i t0 = _mm256_slli_epi16(in_16, 2);
|
|
// t1 = [000a|aaaa|0000|0000]
|
|
const __m256i t1 = _mm256_and_si256(t0, v_1f00);
|
|
// t2 = [0000|0000|00bb|bbbb]
|
|
const __m256i t2 = _mm256_and_si256(in_16, v_003f);
|
|
// t3 = [000a|aaaa|00bb|bbbb]
|
|
const __m256i t3 = _mm256_or_si256(t1, t2);
|
|
// t4 = [110a|aaaa|10bb|bbbb]
|
|
const __m256i t4 = _mm256_or_si256(t3, v_c080);
|
|
|
|
// 2. merge ASCII and 2-byte codewords
|
|
const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
|
|
|
|
// 3. prepare bitmask for 8-bit lookup
|
|
const uint32_t M0 = one_byte_bitmask & 0x55555555;
|
|
const uint32_t M1 = M0 >> 7;
|
|
const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
|
|
// 4. pack the bytes
|
|
|
|
const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
|
|
const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
|
|
|
|
const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
|
|
const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
|
|
|
|
const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
|
|
// 5. store bytes
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
|
|
utf8_output += row[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
|
|
utf8_output += row_2[0];
|
|
|
|
// 6. adjust pointers
|
|
buf += 16;
|
|
continue;
|
|
}
|
|
// Must check for overflow in packing
|
|
const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
|
|
const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
|
|
if (saturation_bitmask == 0xffffffff) {
|
|
// case: words from register produce either 1, 2 or 3 UTF-8 bytes
|
|
const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
|
|
forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
|
|
|
|
const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
|
|
0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
|
|
|
|
/* In this branch we handle three cases:
|
|
1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte
|
|
2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes
|
|
3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
|
|
|
|
We expand the input word (16-bit) into two words (32-bit), thus
|
|
we have room for four bytes. However, we need five distinct bit
|
|
layouts. Note that the last byte in cases #2 and #3 is the same.
|
|
|
|
We precompute byte 1 for case #1 and the common byte for cases #2 & #3
|
|
in register t2.
|
|
|
|
We precompute byte 1 for case #3 and -- **conditionally** -- precompute
|
|
either byte 1 for case #2 or byte 2 for case #3. Note that they
|
|
differ by exactly one bit.
|
|
|
|
Finally from these two words we build proper UTF-8 sequence, taking
|
|
into account the case (i.e, the number of bytes to write).
|
|
*/
|
|
/**
|
|
* Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
|
|
* t2 => [0ccc|cccc] [10cc|cccc]
|
|
* s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
|
|
*/
|
|
#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
|
|
// [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
|
|
const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
|
|
// [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
|
|
const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
|
|
// [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
|
|
const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
|
|
|
|
// [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
|
|
const __m256i s0 = _mm256_srli_epi16(in_16, 4);
|
|
// [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
|
|
const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
|
|
// [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
|
|
const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
|
|
// [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
|
|
const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
|
|
const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
|
|
const __m256i s4 = _mm256_xor_si256(s3, m0);
|
|
#undef simdutf_vec
|
|
|
|
// 4. expand words 16-bit => 32-bit
|
|
const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
|
|
const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
|
|
|
|
// 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
|
|
const uint32_t mask = (one_byte_bitmask & 0x55555555) |
|
|
(one_or_two_bytes_bitmask & 0xaaaaaaaa);
|
|
// Due to the wider registers, the following path is less likely to be useful.
|
|
/*if(mask == 0) {
|
|
// We only have three-byte words. Use fast path.
|
|
const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
|
|
const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
|
|
const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
|
|
utf8_output += 12;
|
|
buf += 16;
|
|
continue;
|
|
}*/
|
|
const uint8_t mask0 = uint8_t(mask);
|
|
const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
|
|
const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
|
|
const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
|
|
|
|
const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
|
|
const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
|
|
const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
|
|
const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
|
|
|
|
const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
|
|
const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
|
|
const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
|
|
const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
|
|
|
|
|
|
const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
|
|
const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
|
|
const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
|
|
const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
|
|
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_0);
|
|
utf8_output += row0[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_1);
|
|
utf8_output += row1[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_2);
|
|
utf8_output += row2[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_3);
|
|
utf8_output += row3[0];
|
|
buf += 16;
|
|
} else {
|
|
// case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// may require large, non-trivial tables?
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint32_t word = buf[k];
|
|
if((word & 0xFFFFFF80)==0) { // 1-byte (ASCII)
|
|
*utf8_output++ = char(word);
|
|
} else if((word & 0xFFFFF800)==0) { // 2-byte
|
|
*utf8_output++ = char((word>>6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else if((word & 0xFFFF0000 )==0) { // 3-byte
|
|
if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); }
|
|
*utf8_output++ = char((word>>12) | 0b11100000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else { // 4-byte
|
|
if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); }
|
|
*utf8_output++ = char((word>>18) | 0b11110000);
|
|
*utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
|
|
// check for invalid input
|
|
const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
|
|
if(static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
|
|
return std::make_pair(nullptr, utf8_output);
|
|
}
|
|
|
|
if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); }
|
|
|
|
return std::make_pair(buf, utf8_output);
|
|
}
|
|
|
|
|
|
std::pair<result, char*> avx2_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
|
|
const char32_t* end = buf + len;
|
|
const char32_t* start = buf;
|
|
|
|
const __m256i v_0000 = _mm256_setzero_si256();
|
|
const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
|
|
const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
|
|
const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
|
|
const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
|
|
const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
|
|
const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
|
|
|
|
const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
|
|
|
|
while (buf + 16 + safety_margin <= end) {
|
|
__m256i in = _mm256_loadu_si256((__m256i*)buf);
|
|
__m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
|
|
// Check for too large input
|
|
const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
|
|
if(static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
|
|
return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
|
|
}
|
|
|
|
// Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
|
|
__m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
|
|
in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
|
|
|
|
// Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
|
|
|
|
if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
|
|
// 1. pack the bytes
|
|
const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
|
|
// 2. store (16 bytes)
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
|
|
// 3. adjust pointers
|
|
buf += 16;
|
|
utf8_output += 16;
|
|
continue; // we are done for this round!
|
|
}
|
|
// no bits set above 7th bit
|
|
const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
|
|
const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
|
|
|
|
// no bits set above 11th bit
|
|
const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
|
|
const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
|
|
if (one_or_two_bytes_bitmask == 0xffffffff) {
|
|
// 1. prepare 2-byte values
|
|
// input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
|
|
// expected output : [110a|aaaa|10bb|bbbb] x 8
|
|
const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
|
|
const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
|
|
|
|
// t0 = [000a|aaaa|bbbb|bb00]
|
|
const __m256i t0 = _mm256_slli_epi16(in_16, 2);
|
|
// t1 = [000a|aaaa|0000|0000]
|
|
const __m256i t1 = _mm256_and_si256(t0, v_1f00);
|
|
// t2 = [0000|0000|00bb|bbbb]
|
|
const __m256i t2 = _mm256_and_si256(in_16, v_003f);
|
|
// t3 = [000a|aaaa|00bb|bbbb]
|
|
const __m256i t3 = _mm256_or_si256(t1, t2);
|
|
// t4 = [110a|aaaa|10bb|bbbb]
|
|
const __m256i t4 = _mm256_or_si256(t3, v_c080);
|
|
|
|
// 2. merge ASCII and 2-byte codewords
|
|
const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
|
|
|
|
// 3. prepare bitmask for 8-bit lookup
|
|
const uint32_t M0 = one_byte_bitmask & 0x55555555;
|
|
const uint32_t M1 = M0 >> 7;
|
|
const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
|
|
// 4. pack the bytes
|
|
|
|
const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
|
|
const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
|
|
|
|
const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
|
|
const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
|
|
|
|
const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
|
|
// 5. store bytes
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
|
|
utf8_output += row[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
|
|
utf8_output += row_2[0];
|
|
|
|
// 6. adjust pointers
|
|
buf += 16;
|
|
continue;
|
|
}
|
|
// Must check for overflow in packing
|
|
const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
|
|
const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
|
|
if (saturation_bitmask == 0xffffffff) {
|
|
// case: words from register produce either 1, 2 or 3 UTF-8 bytes
|
|
|
|
// Check for illegal surrogate words
|
|
const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
|
|
const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
|
|
if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
|
|
return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
|
|
}
|
|
|
|
const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
|
|
0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
|
|
|
|
/* In this branch we handle three cases:
|
|
1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte
|
|
2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes
|
|
3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
|
|
|
|
We expand the input word (16-bit) into two words (32-bit), thus
|
|
we have room for four bytes. However, we need five distinct bit
|
|
layouts. Note that the last byte in cases #2 and #3 is the same.
|
|
|
|
We precompute byte 1 for case #1 and the common byte for cases #2 & #3
|
|
in register t2.
|
|
|
|
We precompute byte 1 for case #3 and -- **conditionally** -- precompute
|
|
either byte 1 for case #2 or byte 2 for case #3. Note that they
|
|
differ by exactly one bit.
|
|
|
|
Finally from these two words we build proper UTF-8 sequence, taking
|
|
into account the case (i.e, the number of bytes to write).
|
|
*/
|
|
/**
|
|
* Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
|
|
* t2 => [0ccc|cccc] [10cc|cccc]
|
|
* s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
|
|
*/
|
|
#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
|
|
// [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
|
|
const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
|
|
// [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
|
|
const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
|
|
// [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
|
|
const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
|
|
|
|
// [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
|
|
const __m256i s0 = _mm256_srli_epi16(in_16, 4);
|
|
// [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
|
|
const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
|
|
// [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
|
|
const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
|
|
// [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
|
|
const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
|
|
const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
|
|
const __m256i s4 = _mm256_xor_si256(s3, m0);
|
|
#undef simdutf_vec
|
|
|
|
// 4. expand words 16-bit => 32-bit
|
|
const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
|
|
const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
|
|
|
|
// 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
|
|
const uint32_t mask = (one_byte_bitmask & 0x55555555) |
|
|
(one_or_two_bytes_bitmask & 0xaaaaaaaa);
|
|
// Due to the wider registers, the following path is less likely to be useful.
|
|
/*if(mask == 0) {
|
|
// We only have three-byte words. Use fast path.
|
|
const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
|
|
const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
|
|
const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
|
|
utf8_output += 12;
|
|
buf += 16;
|
|
continue;
|
|
}*/
|
|
const uint8_t mask0 = uint8_t(mask);
|
|
const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
|
|
const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
|
|
const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
|
|
|
|
const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
|
|
const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
|
|
const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
|
|
const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
|
|
|
|
const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
|
|
const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
|
|
const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
|
|
const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
|
|
|
|
|
|
const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
|
|
const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
|
|
const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
|
|
const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
|
|
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_0);
|
|
utf8_output += row0[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_1);
|
|
utf8_output += row1[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_2);
|
|
utf8_output += row2[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_3);
|
|
utf8_output += row3[0];
|
|
buf += 16;
|
|
} else {
|
|
// case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// may require large, non-trivial tables?
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint32_t word = buf[k];
|
|
if((word & 0xFFFFFF80)==0) { // 1-byte (ASCII)
|
|
*utf8_output++ = char(word);
|
|
} else if((word & 0xFFFFF800)==0) { // 2-byte
|
|
*utf8_output++ = char((word>>6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else if((word & 0xFFFF0000 )==0) { // 3-byte
|
|
if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); }
|
|
*utf8_output++ = char((word>>12) | 0b11100000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else { // 4-byte
|
|
if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output); }
|
|
*utf8_output++ = char((word>>18) | 0b11110000);
|
|
*utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
|
|
return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
|
|
}
|
|
/* end file src/haswell/avx2_convert_utf32_to_utf8.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf32_to_utf16.cpp
|
|
/* begin file src/haswell/avx2_convert_utf32_to_utf16.cpp */
|
|
template <endianness big_endian>
|
|
std::pair<const char32_t*, char16_t*> avx2_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) {
|
|
const char32_t* end = buf + len;
|
|
|
|
const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
|
|
__m256i forbidden_bytemask = _mm256_setzero_si256();
|
|
|
|
|
|
while (buf + 8 + safety_margin <= end) {
|
|
__m256i in = _mm256_loadu_si256((__m256i*)buf);
|
|
|
|
const __m256i v_00000000 = _mm256_setzero_si256();
|
|
const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
|
|
|
|
// no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
|
|
const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
|
|
const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
|
|
|
|
if (saturation_bitmask == 0xffffffff) {
|
|
const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
|
|
const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
|
|
forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800));
|
|
|
|
__m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
|
|
if (big_endian) {
|
|
const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
|
|
}
|
|
_mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
|
|
utf16_output += 8;
|
|
buf += 8;
|
|
} else {
|
|
size_t forward = 7;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint32_t word = buf[k];
|
|
if((word & 0xFFFF0000)==0) {
|
|
// will not generate a surrogate pair
|
|
if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); }
|
|
*utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
|
|
} else {
|
|
// will generate a surrogate pair
|
|
if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); }
|
|
word -= 0x10000;
|
|
uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
|
|
uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
|
|
if (big_endian) {
|
|
high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
|
|
low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
|
|
}
|
|
*utf16_output++ = char16_t(high_surrogate);
|
|
*utf16_output++ = char16_t(low_surrogate);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
}
|
|
|
|
// check for invalid input
|
|
if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); }
|
|
|
|
return std::make_pair(buf, utf16_output);
|
|
}
|
|
|
|
|
|
template <endianness big_endian>
|
|
std::pair<result, char16_t*> avx2_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) {
|
|
const char32_t* start = buf;
|
|
const char32_t* end = buf + len;
|
|
|
|
const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
|
|
|
|
while (buf + 8 + safety_margin <= end) {
|
|
__m256i in = _mm256_loadu_si256((__m256i*)buf);
|
|
|
|
const __m256i v_00000000 = _mm256_setzero_si256();
|
|
const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
|
|
|
|
// no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
|
|
const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
|
|
const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
|
|
|
|
if (saturation_bitmask == 0xffffffff) {
|
|
const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
|
|
const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
|
|
const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
|
|
if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
|
|
return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output);
|
|
}
|
|
|
|
__m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
|
|
if (big_endian) {
|
|
const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
|
|
}
|
|
_mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
|
|
utf16_output += 8;
|
|
buf += 8;
|
|
} else {
|
|
size_t forward = 7;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint32_t word = buf[k];
|
|
if((word & 0xFFFF0000)==0) {
|
|
// will not generate a surrogate pair
|
|
if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); }
|
|
*utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
|
|
} else {
|
|
// will generate a surrogate pair
|
|
if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); }
|
|
word -= 0x10000;
|
|
uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
|
|
uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
|
|
if (big_endian) {
|
|
high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
|
|
low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
|
|
}
|
|
*utf16_output++ = char16_t(high_surrogate);
|
|
*utf16_output++ = char16_t(low_surrogate);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
}
|
|
|
|
return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
|
|
}
|
|
/* end file src/haswell/avx2_convert_utf32_to_utf16.cpp */
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdutf
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h
|
|
/* begin file src/generic/buf_block_reader.h */
|
|
namespace simdutf {
|
|
namespace haswell {
|
|
namespace {
|
|
|
|
// Walks through a buffer in block-sized increments, loading the last part with spaces
|
|
template<size_t STEP_SIZE>
|
|
struct buf_block_reader {
|
|
public:
|
|
simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
|
|
simdutf_really_inline size_t block_index();
|
|
simdutf_really_inline bool has_full_block() const;
|
|
simdutf_really_inline const uint8_t *full_block() const;
|
|
/**
|
|
* Get the last block, padded with spaces.
|
|
*
|
|
* There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
|
|
* function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
|
|
* will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
|
|
*
|
|
* @return the number of effective characters in the last block.
|
|
*/
|
|
simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
|
|
simdutf_really_inline void advance();
|
|
private:
|
|
const uint8_t *buf;
|
|
const size_t len;
|
|
const size_t lenminusstep;
|
|
size_t idx;
|
|
};
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
simdutf_unused static char * format_input_text_64(const uint8_t *text) {
|
|
static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
|
|
for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
|
|
buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
|
|
}
|
|
buf[sizeof(simd8x64<uint8_t>)] = '\0';
|
|
return buf;
|
|
}
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
|
|
static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
|
|
in.store(reinterpret_cast<uint8_t*>(buf));
|
|
for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
|
|
if (buf[i] < ' ') { buf[i] = '_'; }
|
|
}
|
|
buf[sizeof(simd8x64<uint8_t>)] = '\0';
|
|
return buf;
|
|
}
|
|
|
|
// Debugging helper: renders a 64-bit mask as a 64-character, NUL-terminated
// string, least significant bit first; set bits print as 'X', clear bits as ' '.
//
// The probe bit is built as a uint64_t: shifting a size_t by 32 or more is
// undefined where size_t is only 32 bits wide.
// The result lives in a function-local static buffer: it is overwritten by the
// next call and must not be freed.
simdutf_unused static char * format_mask(uint64_t mask) {
  static char buf[64 + 1];
  for (size_t i=0; i<64; i++) {
    buf[i] = (mask & (uint64_t(1) << i)) ? 'X' : ' ';
  }
  buf[64] = '\0';
  return buf;
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
|
|
return idx < lenminusstep;
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
|
|
return &buf[idx];
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
|
|
if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
|
|
std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
|
|
std::memcpy(dst, buf + idx, len - idx);
|
|
return len - idx;
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
|
|
idx += STEP_SIZE;
|
|
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdutf
|
|
/* end file src/generic/buf_block_reader.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
|
|
/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
|
|
namespace simdutf {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace utf8_validation {
|
|
|
|
using namespace simd;
|
|
|
|
// Classifies every (prev1, input) byte pair with three 16-entry nibble lookups
// (high nibble of byte 1, low nibble of byte 1, high nibble of byte 2) and ANDs
// the three results. A bit survives only when all three tables agree that the
// pair matches the corresponding error pattern.
simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // One bit per error class; the pattern next to each flag is <byte 1> <byte 2>.
  constexpr const uint8_t TOO_SHORT      = 1<<0; // 11______ 0_______
                                                 // 11______ 11______
  constexpr const uint8_t TOO_LONG       = 1<<1; // 0_______ 10______
  constexpr const uint8_t OVERLONG_3     = 1<<2; // 11100000 100_____
  constexpr const uint8_t TOO_LARGE      = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
                                                 // 1111011_ 1001____
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
  constexpr const uint8_t SURROGATE      = 1<<4; // 11101101 101_____
  constexpr const uint8_t OVERLONG_2     = 1<<5; // 1100000_ 10______
  constexpr const uint8_t TOO_LARGE_1000 = 1<<6; // 11110101 1000____
                                                 // 1111011_ 1000____
                                                 // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4     = 1<<6; // 11110000 1000____ (shares bit 6 with TOO_LARGE_1000)
  constexpr const uint8_t TWO_CONTS      = 1<<7; // 10______ 10______

  // Composite table entries.
  constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  constexpr const uint8_t CARRY_LARGE = CARRY | TOO_LARGE | TOO_LARGE_1000;
  constexpr const uint8_t AFTER_CONT = TOO_LONG | OVERLONG_2 | TWO_CONTS; // common to every 10______ byte 2

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
    // 0_______ ________ <ASCII in byte 1>
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    // 10______ ________ <continuation in byte 1>
    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
    // 1100____ ________ <two byte lead in byte 1>
    TOO_SHORT | OVERLONG_2,
    // 1101____ ________ <two byte lead in byte 1>
    TOO_SHORT,
    // 1110____ ________ <three byte lead in byte 1>
    TOO_SHORT | OVERLONG_3 | SURROGATE,
    // 1111____ ________ <four+ byte lead in byte 1>
    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
  );

  const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
    // ____0000 ________
    CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
    // ____0001 ________
    CARRY | OVERLONG_2,
    // ____001_ ________
    CARRY, CARRY,
    // ____0100 ________
    CARRY | TOO_LARGE,
    // ____0101 ________
    CARRY_LARGE,
    // ____011_ ________
    CARRY_LARGE, CARRY_LARGE,
    // ____1___ ________ (1000 through 1100)
    CARRY_LARGE, CARRY_LARGE, CARRY_LARGE, CARRY_LARGE, CARRY_LARGE,
    // ____1101 ________
    CARRY_LARGE | SURROGATE,
    // ____111_ ________
    CARRY_LARGE, CARRY_LARGE
  );

  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
    // ________ 0_______ <ASCII in byte 2>
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    // ________ 1000____
    AFTER_CONT | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
    // ________ 1001____
    AFTER_CONT | OVERLONG_3 | TOO_LARGE,
    // ________ 101_____
    AFTER_CONT | SURROGATE | TOO_LARGE,
    AFTER_CONT | SURROGATE | TOO_LARGE,
    // ________ 11______
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
  );

  return byte_1_high & byte_1_low & byte_2_high;
}
|
|
// Combines the special-case flags `sc` with the result of
// must_be_2_3_continuation() evaluated on the bytes two and three positions
// back. Only bit 7 of that result is kept (the same bit check_special_cases
// uses for TWO_CONTS) and it is XORed into `sc`, so the two signals cancel
// where both are set and flag an error where only one is.
simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
  const simd8<uint8_t> two_back = input.prev<2>(prev_input);
  const simd8<uint8_t> three_back = input.prev<3>(prev_input);
  const simd8<uint8_t> needs_cont = simd8<uint8_t>(must_be_2_3_continuation(two_back, three_back));
  const simd8<uint8_t> needs_cont_bit7 = needs_cont & uint8_t(0x80);
  return needs_cont_bit7 ^ sc;
}
|
|
|
|
//
|
|
// Return nonzero if there are incomplete multibyte characters at the end of the block:
|
|
// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
|
|
//
|
|
simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
|
|
// If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
|
|
// ... 1111____ 111_____ 11______
|
|
static const uint8_t max_array[32] = {
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
|
|
};
|
|
const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
|
|
return input.gt_bits(max_value);
|
|
}
|
|
|
|
struct utf8_checker {
|
|
// If this is nonzero, there has been a UTF-8 error.
|
|
simd8<uint8_t> error;
|
|
// The last input we received
|
|
simd8<uint8_t> prev_input_block;
|
|
// Whether the last input we received was incomplete (used for ASCII fast path)
|
|
simd8<uint8_t> prev_incomplete;
|
|
|
|
//
|
|
// Check whether the current bytes are valid UTF-8.
|
|
//
|
|
simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
|
|
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
|
|
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
|
|
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
|
|
simd8<uint8_t> sc = check_special_cases(input, prev1);
|
|
this->error |= check_multibyte_lengths(input, prev_input, sc);
|
|
}
|
|
|
|
// The only problem that can happen at EOF is that a multibyte character is too short
|
|
// or a byte value too large in the last bytes: check_special_cases only checks for bytes
|
|
// too large in the first of two bytes.
|
|
simdutf_really_inline void check_eof() {
|
|
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
|
|
// possibly finish them.
|
|
this->error |= this->prev_incomplete;
|
|
}
|
|
|
|
simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
|
|
if(simdutf_likely(is_ascii(input))) {
|
|
this->error |= this->prev_incomplete;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support either two or four chunks per 64-byte block.");
|
|
if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
|
|
this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
|
|
|
|
}
|
|
}
|
|
|
|
// do not forget to call check_eof!
|
|
simdutf_really_inline bool errors() const {
|
|
return this->error.any_bits_set_anywhere();
|
|
}
|
|
|
|
}; // struct utf8_checker
|
|
} // namespace utf8_validation
|
|
|
|
using utf8_validation::utf8_checker;
|
|
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
|
|
/* begin file src/generic/utf8_validation/utf8_validator.h */
|
|
namespace simdutf {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace utf8_validation {
|
|
|
|
/**
|
|
* Validates that the string is actual UTF-8.
|
|
*/
|
|
template<class checker>
|
|
bool generic_validate_utf8(const uint8_t * input, size_t length) {
|
|
checker c{};
|
|
buf_block_reader<64> reader(input, length);
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
c.check_eof();
|
|
return !c.errors();
|
|
}
|
|
|
|
bool generic_validate_utf8(const char * input, size_t length) {
|
|
return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
|
|
}
|
|
|
|
/**
|
|
* Validates that the string is actual UTF-8 and stops on errors.
|
|
*/
|
|
template<class checker>
|
|
result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
|
|
checker c{};
|
|
buf_block_reader<64> reader(input, length);
|
|
size_t count{0};
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
c.check_next_input(in);
|
|
if(c.errors()) {
|
|
if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
|
|
result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input), reinterpret_cast<const char*>(input + count), length - count);
|
|
res.count += count;
|
|
return res;
|
|
}
|
|
reader.advance();
|
|
count += 64;
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
c.check_eof();
|
|
if (c.errors()) {
|
|
if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
|
|
result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input), reinterpret_cast<const char*>(input) + count, length - count);
|
|
res.count += count;
|
|
return res;
|
|
} else {
|
|
return result(error_code::SUCCESS, length);
|
|
}
|
|
}
|
|
|
|
// char-pointer convenience overload: validates with the default utf8_checker.
result generic_validate_utf8_with_errors(const char * input, size_t length) {
  const uint8_t * const bytes = reinterpret_cast<const uint8_t *>(input);
  return generic_validate_utf8_with_errors<utf8_checker>(bytes, length);
}
|
|
|
|
template<class checker>
|
|
bool generic_validate_ascii(const uint8_t * input, size_t length) {
|
|
buf_block_reader<64> reader(input, length);
|
|
uint8_t blocks[64]{};
|
|
simd::simd8x64<uint8_t> running_or(blocks);
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
running_or |= in;
|
|
reader.advance();
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
running_or |= in;
|
|
return running_or.is_ascii();
|
|
}
|
|
|
|
bool generic_validate_ascii(const char * input, size_t length) {
|
|
return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
|
|
}
|
|
|
|
template<class checker>
|
|
result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
|
|
buf_block_reader<64> reader(input, length);
|
|
size_t count{0};
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
if (!in.is_ascii()) {
|
|
result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
|
|
return result(res.error, count + res.count);
|
|
}
|
|
reader.advance();
|
|
|
|
count += 64;
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
if (!in.is_ascii()) {
|
|
result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
|
|
return result(res.error, count + res.count);
|
|
} else {
|
|
return result(error_code::SUCCESS, length);
|
|
}
|
|
}
|
|
|
|
// char-pointer convenience overload of the ASCII check with error reporting.
result generic_validate_ascii_with_errors(const char * input, size_t length) {
  const uint8_t * const bytes = reinterpret_cast<const uint8_t *>(input);
  return generic_validate_ascii_with_errors<utf8_checker>(bytes, length);
}
|
|
|
|
} // namespace utf8_validation
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_validation/utf8_validator.h */
|
|
// transcoding from UTF-8 to UTF-16
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
|
|
/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
|
|
|
|
|
|
namespace simdutf {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace utf8_to_utf16 {
|
|
|
|
using namespace simd;
|
|
|
|
template <endianness endian>
|
|
simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
|
|
char16_t* utf16_output) noexcept {
|
|
// The implementation is not specific to haswell and should be moved to the generic directory.
|
|
size_t pos = 0;
|
|
char16_t* start{utf16_output};
|
|
const size_t safety_margin = 16; // to avoid overruns!
|
|
while(pos + 64 + safety_margin <= size) {
|
|
// this loop could be unrolled further. For example, we could process the mask
|
|
// far more than 64 bytes.
|
|
simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
|
|
if(in.is_ascii()) {
|
|
in.store_ascii_as_utf16<endian>(utf16_output);
|
|
utf16_output += 64;
|
|
pos += 64;
|
|
} else {
|
|
// Slow path. We hope that the compiler will recognize that this is a slow path.
|
|
// Anything that is not a continuation mask is a 'leading byte', that is, the
|
|
// start of a new code point.
|
|
uint64_t utf8_continuation_mask = in.lt(-65 + 1);
|
|
// -65 is 0b10111111 in two-complement's, so largest possible continuation byte
|
|
uint64_t utf8_leading_mask = ~utf8_continuation_mask;
|
|
// The *start* of code points is not so useful, rather, we want the *end* of code points.
|
|
uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
|
|
// We process in blocks of up to 12 bytes except possibly
|
|
// for fast paths which may process up to 16 bytes. For the
|
|
// slow path to work, we should have at least 12 input bytes left.
|
|
size_t max_starting_point = (pos + 64) - 12;
|
|
// Next loop is going to run at least five times when using solely
|
|
// the slow/regular path, and at least four times if there are fast paths.
|
|
while(pos < max_starting_point) {
|
|
// Performance note: our ability to compute 'consumed' and
|
|
// then shift and recompute is critical. If there is a
|
|
// latency of, say, 4 cycles on getting 'consumed', then
|
|
// the inner loop might have a total latency of about 6 cycles.
|
|
// Yet we process between 6 to 12 inputs bytes, thus we get
|
|
// a speed limit between 1 cycle/byte and 0.5 cycle/byte
|
|
// for this section of the code. Hence, there is a limit
|
|
// to how much we can further increase this latency before
|
|
// it seriously harms performance.
|
|
//
|
|
// Thus we may allow convert_masked_utf8_to_utf16 to process
|
|
// more bytes at a time under a fast-path mode where 16 bytes
|
|
// are consumed at once (e.g., when encountering ASCII).
|
|
size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
|
|
utf8_end_of_code_point_mask, utf16_output);
|
|
pos += consumed;
|
|
utf8_end_of_code_point_mask >>= consumed;
|
|
}
|
|
// At this point there may remain between 0 and 12 bytes in the
|
|
// 64-byte block. These bytes will be processed again. So we have an
|
|
// 80% efficiency (in the worst case). In practice we expect an
|
|
// 85% to 90% efficiency.
|
|
}
|
|
}
|
|
utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
|
|
return utf16_output - start;
|
|
}
|
|
|
|
} // namespace utf8_to_utf16
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
|
|
/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
|
|
|
|
|
|
namespace simdutf {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace utf8_to_utf16 {
|
|
using namespace simd;
|
|
|
|
|
|
// Classifies every (prev1, input) byte pair with three 16-entry nibble lookups
// (high nibble of byte 1, low nibble of byte 1, high nibble of byte 2) and ANDs
// the three results. A bit survives only when all three tables agree that the
// pair matches the corresponding error pattern.
simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // One bit per error class; the pattern next to each flag is <byte 1> <byte 2>.
  constexpr const uint8_t TOO_SHORT      = 1<<0; // 11______ 0_______
                                                 // 11______ 11______
  constexpr const uint8_t TOO_LONG       = 1<<1; // 0_______ 10______
  constexpr const uint8_t OVERLONG_3     = 1<<2; // 11100000 100_____
  constexpr const uint8_t TOO_LARGE      = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
                                                 // 1111011_ 1001____
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
  constexpr const uint8_t SURROGATE      = 1<<4; // 11101101 101_____
  constexpr const uint8_t OVERLONG_2     = 1<<5; // 1100000_ 10______
  constexpr const uint8_t TOO_LARGE_1000 = 1<<6; // 11110101 1000____
                                                 // 1111011_ 1000____
                                                 // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4     = 1<<6; // 11110000 1000____ (shares bit 6 with TOO_LARGE_1000)
  constexpr const uint8_t TWO_CONTS      = 1<<7; // 10______ 10______

  // Composite table entries.
  constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  constexpr const uint8_t CARRY_LARGE = CARRY | TOO_LARGE | TOO_LARGE_1000;
  constexpr const uint8_t AFTER_CONT = TOO_LONG | OVERLONG_2 | TWO_CONTS; // common to every 10______ byte 2

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
    // 0_______ ________ <ASCII in byte 1>
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    // 10______ ________ <continuation in byte 1>
    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
    // 1100____ ________ <two byte lead in byte 1>
    TOO_SHORT | OVERLONG_2,
    // 1101____ ________ <two byte lead in byte 1>
    TOO_SHORT,
    // 1110____ ________ <three byte lead in byte 1>
    TOO_SHORT | OVERLONG_3 | SURROGATE,
    // 1111____ ________ <four+ byte lead in byte 1>
    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
  );

  const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
    // ____0000 ________
    CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
    // ____0001 ________
    CARRY | OVERLONG_2,
    // ____001_ ________
    CARRY, CARRY,
    // ____0100 ________
    CARRY | TOO_LARGE,
    // ____0101 ________
    CARRY_LARGE,
    // ____011_ ________
    CARRY_LARGE, CARRY_LARGE,
    // ____1___ ________ (1000 through 1100)
    CARRY_LARGE, CARRY_LARGE, CARRY_LARGE, CARRY_LARGE, CARRY_LARGE,
    // ____1101 ________
    CARRY_LARGE | SURROGATE,
    // ____111_ ________
    CARRY_LARGE, CARRY_LARGE
  );

  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
    // ________ 0_______ <ASCII in byte 2>
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    // ________ 1000____
    AFTER_CONT | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
    // ________ 1001____
    AFTER_CONT | OVERLONG_3 | TOO_LARGE,
    // ________ 101_____
    AFTER_CONT | SURROGATE | TOO_LARGE,
    AFTER_CONT | SURROGATE | TOO_LARGE,
    // ________ 11______
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
  );

  return byte_1_high & byte_1_low & byte_2_high;
}
|
|
// Combines the special-case flags `sc` with the result of
// must_be_2_3_continuation() evaluated on the bytes two and three positions
// back. Only bit 7 of that result is kept (the same bit check_special_cases
// uses for TWO_CONTS) and it is XORed into `sc`, so the two signals cancel
// where both are set and flag an error where only one is.
simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
  const simd8<uint8_t> two_back = input.prev<2>(prev_input);
  const simd8<uint8_t> three_back = input.prev<3>(prev_input);
  const simd8<uint8_t> needs_cont = simd8<uint8_t>(must_be_2_3_continuation(two_back, three_back));
  const simd8<uint8_t> needs_cont_bit7 = needs_cont & uint8_t(0x80);
  return needs_cont_bit7 ^ sc;
}
|
|
|
|
|
|
struct validating_transcoder {
|
|
// If this is nonzero, there has been a UTF-8 error.
|
|
simd8<uint8_t> error;
|
|
|
|
validating_transcoder() : error(uint8_t(0)) {}
|
|
//
|
|
// Check whether the current bytes are valid UTF-8.
|
|
//
|
|
simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
|
|
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
|
|
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
|
|
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
|
|
simd8<uint8_t> sc = check_special_cases(input, prev1);
|
|
this->error |= check_multibyte_lengths(input, prev_input, sc);
|
|
}
|
|
|
|
|
|
template <endianness endian>
|
|
simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) {
|
|
size_t pos = 0;
|
|
char16_t* start{utf16_output};
|
|
// In the worst case, we have the haswell kernel which can cause an overflow of
|
|
// 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
|
|
// and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
|
|
// much more than 8 bytes. However, you cannot generally assume that you have valid
|
|
// UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
|
|
// to give us a good margin.
|
|
size_t leading_byte = 0;
|
|
size_t margin = size;
|
|
for(; margin > 0 && leading_byte < 8; margin--) {
|
|
leading_byte += (int8_t(in[margin-1]) > -65);
|
|
}
|
|
// If the input is long enough, then we have that margin-1 is the eight last leading byte.
|
|
const size_t safety_margin = size - margin + 1; // to avoid overruns!
|
|
while(pos + 64 + safety_margin <= size) {
|
|
simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
|
|
if(input.is_ascii()) {
|
|
input.store_ascii_as_utf16<endian>(utf16_output);
|
|
utf16_output += 64;
|
|
pos += 64;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support either two or four chunks per 64-byte block.");
|
|
auto zero = simd8<uint8_t>{uint8_t(0)};
|
|
if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
uint64_t utf8_continuation_mask = input.lt(-65 + 1);
|
|
uint64_t utf8_leading_mask = ~utf8_continuation_mask;
|
|
uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
|
|
// We process in blocks of up to 12 bytes except possibly
|
|
// for fast paths which may process up to 16 bytes. For the
|
|
// slow path to work, we should have at least 12 input bytes left.
|
|
size_t max_starting_point = (pos + 64) - 12;
|
|
// Next loop is going to run at least five times.
|
|
while(pos < max_starting_point) {
|
|
// Performance note: our ability to compute 'consumed' and
|
|
// then shift and recompute is critical. If there is a
|
|
// latency of, say, 4 cycles on getting 'consumed', then
|
|
// the inner loop might have a total latency of about 6 cycles.
|
|
// Yet we process between 6 to 12 inputs bytes, thus we get
|
|
// a speed limit between 1 cycle/byte and 0.5 cycle/byte
|
|
// for this section of the code. Hence, there is a limit
|
|
// to how much we can further increase this latency before
|
|
// it seriously harms performance.
|
|
size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
|
|
utf8_end_of_code_point_mask, utf16_output);
|
|
pos += consumed;
|
|
utf8_end_of_code_point_mask >>= consumed;
|
|
}
|
|
// At this point there may remain between 0 and 12 bytes in the
|
|
// 64-byte block. These bytes will be processed again. So we have an
|
|
// 80% efficiency (in the worst case). In practice we expect an
|
|
// 85% to 90% efficiency.
|
|
}
|
|
}
|
|
if(errors()) { return 0; }
|
|
if(pos < size) {
|
|
size_t howmany = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
|
|
if(howmany == 0) { return 0; }
|
|
utf16_output += howmany;
|
|
}
|
|
return utf16_output - start;
|
|
}
|
|
|
|
template <endianness endian>
|
|
simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) {
|
|
size_t pos = 0;
|
|
char16_t* start{utf16_output};
|
|
// In the worst case, we have the haswell kernel which can cause an overflow of
|
|
// 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
|
|
// and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
|
|
// much more than 8 bytes. However, you cannot generally assume that you have valid
|
|
// UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
|
|
// to give us a good margin.
|
|
size_t leading_byte = 0;
|
|
size_t margin = size;
|
|
for(; margin > 0 && leading_byte < 8; margin--) {
|
|
leading_byte += (int8_t(in[margin-1]) > -65);
|
|
}
|
|
// If the input is long enough, then we have that margin-1 is the eight last leading byte.
|
|
const size_t safety_margin = size - margin + 1; // to avoid overruns!
|
|
while(pos + 64 + safety_margin <= size) {
|
|
simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
|
|
if(input.is_ascii()) {
|
|
input.store_ascii_as_utf16<endian>(utf16_output);
|
|
utf16_output += 64;
|
|
pos += 64;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support either two or four chunks per 64-byte block.");
|
|
auto zero = simd8<uint8_t>{uint8_t(0)};
|
|
if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
if (errors()) {
|
|
// rewind_and_convert_with_errors will seek a potential error from in+pos onward,
|
|
// with the ability to go back up to pos bytes, and read size-pos bytes forward.
|
|
result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
|
|
res.count += pos;
|
|
return res;
|
|
}
|
|
uint64_t utf8_continuation_mask = input.lt(-65 + 1);
|
|
uint64_t utf8_leading_mask = ~utf8_continuation_mask;
|
|
uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
|
|
// We process in blocks of up to 12 bytes except possibly
|
|
// for fast paths which may process up to 16 bytes. For the
|
|
// slow path to work, we should have at least 12 input bytes left.
|
|
size_t max_starting_point = (pos + 64) - 12;
|
|
// Next loop is going to run at least five times.
|
|
while(pos < max_starting_point) {
|
|
// Performance note: our ability to compute 'consumed' and
|
|
// then shift and recompute is critical. If there is a
|
|
// latency of, say, 4 cycles on getting 'consumed', then
|
|
// the inner loop might have a total latency of about 6 cycles.
|
|
// Yet we process between 6 to 12 inputs bytes, thus we get
|
|
// a speed limit between 1 cycle/byte and 0.5 cycle/byte
|
|
// for this section of the code. Hence, there is a limit
|
|
// to how much we can further increase this latency before
|
|
// it seriously harms performance.
|
|
size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
|
|
utf8_end_of_code_point_mask, utf16_output);
|
|
pos += consumed;
|
|
utf8_end_of_code_point_mask >>= consumed;
|
|
}
|
|
// At this point there may remain between 0 and 12 bytes in the
|
|
// 64-byte block. These bytes will be processed again. So we have an
|
|
// 80% efficiency (in the worst case). In practice we expect an
|
|
// 85% to 90% efficiency.
|
|
}
|
|
}
|
|
if(errors()) {
|
|
// rewind_and_convert_with_errors will seek a potential error from in+pos onward,
|
|
// with the ability to go back up to pos bytes, and read size-pos bytes forward.
|
|
result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
|
|
res.count += pos;
|
|
return res;
|
|
}
|
|
if(pos < size) {
|
|
// rewind_and_convert_with_errors will seek a potential error from in+pos onward,
|
|
// with the ability to go back up to pos bytes, and read size-pos bytes forward.
|
|
result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
|
|
if (res.error) { // In case of error, we want the error position
|
|
res.count += pos;
|
|
return res;
|
|
} else { // In case of success, we want the number of word written
|
|
utf16_output += res.count;
|
|
}
|
|
}
|
|
return result(error_code::SUCCESS, utf16_output - start);
|
|
}
|
|
|
|
// Tells whether any error bit has been accumulated in `error` so far.
simdutf_really_inline bool errors() const {
  const bool any_error_bit = error.any_bits_set_anywhere();
  return any_error_bit;
}
|
|
|
|
}; // struct utf8_checker
|
|
} // utf8_to_utf16 namespace
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
|
|
// transcoding from UTF-8 to UTF-32
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h
|
|
/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
|
|
|
|
namespace simdutf {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace utf8_to_utf32 {
|
|
|
|
using namespace simd;
|
|
|
|
|
|
// Transcodes UTF-8 that the caller guarantees to be valid into UTF-32.
// Full 64-byte blocks are handled with SIMD as long as at least 16 bytes
// remain after the block; the rest is delegated to the scalar routine.
// Returns the number of char32_t values written to utf32_output.
simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
                                         char32_t* utf32_output) noexcept {
  char32_t* const output_begin = utf32_output;
  const size_t safety_margin = 16; // to avoid overruns!
  size_t pos = 0;
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
    if (in.is_ascii()) {
      // Pure ASCII block: widen all 64 bytes in one go.
      in.store_ascii_as_utf32(utf32_output);
      utf32_output += 64;
      pos += 64;
      continue;
    }
    // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
    const uint64_t utf8_continuation_mask = in.lt(-65 + 1);
    const uint64_t utf8_leading_mask = ~utf8_continuation_mask;
    uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
    // Stop 12 bytes before the end of the block; the leftover bytes are
    // picked up again by the next iteration or by the scalar tail.
    const size_t max_starting_point = (pos + 64) - 12;
    while (pos < max_starting_point) {
      const size_t consumed = convert_masked_utf8_to_utf32(
          input + pos, utf8_end_of_code_point_mask, utf32_output);
      utf8_end_of_code_point_mask >>= consumed;
      pos += consumed;
    }
  }
  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
  return utf32_output - output_begin;
}
|
|
|
|
|
|
} // namespace utf8_to_utf32
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
|
|
/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
|
|
|
|
|
|
namespace simdutf {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace utf8_to_utf32 {
|
|
using namespace simd;
|
|
|
|
|
|
// Classifies each (previous byte, current byte) pair with three 16-entry
// nibble lookups (high nibble of byte 1, low nibble of byte 1, high nibble
// of byte 2) and ANDs the three results. A bit that survives the AND marks
// the corresponding error class for that byte pair.
simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 3 = Too Large
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 6 = Too Large (second byte 1000____) / Overlong 4-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                              // 11______ 11______
  constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
  constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
  constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
  constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
  constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                              // 11110100 101_____
                                              // 11110101 1001____
                                              // 11110101 101_____
                                              // 1111011_ 1001____
                                              // 1111011_ 101_____
                                              // 11111___ 1001____
                                              // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
                                              // 11110101 1000____
                                              // 1111011_ 1000____
                                              // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____

  // These all have ____ in byte 1 .
  constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;
  // Shared entry of the low-nibble table for every nibble from 0101 upward.
  constexpr const uint8_t CARRY_TOO_LARGE_ANY = CARRY | TOO_LARGE | TOO_LARGE_1000;

  const simd8<uint8_t> first_high_nibble = prev1.shr<4>().lookup_16<uint8_t>(
    // 0_______ ________ <ASCII in byte 1>
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    // 10______ ________ <continuation in byte 1>
    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
    // 1100____ ________ <two byte lead in byte 1>
    TOO_SHORT | OVERLONG_2,
    // 1101____ ________ <two byte lead in byte 1>
    TOO_SHORT,
    // 1110____ ________ <three byte lead in byte 1>
    TOO_SHORT | OVERLONG_3 | SURROGATE,
    // 1111____ ________ <four+ byte lead in byte 1>
    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
  );

  const simd8<uint8_t> first_low_nibble = (prev1 & 0x0F).lookup_16<uint8_t>(
    // ____0000 ________
    CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
    // ____0001 ________
    CARRY | OVERLONG_2,
    // ____001_ ________
    CARRY,
    CARRY,

    // ____0100 ________
    CARRY | TOO_LARGE,
    // ____0101 ________
    CARRY_TOO_LARGE_ANY,
    // ____011_ ________
    CARRY_TOO_LARGE_ANY,
    CARRY_TOO_LARGE_ANY,

    // ____1___ ________
    CARRY_TOO_LARGE_ANY,
    CARRY_TOO_LARGE_ANY,
    CARRY_TOO_LARGE_ANY,
    CARRY_TOO_LARGE_ANY,
    CARRY_TOO_LARGE_ANY,
    // ____1101 ________
    CARRY_TOO_LARGE_ANY | SURROGATE,
    CARRY_TOO_LARGE_ANY,
    CARRY_TOO_LARGE_ANY
  );

  const simd8<uint8_t> second_high_nibble = input.shr<4>().lookup_16<uint8_t>(
    // ________ 0_______ <ASCII in byte 2>
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

    // ________ 1000____
    TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
    // ________ 1001____
    TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
    // ________ 101_____
    TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
    TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,

    // ________ 11______
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
  );

  // Only error classes flagged by all three lookups remain set.
  return (first_high_nibble & first_low_nibble & second_high_nibble);
}
|
|
// Combines the special-case result `sc` with the positions reported by
// must_be_2_3_continuation for the bytes two and three places back:
// the 0x80 bit of that report is XORed into `sc` and the result returned.
simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
  const simd8<uint8_t> two_back = input.prev<2>(prev_input);
  const simd8<uint8_t> three_back = input.prev<3>(prev_input);
  const simd8<uint8_t> expected_continuation =
      simd8<uint8_t>(must_be_2_3_continuation(two_back, three_back));
  // Keep only the top bit so that it lines up with the TWO_CONTS bit of `sc`.
  const simd8<uint8_t> expected_top_bit = expected_continuation & uint8_t(0x80);
  return expected_top_bit ^ sc;
}
|
|
|
|
|
|
struct validating_transcoder {
|
|
// If this is nonzero, there has been a UTF-8 error.
|
|
simd8<uint8_t> error;
|
|
|
|
validating_transcoder() : error(uint8_t(0)) {}
|
|
//
|
|
// Check whether the current bytes are valid UTF-8.
|
|
//
|
|
simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
|
|
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
|
|
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
|
|
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
|
|
simd8<uint8_t> sc = check_special_cases(input, prev1);
|
|
this->error |= check_multibyte_lengths(input, prev_input, sc);
|
|
}
|
|
|
|
|
|
|
|
simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) {
|
|
size_t pos = 0;
|
|
char32_t* start{utf32_output};
|
|
// In the worst case, we have the haswell kernel which can cause an overflow of
|
|
// 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
|
|
// and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
|
|
// much more than 8 bytes. However, you cannot generally assume that you have valid
|
|
// UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
|
|
// to give us a good margin.
|
|
size_t leading_byte = 0;
|
|
size_t margin = size;
|
|
for(; margin > 0 && leading_byte < 4; margin--) {
|
|
leading_byte += (int8_t(in[margin-1]) > -65);
|
|
}
|
|
// If the input is long enough, then we have that margin-1 is the fourth last leading byte.
|
|
const size_t safety_margin = size - margin + 1; // to avoid overruns!
|
|
while(pos + 64 + safety_margin <= size) {
|
|
simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
|
|
if(input.is_ascii()) {
|
|
input.store_ascii_as_utf32(utf32_output);
|
|
utf32_output += 64;
|
|
pos += 64;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support either two or four chunks per 64-byte block.");
|
|
auto zero = simd8<uint8_t>{uint8_t(0)};
|
|
if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
uint64_t utf8_continuation_mask = input.lt(-65 + 1);
|
|
uint64_t utf8_leading_mask = ~utf8_continuation_mask;
|
|
uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
|
|
// We process in blocks of up to 12 bytes except possibly
|
|
// for fast paths which may process up to 16 bytes. For the
|
|
// slow path to work, we should have at least 12 input bytes left.
|
|
size_t max_starting_point = (pos + 64) - 12;
|
|
// Next loop is going to run at least five times.
|
|
while(pos < max_starting_point) {
|
|
// Performance note: our ability to compute 'consumed' and
|
|
// then shift and recompute is critical. If there is a
|
|
// latency of, say, 4 cycles on getting 'consumed', then
|
|
// the inner loop might have a total latency of about 6 cycles.
|
|
// Yet we process between 6 to 12 inputs bytes, thus we get
|
|
// a speed limit between 1 cycle/byte and 0.5 cycle/byte
|
|
// for this section of the code. Hence, there is a limit
|
|
// to how much we can further increase this latency before
|
|
// it seriously harms performance.
|
|
size_t consumed = convert_masked_utf8_to_utf32(in + pos,
|
|
utf8_end_of_code_point_mask, utf32_output);
|
|
pos += consumed;
|
|
utf8_end_of_code_point_mask >>= consumed;
|
|
}
|
|
// At this point there may remain between 0 and 12 bytes in the
|
|
// 64-byte block. These bytes will be processed again. So we have an
|
|
// 80% efficiency (in the worst case). In practice we expect an
|
|
// 85% to 90% efficiency.
|
|
}
|
|
}
|
|
if(errors()) { return 0; }
|
|
if(pos < size) {
|
|
size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
|
|
if(howmany == 0) { return 0; }
|
|
utf32_output += howmany;
|
|
}
|
|
return utf32_output - start;
|
|
}
|
|
|
|
simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) {
|
|
size_t pos = 0;
|
|
char32_t* start{utf32_output};
|
|
// In the worst case, we have the haswell kernel which can cause an overflow of
|
|
// 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
|
|
// and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
|
|
// much more than 8 bytes. However, you cannot generally assume that you have valid
|
|
// UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
|
|
// to give us a good margin.
|
|
size_t leading_byte = 0;
|
|
size_t margin = size;
|
|
for(; margin > 0 && leading_byte < 4; margin--) {
|
|
leading_byte += (int8_t(in[margin-1]) > -65);
|
|
}
|
|
// If the input is long enough, then we have that margin-1 is the fourth last leading byte.
|
|
const size_t safety_margin = size - margin + 1; // to avoid overruns!
|
|
while(pos + 64 + safety_margin <= size) {
|
|
simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
|
|
if(input.is_ascii()) {
|
|
input.store_ascii_as_utf32(utf32_output);
|
|
utf32_output += 64;
|
|
pos += 64;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support either two or four chunks per 64-byte block.");
|
|
auto zero = simd8<uint8_t>{uint8_t(0)};
|
|
if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
if (errors()) {
|
|
result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
|
|
res.count += pos;
|
|
return res;
|
|
}
|
|
uint64_t utf8_continuation_mask = input.lt(-65 + 1);
|
|
uint64_t utf8_leading_mask = ~utf8_continuation_mask;
|
|
uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
|
|
// We process in blocks of up to 12 bytes except possibly
|
|
// for fast paths which may process up to 16 bytes. For the
|
|
// slow path to work, we should have at least 12 input bytes left.
|
|
size_t max_starting_point = (pos + 64) - 12;
|
|
// Next loop is going to run at least five times.
|
|
while(pos < max_starting_point) {
|
|
// Performance note: our ability to compute 'consumed' and
|
|
// then shift and recompute is critical. If there is a
|
|
// latency of, say, 4 cycles on getting 'consumed', then
|
|
// the inner loop might have a total latency of about 6 cycles.
|
|
// Yet we process between 6 to 12 inputs bytes, thus we get
|
|
// a speed limit between 1 cycle/byte and 0.5 cycle/byte
|
|
// for this section of the code. Hence, there is a limit
|
|
// to how much we can further increase this latency before
|
|
// it seriously harms performance.
|
|
size_t consumed = convert_masked_utf8_to_utf32(in + pos,
|
|
utf8_end_of_code_point_mask, utf32_output);
|
|
pos += consumed;
|
|
utf8_end_of_code_point_mask >>= consumed;
|
|
}
|
|
// At this point there may remain between 0 and 12 bytes in the
|
|
// 64-byte block. These bytes will be processed again. So we have an
|
|
// 80% efficiency (in the worst case). In practice we expect an
|
|
// 85% to 90% efficiency.
|
|
}
|
|
}
|
|
if(errors()) {
|
|
result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
|
|
res.count += pos;
|
|
return res;
|
|
}
|
|
if(pos < size) {
|
|
result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
|
|
if (res.error) { // In case of error, we want the error position
|
|
res.count += pos;
|
|
return res;
|
|
} else { // In case of success, we want the number of word written
|
|
utf32_output += res.count;
|
|
}
|
|
}
|
|
return result(error_code::SUCCESS, utf32_output - start);
|
|
}
|
|
|
|
simdutf_really_inline bool errors() const {
|
|
return this->error.any_bits_set_anywhere();
|
|
}
|
|
|
|
}; // struct utf8_checker
|
|
} // utf8_to_utf32 namespace
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
|
|
// other functions
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8.h
|
|
/* begin file src/generic/utf8.h */
|
|
|
|
namespace simdutf {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace utf8 {
|
|
|
|
using namespace simd;
|
|
|
|
// Counts code points in `size` bytes of UTF-8 by counting every byte that
// is not a continuation byte. Whole 64-byte blocks use SIMD; the remainder
// is counted by the scalar routine.
simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
  size_t total = 0;
  size_t offset = 0;
  while (offset + 64 <= size) {
    simd8x64<int8_t> block(reinterpret_cast<const int8_t *>(in + offset));
    // One mask bit per byte that compares below -64, i.e. continuation bytes.
    const uint64_t continuation_mask = block.lt(-65 + 1);
    total += 64 - count_ones(continuation_mask);
    offset += 64;
  }
  return total + scalar::utf8::count_code_points(in + offset, size - offset);
}
|
|
|
|
|
|
// Computes how many UTF-16 code units are needed for `size` bytes of UTF-8.
// Every non-continuation byte contributes one unit, and every byte >= 240
// (lead byte of a 4-byte sequence) contributes one more.
// Whole 64-byte blocks use SIMD; the remainder goes to the scalar routine.
simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
  size_t pos = 0;
  size_t count = 0;
  // This algorithm could no doubt be improved!
  for(;pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    uint64_t utf8_continuation_mask = input.lt(-65 + 1);
    // We count one word for anything that is not a continuation (so
    // leading bytes).
    count += 64 - count_ones(utf8_continuation_mask);
    // Keep the lane mask unsigned like the mask above: a signed holder would
    // force an unsigned->signed->unsigned round trip when bit 63 is set.
    uint64_t utf8_4byte = input.gteq_unsigned(240);
    count += count_ones(utf8_4byte);
  }
  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
}
|
|
|
|
|
|
// The UTF-32 length of a UTF-8 buffer is its code point count.
simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) {
  const size_t code_points = count_code_points(in, size);
  return code_points;
}
|
|
} // utf8 namespace
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf16.h
|
|
/* begin file src/generic/utf16.h */
|
|
namespace simdutf {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace utf16 {
|
|
|
|
template <endianness big_endian>
|
|
simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) {
|
|
size_t pos = 0;
|
|
size_t count = 0;
|
|
for(;pos + 32 <= size; pos += 32) {
|
|
simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
|
|
if (!match_system(big_endian)) input.swap_bytes();
|
|
uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
|
|
count += count_ones(not_pair) / 2;
|
|
}
|
|
return count + scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
|
|
}
|
|
|
|
template <endianness big_endian>
|
|
simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) {
|
|
size_t pos = 0;
|
|
size_t count = 0;
|
|
// This algorithm could no doubt be improved!
|
|
for(;pos + 32 <= size; pos += 32) {
|
|
simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
|
|
if (!match_system(big_endian)) input.swap_bytes();
|
|
uint64_t ascii_mask = input.lteq(0x7F);
|
|
uint64_t twobyte_mask = input.lteq(0x7FF);
|
|
uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
|
|
|
|
size_t ascii_count = count_ones(ascii_mask) / 2;
|
|
size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2;
|
|
size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2;
|
|
size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
|
|
count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
|
|
}
|
|
return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos, size - pos);
|
|
}
|
|
|
|
template <endianness big_endian>
|
|
simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) {
|
|
return count_code_points<big_endian>(in, size);
|
|
}
|
|
|
|
// Writes to `output` a copy of the `size` UTF-16 code units at `in` with the
// two bytes of every unit swapped. Blocks of 32 units use SIMD; the
// remainder is handled by the scalar routine.
simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) {
  size_t offset = 0;
  for (; offset + 32 <= size; offset += 32, output += 32) {
    simd16x32<uint16_t> block(reinterpret_cast<const uint16_t *>(in + offset));
    block.swap_bytes();
    block.store(reinterpret_cast<uint16_t *>(output));
  }

  scalar::utf16::change_endianness_utf16(in + offset, size - offset, output);
}
|
|
|
|
} // utf16
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf16.h */
|
|
|
|
namespace simdutf {
|
|
namespace haswell {
|
|
|
|
// Guesses the encoding(s) of the buffer.
// A byte-order mark, when present, decides the answer. Otherwise an
// even-length buffer goes through avx2_detect_encodings, and an odd-length
// buffer is only tested for UTF-8 validity.
simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
  // If there is a BOM, then we trust it.
  const auto bom_encoding = simdutf::BOM::check_bom(input, length);
  if (bom_encoding != encoding_type::unspecified) {
    return bom_encoding;
  }
  const bool odd_length = (length % 2 != 0);
  if (!odd_length) {
    return avx2_detect_encodings<utf8_validation::utf8_checker>(input, length);
  }
  if (implementation::validate_utf8(input, length)) {
    return simdutf::encoding_type::UTF8;
  }
  return simdutf::encoding_type::unspecified;
}
|
|
|
|
// Validates the buffer as UTF-8 using the generic validator of this kernel.
simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  const bool is_valid = haswell::utf8_validation::generic_validate_utf8(buf, len);
  return is_valid;
}
|
|
|
|
// Validates the buffer as UTF-8 and returns the `result` produced by the
// generic validator of this kernel.
simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
  const result outcome = haswell::utf8_validation::generic_validate_utf8_with_errors(buf, len);
  return outcome;
}
|
|
|
|
// Validates the buffer as ASCII using the generic validator of this kernel.
simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
  const bool is_ascii = haswell::utf8_validation::generic_validate_ascii(buf, len);
  return is_ascii;
}
|
|
|
|
// Validates the buffer as ASCII and returns the `result` produced by the
// generic validator of this kernel.
simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
  const result outcome = haswell::utf8_validation::generic_validate_ascii_with_errors(buf, len);
  return outcome;
}
|
|
|
|
// Validates little-endian UTF-16. The AVX2 routine returns a pointer to the
// tail it left unprocessed; a null pointer is reported as invalid input,
// otherwise the tail is validated by the scalar routine.
simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
  const char16_t* const tail = avx2_validate_utf16<endianness::LITTLE>(buf, len);
  if (tail == nullptr) {
    return false;
  }
  const size_t remaining = len - (tail - buf);
  return scalar::utf16::validate<endianness::LITTLE>(tail, remaining);
}
|
|
|
|
// Validates big-endian UTF-16. The AVX2 routine returns a pointer to the
// tail it left unprocessed; a null pointer is reported as invalid input,
// otherwise the tail is validated by the scalar routine.
simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
  const char16_t* const tail = avx2_validate_utf16<endianness::BIG>(buf, len);
  if (tail == nullptr) {
    return false;
  }
  const size_t remaining = len - (tail - buf);
  return scalar::utf16::validate<endianness::BIG>(tail, remaining);
}
|
|
|
|
// Validates little-endian UTF-16 and returns a `result`.
// When the AVX2 routine stops before the end of the buffer, the scalar
// routine continues from that position and its count is offset accordingly.
simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
  const result simd_res = avx2_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
  if (simd_res.count == len) {
    return simd_res;
  }
  const result tail_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(
      buf + simd_res.count, len - simd_res.count);
  return result(tail_res.error, simd_res.count + tail_res.count);
}
|
|
|
|
// Validates big-endian UTF-16 and returns a `result`.
// When the AVX2 routine stops before the end of the buffer, the scalar
// routine continues from that position and its count is offset accordingly.
simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
  const result simd_res = avx2_validate_utf16_with_errors<endianness::BIG>(buf, len);
  if (simd_res.count == len) {
    return simd_res;
  }
  const result tail_res = scalar::utf16::validate_with_errors<endianness::BIG>(
      buf + simd_res.count, len - simd_res.count);
  return result(tail_res.error, simd_res.count + tail_res.count);
}
|
|
|
|
// Validates UTF-32. The AVX2 routine returns a pointer to the tail it left
// unprocessed; a null pointer is reported as invalid input, otherwise the
// tail is validated by the scalar routine.
simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
  const char32_t* const tail = avx2_validate_utf32le(buf, len);
  if (tail == nullptr) {
    return false;
  }
  const size_t remaining = len - (tail - buf);
  return scalar::utf32::validate(tail, remaining);
}
|
|
|
|
// Validates UTF-32 and returns a `result`.
// When the AVX2 routine stops before the end of the buffer, the scalar
// routine continues from that position and its count is offset accordingly.
simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
  const result simd_res = avx2_validate_utf32le_with_errors(buf, len);
  if (simd_res.count == len) {
    return simd_res;
  }
  const result tail_res = scalar::utf32::validate_with_errors(
      buf + simd_res.count, len - simd_res.count);
  return result(tail_res.error, simd_res.count + tail_res.count);
}
|
|
|
|
// Converts UTF-8 to little-endian UTF-16 with a fresh validating transcoder
// and returns whatever its convert() returns.
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder transcoder;
  const size_t written = transcoder.convert<endianness::LITTLE>(buf, len, utf16_output);
  return written;
}
|
|
|
|
// Converts UTF-8 to big-endian UTF-16 with a fresh validating transcoder
// and returns whatever its convert() returns.
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder transcoder;
  const size_t written = transcoder.convert<endianness::BIG>(buf, len, utf16_output);
  return written;
}
|
|
|
|
// Converts UTF-8 to little-endian UTF-16 with a fresh validating transcoder
// and returns the `result` of its convert_with_errors().
simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder transcoder;
  const result outcome = transcoder.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
  return outcome;
}
|
|
|
|
// Converts UTF-8 to big-endian UTF-16 with a fresh validating transcoder
// and returns the `result` of its convert_with_errors().
simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder transcoder;
  const result outcome = transcoder.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
  return outcome;
}
|
|
|
|
// Converts UTF-8 to little-endian UTF-16 through the non-validating
// convert_valid routine of this kernel.
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size,
    char16_t* utf16_output) const noexcept {
  const size_t written = utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size, utf16_output);
  return written;
}
|
|
|
|
// Converts UTF-8 to big-endian UTF-16 through the non-validating
// convert_valid routine of this kernel.
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size,
    char16_t* utf16_output) const noexcept {
  const size_t written = utf8_to_utf16::convert_valid<endianness::BIG>(input, size, utf16_output);
  return written;
}
|
|
|
|
// Converts UTF-8 to UTF-32 with a fresh validating transcoder and returns
// whatever its convert() returns.
simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder transcoder;
  const size_t written = transcoder.convert(buf, len, utf32_output);
  return written;
}
|
|
|
|
// Converts UTF-8 to UTF-32 with a fresh validating transcoder and returns
// the `result` of its convert_with_errors().
simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder transcoder;
  const result outcome = transcoder.convert_with_errors(buf, len, utf32_output);
  return outcome;
}
|
|
|
|
// Converts UTF-8 to UTF-32 through the non-validating convert_valid routine
// of this kernel.
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
    char32_t* utf32_output) const noexcept {
  const size_t written = utf8_to_utf32::convert_valid(input, size, utf32_output);
  return written;
}
|
|
|
|
// Converts little-endian UTF-16 to UTF-8.
// The AVX2 routine reports how far it got in the input and in the output;
// a null input pointer, or a scalar tail conversion that returns 0, makes
// this function return 0. Otherwise returns the number of bytes written.
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  const std::pair<const char16_t*, char*> progress =
      haswell::avx2_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
  const char16_t* const next_in = progress.first;
  char* const next_out = progress.second;
  if (next_in == nullptr) {
    return 0;
  }
  const size_t simd_written = next_out - utf8_output;
  if (next_in == buf + len) {
    // The whole input was consumed by the AVX2 routine.
    return simd_written;
  }
  const size_t tail_written = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
      next_in, len - (next_in - buf), next_out);
  if (tail_written == 0) {
    return 0;
  }
  return simd_written + tail_written;
}
|
|
|
|
// Converts big-endian UTF-16 to UTF-8.
// The AVX2 routine reports how far it got in the input and in the output;
// a null input pointer, or a scalar tail conversion that returns 0, makes
// this function return 0. Otherwise returns the number of bytes written.
simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  const std::pair<const char16_t*, char*> progress =
      haswell::avx2_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
  const char16_t* const next_in = progress.first;
  char* const next_out = progress.second;
  if (next_in == nullptr) {
    return 0;
  }
  const size_t simd_written = next_out - utf8_output;
  if (next_in == buf + len) {
    // The whole input was consumed by the AVX2 routine.
    return simd_written;
  }
  const size_t tail_written = scalar::utf16_to_utf8::convert<endianness::BIG>(
      next_in, len - (next_in - buf), next_out);
  if (tail_written == 0) {
    return 0;
  }
  return simd_written + tail_written;
}
|
|
|
|
// Converts little-endian UTF-16 to UTF-8 and returns a `result`.
// On error the count is a position in the input buffer; on success it is
// the number of 8-bit words written to utf8_output.
simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  // The count returned by the AVX2 routine is always the position in the
  // buffer, not the number of words written even if finished.
  const std::pair<result, char*> progress =
      haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
  result simd_res = progress.first;
  char* out = progress.second;
  // Can return directly since scalar fallback already found correct count.
  if (simd_res.error) {
    return simd_res;
  }
  if (simd_res.count != len) { // All good so far, but not finished
    result tail_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
        buf + simd_res.count, len - simd_res.count, out);
    if (tail_res.error) {
      tail_res.count += simd_res.count;
      return tail_res;
    }
    out += tail_res.count;
  }
  simd_res.count = out - utf8_output; // Set count to the number of 8-bit words written
  return simd_res;
}
|
|
|
|
// Converts big-endian UTF-16 to UTF-8 and returns a `result`.
// On error the count is a position in the input buffer; on success it is
// the number of 8-bit words written to utf8_output.
simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  // The count returned by the AVX2 routine is always the position in the
  // buffer, not the number of words written even if finished.
  const std::pair<result, char*> progress =
      haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
  result simd_res = progress.first;
  char* out = progress.second;
  // Can return directly since scalar fallback already found correct count.
  if (simd_res.error) {
    return simd_res;
  }
  if (simd_res.count != len) { // All good so far, but not finished
    result tail_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
        buf + simd_res.count, len - simd_res.count, out);
    if (tail_res.error) {
      tail_res.count += simd_res.count;
      return tail_res;
    }
    out += tail_res.count;
  }
  simd_res.count = out - utf8_output; // Set count to the number of 8-bit words written
  return simd_res;
}
|
|
|
|
// The "valid input" variant simply forwards to the validating
// little-endian UTF-16 to UTF-8 conversion.
simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  const size_t written = convert_utf16le_to_utf8(buf, len, utf8_output);
  return written;
}
|
|
|
|
// The "valid input" variant simply forwards to the validating
// big-endian UTF-16 to UTF-8 conversion.
simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  const size_t written = convert_utf16be_to_utf8(buf, len, utf8_output);
  return written;
}
|
|
|
|
// Converts UTF-32 to UTF-8.
// The AVX2 routine reports how far it got in the input and in the output;
// a null input pointer, or a scalar tail conversion that returns 0, makes
// this function return 0. Otherwise returns the number of bytes written.
simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
  const std::pair<const char32_t*, char*> progress = avx2_convert_utf32_to_utf8(buf, len, utf8_output);
  const char32_t* const next_in = progress.first;
  char* const next_out = progress.second;
  if (next_in == nullptr) {
    return 0;
  }
  const size_t simd_written = next_out - utf8_output;
  if (next_in == buf + len) {
    // The whole input was consumed by the AVX2 routine.
    return simd_written;
  }
  const size_t tail_written = scalar::utf32_to_utf8::convert(
      next_in, len - (next_in - buf), next_out);
  if (tail_written == 0) {
    return 0;
  }
  return simd_written + tail_written;
}
|
|
|
|
/**
 * Transcodes (possibly invalid) UTF-32 to UTF-8 and reports the first error.
 *
 * Whenever the AVX2 kernel stops before the end of the input, the scalar
 * converter is run from the position where it stopped. A scalar error is
 * returned with its count shifted by that position; otherwise the returned
 * count is the number of bytes written to utf8_output.
 */
simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
  // simd.first.count is a position in buf, never an output length, even when the kernel consumed everything.
  std::pair<result, char*> simd = haswell::avx2_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
  const size_t consumed = simd.first.count;
  char* out = simd.second;
  if (consumed != len) {
    result tail = scalar::utf32_to_utf8::convert_with_errors(
        buf + consumed, len - consumed, out);
    if (tail.error) {
      tail.count += consumed;
      return tail;
    }
    out += tail.count;
  }
  simd.first.count = out - utf8_output; // number of 8-bit units written
  return simd.first;
}
|
|
|
|
/**
 * Transcodes UTF-16LE to UTF-32.
 *
 * The AVX2 kernel handles the bulk and the scalar converter the remainder.
 * Returns the number of char32_t units written, or 0 when either stage
 * reports failure (null input position from the kernel, or a zero count
 * from the scalar stage).
 */
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  std::pair<const char16_t*, char32_t*> simd = haswell::avx2_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
  const char16_t* next = simd.first;
  if (next == nullptr) { return 0; }
  char32_t* out = simd.second;
  const size_t written = out - utf32_output;
  if (next == buf + len) { return written; } // kernel consumed everything
  const size_t tail_written = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
      next, len - (next - buf), out);
  if (tail_written == 0) { return 0; }
  return written + tail_written;
}
|
|
|
|
/**
 * Transcodes UTF-16BE to UTF-32.
 *
 * The AVX2 kernel handles the bulk and the scalar converter the remainder.
 * Returns the number of char32_t units written, or 0 when either stage
 * reports failure (null input position from the kernel, or a zero count
 * from the scalar stage).
 */
simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  std::pair<const char16_t*, char32_t*> simd = haswell::avx2_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
  const char16_t* next = simd.first;
  if (next == nullptr) { return 0; }
  char32_t* out = simd.second;
  const size_t written = out - utf32_output;
  if (next == buf + len) { return written; } // kernel consumed everything
  const size_t tail_written = scalar::utf16_to_utf32::convert<endianness::BIG>(
      next, len - (next - buf), out);
  if (tail_written == 0) { return 0; }
  return written + tail_written;
}
|
|
|
|
/**
 * Transcodes (possibly invalid) UTF-16LE to UTF-32 and reports the first error.
 *
 * A kernel-detected error is returned untouched. Otherwise the scalar
 * converter finishes any unprocessed tail; a scalar error is returned with
 * its count shifted by the number of code units the kernel consumed. On
 * success the count is the number of char32_t units written.
 */
simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  // simd.first.count is a position in buf, never an output length, even when the kernel consumed everything.
  std::pair<result, char32_t*> simd = haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
  if (simd.first.error) {
    // The kernel's own fallback already produced the right count: return as-is.
    return simd.first;
  }
  const size_t consumed = simd.first.count;
  char32_t* out = simd.second;
  if (consumed != len) {
    // No error so far, but a tail is left for the scalar converter.
    result tail = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
        buf + consumed, len - consumed, out);
    if (tail.error) {
      tail.count += consumed;
      return tail;
    }
    out += tail.count;
  }
  simd.first.count = out - utf32_output; // number of 32-bit units written
  return simd.first;
}
|
|
|
|
/**
 * Transcodes (possibly invalid) UTF-16BE to UTF-32 and reports the first error.
 *
 * A kernel-detected error is returned untouched. Otherwise the scalar
 * converter finishes any unprocessed tail; a scalar error is returned with
 * its count shifted by the number of code units the kernel consumed. On
 * success the count is the number of char32_t units written.
 */
simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  // simd.first.count is a position in buf, never an output length, even when the kernel consumed everything.
  std::pair<result, char32_t*> simd = haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
  if (simd.first.error) {
    // The kernel's own fallback already produced the right count: return as-is.
    return simd.first;
  }
  const size_t consumed = simd.first.count;
  char32_t* out = simd.second;
  if (consumed != len) {
    // No error so far, but a tail is left for the scalar converter.
    result tail = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
        buf + consumed, len - consumed, out);
    if (tail.error) {
      tail.count += consumed;
      return tail;
    }
    out += tail.count;
  }
  simd.first.count = out - utf32_output; // number of 32-bit units written
  return simd.first;
}
|
|
|
|
/**
 * Valid-input UTF-32 to UTF-8 conversion. This implementation has no
 * dedicated fast path and forwards to the validating converter.
 */
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
  const size_t written = convert_utf32_to_utf8(buf, len, utf8_output);
  return written;
}
|
|
|
|
/**
 * Transcodes UTF-32 to UTF-16LE.
 *
 * The AVX2 kernel handles the bulk and the scalar converter the remainder.
 * Returns the number of char16_t units written, or 0 when either stage
 * reports failure (null input position from the kernel, or a zero count
 * from the scalar stage).
 */
simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  std::pair<const char32_t*, char16_t*> simd = avx2_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
  const char32_t* next = simd.first;
  if (next == nullptr) { return 0; }
  char16_t* out = simd.second;
  const size_t written = out - utf16_output;
  if (next == buf + len) { return written; } // kernel consumed everything
  const size_t tail_written = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
      next, len - (next - buf), out);
  if (tail_written == 0) { return 0; }
  return written + tail_written;
}
|
|
|
|
/**
 * Transcodes UTF-32 to UTF-16BE.
 *
 * The AVX2 kernel handles the bulk and the scalar converter the remainder.
 * Returns the number of char16_t units written, or 0 when either stage
 * reports failure (null input position from the kernel, or a zero count
 * from the scalar stage).
 */
simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  std::pair<const char32_t*, char16_t*> simd = avx2_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
  const char32_t* next = simd.first;
  if (next == nullptr) { return 0; }
  char16_t* out = simd.second;
  const size_t written = out - utf16_output;
  if (next == buf + len) { return written; } // kernel consumed everything
  const size_t tail_written = scalar::utf32_to_utf16::convert<endianness::BIG>(
      next, len - (next - buf), out);
  if (tail_written == 0) { return 0; }
  return written + tail_written;
}
|
|
|
|
/**
 * Transcodes (possibly invalid) UTF-32 to UTF-16LE and reports the first error.
 *
 * Whenever the AVX2 kernel stops before the end of the input, the scalar
 * converter is run from the position where it stopped. A scalar error is
 * returned with its count shifted by that position; otherwise the returned
 * count is the number of char16_t units written.
 */
simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  // simd.first.count is a position in buf, never an output length, even when the kernel consumed everything.
  std::pair<result, char16_t*> simd = haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
  const size_t consumed = simd.first.count;
  char16_t* out = simd.second;
  if (consumed != len) {
    result tail = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
        buf + consumed, len - consumed, out);
    if (tail.error) {
      tail.count += consumed;
      return tail;
    }
    out += tail.count;
  }
  simd.first.count = out - utf16_output; // number of 16-bit units written
  return simd.first;
}
|
|
|
|
/**
 * Transcodes (possibly invalid) UTF-32 to UTF-16BE and reports the first error.
 *
 * Whenever the AVX2 kernel stops before the end of the input, the scalar
 * converter is run from the position where it stopped. A scalar error is
 * returned with its count shifted by that position; otherwise the returned
 * count is the number of char16_t units written.
 */
simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  // simd.first.count is a position in buf, never an output length, even when the kernel consumed everything.
  std::pair<result, char16_t*> simd = haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
  const size_t consumed = simd.first.count;
  char16_t* out = simd.second;
  if (consumed != len) {
    result tail = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
        buf + consumed, len - consumed, out);
    if (tail.error) {
      tail.count += consumed;
      return tail;
    }
    out += tail.count;
  }
  simd.first.count = out - utf16_output; // number of 16-bit units written
  return simd.first;
}
|
|
|
|
/**
 * Valid-input UTF-32 to UTF-16LE conversion. This implementation has no
 * dedicated fast path and forwards to the validating converter.
 */
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  const size_t written = convert_utf32_to_utf16le(buf, len, utf16_output);
  return written;
}
|
|
|
|
/**
 * Valid-input UTF-32 to UTF-16BE conversion. This implementation has no
 * dedicated fast path and forwards to the validating converter.
 */
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  const size_t written = convert_utf32_to_utf16be(buf, len, utf16_output);
  return written;
}
|
|
|
|
/**
 * Valid-input UTF-16LE to UTF-32 conversion. This implementation has no
 * dedicated fast path and forwards to the validating converter.
 */
simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  const size_t written = convert_utf16le_to_utf32(buf, len, utf32_output);
  return written;
}
|
|
|
|
/**
 * Valid-input UTF-16BE to UTF-32 conversion. This implementation has no
 * dedicated fast path and forwards to the validating converter.
 */
simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  const size_t written = convert_utf16be_to_utf32(buf, len, utf32_output);
  return written;
}
|
|
|
|
/**
 * Writes `length` UTF-16 code units from `input` to `output` with their
 * endianness changed, by delegating to utf16::change_endianness_utf16.
 */
void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
  utf16::change_endianness_utf16(input, length, output);
}
|
|
|
|
/** Returns utf16::count_code_points for little-endian UTF-16 input. */
simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
  const size_t code_points = utf16::count_code_points<endianness::LITTLE>(input, length);
  return code_points;
}
|
|
|
|
/** Returns utf16::count_code_points for big-endian UTF-16 input. */
simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
  const size_t code_points = utf16::count_code_points<endianness::BIG>(input, length);
  return code_points;
}
|
|
|
|
/** Returns utf8::count_code_points for the given UTF-8 bytes. */
simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
  const size_t code_points = utf8::count_code_points(input, length);
  return code_points;
}
|
|
|
|
/** Returns utf16::utf8_length_from_utf16 for little-endian UTF-16 input. */
simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
  const size_t utf8_units = utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
  return utf8_units;
}
|
|
|
|
/** Returns utf16::utf8_length_from_utf16 for big-endian UTF-16 input. */
simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
  const size_t utf8_units = utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
  return utf8_units;
}
|
|
|
|
/** Returns utf16::utf32_length_from_utf16 for little-endian UTF-16 input. */
simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
  const size_t utf32_units = utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
  return utf32_units;
}
|
|
|
|
/** Returns utf16::utf32_length_from_utf16 for big-endian UTF-16 input. */
simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
  const size_t utf32_units = utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
  return utf32_units;
}
|
|
|
|
/** Returns utf8::utf16_length_from_utf8 for the given UTF-8 bytes. */
simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
  const size_t utf16_units = utf8::utf16_length_from_utf8(input, length);
  return utf16_units;
}
|
|
|
|
/**
 * Computes how many UTF-8 bytes are needed to encode the UTF-32 input.
 *
 * Eight code points are classified per iteration with AVX2: every lane is
 * assumed to need 4 bytes and the savings for lanes that fit in 1, 2 or 3
 * bytes are subtracted. Fewer than eight trailing values go to the scalar
 * routine.
 */
simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
  const __m256i zero = _mm256_setzero_si256();
  // Bits that must all be clear for a value to fit in 1, 2 or 3 UTF-8 bytes.
  const __m256i beyond_1_byte = _mm256_set1_epi32(static_cast<uint32_t>(0xffffff80));
  const __m256i beyond_2_bytes = _mm256_set1_epi32(static_cast<uint32_t>(0xfffff800));
  const __m256i beyond_3_bytes = _mm256_set1_epi32(static_cast<uint32_t>(0xffff0000));
  size_t total = 0;
  size_t offset = 0;
  while (offset + 8 <= length) {
    const __m256i lanes = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input + offset));
    // Cumulative lane masks: all-ones where the value fits in <=1, <=2, <=3 bytes.
    const __m256i fits_1 = _mm256_cmpeq_epi32(_mm256_and_si256(lanes, beyond_1_byte), zero);
    const __m256i fits_2 = _mm256_cmpeq_epi32(_mm256_and_si256(lanes, beyond_2_bytes), zero);
    const __m256i fits_3 = _mm256_cmpeq_epi32(_mm256_and_si256(lanes, beyond_3_bytes), zero);
    // XOR of neighbouring cumulative masks isolates "exactly N bytes".
    const __m256i exactly_2 = _mm256_xor_si256(fits_2, fits_1);
    const __m256i exactly_3 = _mm256_xor_si256(fits_3, fits_2);
    const uint32_t bits_1 = static_cast<uint32_t>(_mm256_movemask_epi8(fits_1));
    const uint32_t bits_2 = static_cast<uint32_t>(_mm256_movemask_epi8(exactly_2));
    const uint32_t bits_3 = static_cast<uint32_t>(_mm256_movemask_epi8(exactly_3));

    // movemask produces 4 bits per 32-bit lane, hence the division by 4.
    const size_t n1 = count_ones(bits_1) / 4;
    const size_t n2 = count_ones(bits_2) / 4;
    const size_t n3 = count_ones(bits_3) / 4;
    total += 32 - 3*n1 - 2*n2 - n3;
    offset += 8;
  }
  return total + scalar::utf32::utf8_length_from_utf32(input + offset, length - offset);
}
|
|
|
|
/**
 * Computes how many UTF-16 code units are needed to encode the UTF-32 input.
 *
 * Eight code points are examined per iteration with AVX2: each contributes
 * one unit, plus one more when any of its upper 16 bits is set. Fewer than
 * eight trailing values go to the scalar routine.
 */
simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
  const __m256i zero = _mm256_setzero_si256();
  const __m256i upper_half = _mm256_set1_epi32(static_cast<uint32_t>(0xffff0000));
  size_t total = 0;
  size_t offset = 0;
  while (offset + 8 <= length) {
    const __m256i lanes = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(input + offset));
    // All-ones in lanes whose value fits in 16 bits (a single code unit).
    const __m256i single_unit_lanes = _mm256_cmpeq_epi32(_mm256_and_si256(lanes, upper_half), zero);
    const uint32_t single_unit_bits = static_cast<uint32_t>(_mm256_movemask_epi8(single_unit_lanes));
    // 4 mask bits per lane; the lanes NOT flagged need a second code unit.
    const size_t extra_units = (32-count_ones(single_unit_bits))/4;
    total += 8 + extra_units;
    offset += 8;
  }
  return total + scalar::utf32::utf16_length_from_utf32(input + offset, length - offset);
}
|
|
|
|
/**
 * Returns the number of UTF-32 code units needed for the UTF-8 input, i.e.
 * its code-point count.
 *
 * Uses the same utf8::count_code_points helper as count_utf8() rather than
 * the scalar-only counter, so both entry points share one code path.
 */
simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
  return utf8::count_code_points(input, length);
}
|
|
|
|
} // namespace haswell
|
|
} // namespace simdutf
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/end.h
|
|
/* begin file src/simdutf/haswell/end.h */
|
|
#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
|
|
// nothing needed.
|
|
#else
|
|
SIMDUTF_UNTARGET_REGION
|
|
#endif
|
|
|
|
|
|
#if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
|
|
SIMDUTF_POP_DISABLE_WARNINGS
|
|
#endif // end of workaround
|
|
/* end file src/simdutf/haswell/end.h */
|
|
/* end file src/haswell/implementation.cpp */
|
|
#endif
|
|
#if SIMDUTF_IMPLEMENTATION_PPC64
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=ppc64/implementation.cpp
|
|
/* begin file src/ppc64/implementation.cpp */
|
|
|
|
|
|
|
|
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/begin.h
|
|
/* begin file src/simdutf/ppc64/begin.h */
|
|
// redefining SIMDUTF_IMPLEMENTATION to "ppc64"
|
|
// #define SIMDUTF_IMPLEMENTATION ppc64
|
|
/* end file src/simdutf/ppc64/begin.h */
|
|
namespace simdutf {
|
|
namespace ppc64 {
|
|
namespace {
|
|
#ifndef SIMDUTF_PPC64_H
|
|
#error "ppc64.h must be included"
|
|
#endif
|
|
using namespace simd;
|
|
|
|
|
|
/**
 * Tells whether every byte of the 64-byte block is ASCII.
 * Note that 0x80 is NOT ASCII: only values up to 0x7F qualify.
 */
simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
  // OR all chunks together, then anything above 0x7F survives the saturating subtraction.
  auto merged = input.reduce_or();
  auto above_ascii = merged.saturating_sub(0b01111111u);
  return above_ascii.bits_not_set_anywhere();
}
|
|
|
|
/**
 * Flags positions that must hold a continuation byte because one of the three
 * preceding bytes is a lead byte long enough to reach this position.
 */
simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  // Saturating subtraction leaves a nonzero value only for bytes at or above the threshold.
  simd8<uint8_t> after_lead2 = prev1.saturating_sub(0b11000000u-1); // nonzero only for 11______
  simd8<uint8_t> after_lead3 = prev2.saturating_sub(0b11100000u-1); // nonzero only for 111_____
  simd8<uint8_t> after_lead4 = prev3.saturating_sub(0b11110000u-1); // nonzero only for 1111____
  auto any_lead = after_lead2 | after_lead3 | after_lead4;
  // The caller needs a boolean mask (all ones). The differences are at most 64,
  // so comparing them as signed bytes is safe.
  return simd8<int8_t>(any_lead) > int8_t(0);
}
|
|
|
|
/**
 * Flags positions that must hold the 2nd or 3rd continuation byte of a
 * 3- or 4-byte sequence, based on the bytes two and three positions back.
 */
simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  // Saturating subtraction leaves a nonzero value only for bytes at or above the threshold.
  simd8<uint8_t> after_lead3 = prev2.saturating_sub(0b11100000u-1); // nonzero only for 111_____
  simd8<uint8_t> after_lead4 = prev3.saturating_sub(0b11110000u-1); // nonzero only for 1111____
  auto any_lead = after_lead3 | after_lead4;
  // The caller needs a boolean mask (all ones). The differences are at most 64,
  // so comparing them as signed bytes is safe.
  return simd8<int8_t>(any_lead) > int8_t(0);
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdutf
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h
|
|
/* begin file src/generic/buf_block_reader.h */
|
|
namespace simdutf {
|
|
namespace ppc64 {
|
|
namespace {
|
|
|
|
// Walks through a buffer in block-sized increments, loading the last part with spaces
|
|
template<size_t STEP_SIZE>
|
|
struct buf_block_reader {
|
|
public:
|
|
simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
|
|
simdutf_really_inline size_t block_index();
|
|
simdutf_really_inline bool has_full_block() const;
|
|
simdutf_really_inline const uint8_t *full_block() const;
|
|
/**
|
|
* Get the last block, padded with spaces.
|
|
*
|
|
* There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
|
|
* function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
|
|
* will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
|
|
*
|
|
* @return the number of effective characters in the last block.
|
|
*/
|
|
simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
|
|
simdutf_really_inline void advance();
|
|
private:
|
|
const uint8_t *buf;
|
|
const size_t len;
|
|
const size_t lenminusstep;
|
|
size_t idx;
|
|
};
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
simdutf_unused static char * format_input_text_64(const uint8_t *text) {
|
|
static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
|
|
for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
|
|
buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
|
|
}
|
|
buf[sizeof(simd8x64<uint8_t>)] = '\0';
|
|
return buf;
|
|
}
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
|
|
static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
|
|
in.store(reinterpret_cast<uint8_t*>(buf));
|
|
for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
|
|
if (buf[i] < ' ') { buf[i] = '_'; }
|
|
}
|
|
buf[sizeof(simd8x64<uint8_t>)] = '\0';
|
|
return buf;
|
|
}
|
|
|
|
/**
 * Debugging helper: renders a 64-bit mask as 64 characters, 'X' for a set bit
 * and ' ' for a clear one, least significant bit first.
 *
 * @return a NUL-terminated string in a static buffer that is overwritten by
 *         the next call (not thread-safe).
 */
simdutf_unused static char * format_mask(uint64_t mask) {
  // A static array replaces the former unchecked malloc().
  static char buf[64 + 1];
  for (size_t i=0; i<64; i++) {
    // Shift a 64-bit one: size_t may be only 32 bits wide, in which case
    // shifting it by 32 or more is undefined behaviour.
    buf[i] = (mask & (uint64_t(1) << i)) ? 'X' : ' ';
  }
  buf[64] = '\0';
  return buf;
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
|
|
return idx < lenminusstep;
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
|
|
return &buf[idx];
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
|
|
if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
|
|
std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
|
|
std::memcpy(dst, buf + idx, len - idx);
|
|
return len - idx;
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
|
|
idx += STEP_SIZE;
|
|
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdutf
|
|
/* end file src/generic/buf_block_reader.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
|
|
/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
|
|
namespace simdutf {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace utf8_validation {
|
|
|
|
using namespace simd;
|
|
|
|
/**
 * Classifies every (previous byte, current byte) pair with three 16-entry
 * nibble lookups and returns, per byte, the error bits on which all three
 * lookups agree.
 *
 * @param input the current bytes ("byte 2" of each pair).
 * @param prev1 the same bytes shifted by one position ("byte 1" of each pair).
 */
simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Error bits. The patterns on the right show byte 1 followed by byte 2.
  constexpr const uint8_t TOO_SHORT      = 1<<0; // 11______ 0_______  (lead byte/ASCII followed by lead byte/ASCII)
                                                 // 11______ 11______
  constexpr const uint8_t TOO_LONG       = 1<<1; // 0_______ 10______  (ASCII followed by continuation)
  constexpr const uint8_t OVERLONG_3     = 1<<2; // 11100000 100_____
  constexpr const uint8_t TOO_LARGE      = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
                                                 // 1111011_ 1001____
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
  constexpr const uint8_t SURROGATE      = 1<<4; // 11101101 101_____
  constexpr const uint8_t OVERLONG_2     = 1<<5; // 1100000_ 10______
  constexpr const uint8_t TOO_LARGE_1000 = 1<<6; // 11110101 1000____
                                                 // 1111011_ 1000____
                                                 // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4     = 1<<6; // 11110000 1000____  (deliberately shares bit 6)
  constexpr const uint8_t TWO_CONTS      = 1<<7; // 10______ 10______

  // Lookup on the HIGH nibble of byte 1.
  const simd8<uint8_t> lead_hi_flags = prev1.shr<4>().lookup_16<uint8_t>(
    // 0_______ ________ <ASCII in byte 1>
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    // 10______ ________ <continuation in byte 1>
    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
    // 1100____ ________ <two byte lead in byte 1>
    TOO_SHORT | OVERLONG_2,
    // 1101____ ________ <two byte lead in byte 1>
    TOO_SHORT,
    // 1110____ ________ <three byte lead in byte 1>
    TOO_SHORT | OVERLONG_3 | SURROGATE,
    // 1111____ ________ <four+ byte lead in byte 1>
    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
  );

  // These errors do not depend on the low nibble of byte 1, so every entry of
  // the low-nibble table has to let them through.
  constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;

  // Lookup on the LOW nibble of byte 1.
  const simd8<uint8_t> lead_lo_flags = (prev1 & 0x0F).lookup_16<uint8_t>(
    // ____0000 ________
    CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
    // ____0001 ________
    CARRY | OVERLONG_2,
    // ____001_ ________
    CARRY,
    CARRY,

    // ____0100 ________
    CARRY | TOO_LARGE,
    // ____0101 ________
    CARRY | TOO_LARGE | TOO_LARGE_1000,
    // ____011_ ________
    CARRY | TOO_LARGE | TOO_LARGE_1000,
    CARRY | TOO_LARGE | TOO_LARGE_1000,

    // ____1___ ________
    CARRY | TOO_LARGE | TOO_LARGE_1000,
    CARRY | TOO_LARGE | TOO_LARGE_1000,
    CARRY | TOO_LARGE | TOO_LARGE_1000,
    CARRY | TOO_LARGE | TOO_LARGE_1000,
    CARRY | TOO_LARGE | TOO_LARGE_1000,
    // ____1101 ________
    CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
    CARRY | TOO_LARGE | TOO_LARGE_1000,
    CARRY | TOO_LARGE | TOO_LARGE_1000
  );

  // Lookup on the HIGH nibble of byte 2.
  const simd8<uint8_t> next_hi_flags = input.shr<4>().lookup_16<uint8_t>(
    // ________ 0_______ <ASCII in byte 2>
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

    // ________ 1000____
    TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
    // ________ 1001____
    TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
    // ________ 101_____
    TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
    TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

    // ________ 11______
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
  );

  // An error bit survives only if all three lookups flag it.
  return (lead_hi_flags & lead_lo_flags & next_hi_flags);
}
|
|
/**
 * Combines the special-case flags with the "must be a 2nd/3rd continuation"
 * mask.
 *
 * Bit 0x80 is set wherever a 3- or 4-byte lead two or three positions back
 * requires a continuation here. XOR-ing it with `sc` (whose bit 0x80 is the
 * two-continuations flag produced by check_special_cases) toggles that bit,
 * so it ends up set only where the two disagree.
 */
simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
  simd8<uint8_t> two_back = input.prev<2>(prev_input);
  simd8<uint8_t> three_back = input.prev<3>(prev_input);
  simd8<uint8_t> required = simd8<uint8_t>(must_be_2_3_continuation(two_back, three_back));
  simd8<uint8_t> required_bit7 = required & uint8_t(0x80);
  return required_bit7 ^ sc;
}
|
|
|
|
//
|
|
// Return nonzero if there are incomplete multibyte characters at the end of the block:
|
|
// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
|
|
//
|
|
simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
|
|
// If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
|
|
// ... 1111____ 111_____ 11______
|
|
static const uint8_t max_array[32] = {
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
|
|
};
|
|
const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
|
|
return input.gt_bits(max_value);
|
|
}
|
|
|
|
struct utf8_checker {
|
|
// If this is nonzero, there has been a UTF-8 error.
|
|
simd8<uint8_t> error;
|
|
// The last input we received
|
|
simd8<uint8_t> prev_input_block;
|
|
// Whether the last input we received was incomplete (used for ASCII fast path)
|
|
simd8<uint8_t> prev_incomplete;
|
|
|
|
//
|
|
// Check whether the current bytes are valid UTF-8.
|
|
//
|
|
simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
|
|
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
|
|
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
|
|
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
|
|
simd8<uint8_t> sc = check_special_cases(input, prev1);
|
|
this->error |= check_multibyte_lengths(input, prev_input, sc);
|
|
}
|
|
|
|
// The only problem that can happen at EOF is that a multibyte character is too short
|
|
// or a byte value too large in the last bytes: check_special_cases only checks for bytes
|
|
// too large in the first of two bytes.
|
|
simdutf_really_inline void check_eof() {
|
|
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
|
|
// possibly finish them.
|
|
this->error |= this->prev_incomplete;
|
|
}
|
|
|
|
simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
|
|
if(simdutf_likely(is_ascii(input))) {
|
|
this->error |= this->prev_incomplete;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support either two or four chunks per 64-byte block.");
|
|
if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
|
|
this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
|
|
|
|
}
|
|
}
|
|
|
|
// do not forget to call check_eof!
|
|
simdutf_really_inline bool errors() const {
|
|
return this->error.any_bits_set_anywhere();
|
|
}
|
|
|
|
}; // struct utf8_checker
|
|
} // namespace utf8_validation
|
|
|
|
using utf8_validation::utf8_checker;
|
|
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
|
|
/* begin file src/generic/utf8_validation/utf8_validator.h */
|
|
namespace simdutf {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace utf8_validation {
|
|
|
|
/**
|
|
* Validates that the string is actual UTF-8.
|
|
*/
|
|
template<class checker>
|
|
bool generic_validate_utf8(const uint8_t * input, size_t length) {
|
|
checker c{};
|
|
buf_block_reader<64> reader(input, length);
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
c.check_eof();
|
|
return !c.errors();
|
|
}
|
|
|
|
bool generic_validate_utf8(const char * input, size_t length) {
|
|
return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
|
|
}
|
|
|
|
/**
|
|
* Validates that the string is actual UTF-8 and stops on errors.
|
|
*/
|
|
template<class checker>
|
|
result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
|
|
checker c{};
|
|
buf_block_reader<64> reader(input, length);
|
|
size_t count{0};
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
c.check_next_input(in);
|
|
if(c.errors()) {
|
|
if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
|
|
result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input), reinterpret_cast<const char*>(input + count), length - count);
|
|
res.count += count;
|
|
return res;
|
|
}
|
|
reader.advance();
|
|
count += 64;
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
c.check_eof();
|
|
if (c.errors()) {
|
|
if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
|
|
result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input), reinterpret_cast<const char*>(input) + count, length - count);
|
|
res.count += count;
|
|
return res;
|
|
} else {
|
|
return result(error_code::SUCCESS, length);
|
|
}
|
|
}
|
|
|
|
/** char-pointer overload: validates UTF-8 with error reporting using the default utf8_checker. */
result generic_validate_utf8_with_errors(const char * input, size_t length) {
  const uint8_t * bytes = reinterpret_cast<const uint8_t *>(input);
  return generic_validate_utf8_with_errors<utf8_checker>(bytes, length);
}
|
|
|
|
template<class checker>
|
|
bool generic_validate_ascii(const uint8_t * input, size_t length) {
|
|
buf_block_reader<64> reader(input, length);
|
|
uint8_t blocks[64]{};
|
|
simd::simd8x64<uint8_t> running_or(blocks);
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
running_or |= in;
|
|
reader.advance();
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
running_or |= in;
|
|
return running_or.is_ascii();
|
|
}
|
|
|
|
bool generic_validate_ascii(const char * input, size_t length) {
|
|
return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
|
|
}
|
|
|
|
template<class checker>
|
|
result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
|
|
buf_block_reader<64> reader(input, length);
|
|
size_t count{0};
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
if (!in.is_ascii()) {
|
|
result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
|
|
return result(res.error, count + res.count);
|
|
}
|
|
reader.advance();
|
|
|
|
count += 64;
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
if (!in.is_ascii()) {
|
|
result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
|
|
return result(res.error, count + res.count);
|
|
} else {
|
|
return result(error_code::SUCCESS, length);
|
|
}
|
|
}
|
|
|
|
/** char-pointer overload of the ASCII check with error reporting. */
result generic_validate_ascii_with_errors(const char * input, size_t length) {
  const uint8_t * bytes = reinterpret_cast<const uint8_t *>(input);
  return generic_validate_ascii_with_errors<utf8_checker>(bytes, length);
}
|
|
|
|
} // namespace utf8_validation
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_validation/utf8_validator.h */
|
|
// transcoding from UTF-8 to UTF-16
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
|
|
/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
|
|
|
|
|
|
namespace simdutf {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace utf8_to_utf16 {
|
|
|
|
using namespace simd;
|
|
|
|
template <endianness endian>
|
|
simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
|
|
char16_t* utf16_output) noexcept {
|
|
// The implementation is not specific to haswell and should be moved to the generic directory.
|
|
size_t pos = 0;
|
|
char16_t* start{utf16_output};
|
|
const size_t safety_margin = 16; // to avoid overruns!
|
|
while(pos + 64 + safety_margin <= size) {
|
|
// this loop could be unrolled further. For example, we could process the mask
|
|
// far more than 64 bytes.
|
|
simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
|
|
if(in.is_ascii()) {
|
|
in.store_ascii_as_utf16<endian>(utf16_output);
|
|
utf16_output += 64;
|
|
pos += 64;
|
|
} else {
|
|
// Slow path. We hope that the compiler will recognize that this is a slow path.
|
|
// Anything that is not a continuation mask is a 'leading byte', that is, the
|
|
// start of a new code point.
|
|
uint64_t utf8_continuation_mask = in.lt(-65 + 1);
|
|
// -65 is 0b10111111 in two-complement's, so largest possible continuation byte
|
|
uint64_t utf8_leading_mask = ~utf8_continuation_mask;
|
|
// The *start* of code points is not so useful, rather, we want the *end* of code points.
|
|
uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
|
|
// We process in blocks of up to 12 bytes except possibly
|
|
// for fast paths which may process up to 16 bytes. For the
|
|
// slow path to work, we should have at least 12 input bytes left.
|
|
size_t max_starting_point = (pos + 64) - 12;
|
|
// Next loop is going to run at least five times when using solely
|
|
// the slow/regular path, and at least four times if there are fast paths.
|
|
while(pos < max_starting_point) {
|
|
// Performance note: our ability to compute 'consumed' and
|
|
// then shift and recompute is critical. If there is a
|
|
// latency of, say, 4 cycles on getting 'consumed', then
|
|
// the inner loop might have a total latency of about 6 cycles.
|
|
// Yet we process between 6 to 12 inputs bytes, thus we get
|
|
// a speed limit between 1 cycle/byte and 0.5 cycle/byte
|
|
// for this section of the code. Hence, there is a limit
|
|
// to how much we can further increase this latency before
|
|
// it seriously harms performance.
|
|
//
|
|
// Thus we may allow convert_masked_utf8_to_utf16 to process
|
|
// more bytes at a time under a fast-path mode where 16 bytes
|
|
// are consumed at once (e.g., when encountering ASCII).
|
|
size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
|
|
utf8_end_of_code_point_mask, utf16_output);
|
|
pos += consumed;
|
|
utf8_end_of_code_point_mask >>= consumed;
|
|
}
|
|
// At this point there may remain between 0 and 12 bytes in the
|
|
// 64-byte block. These bytes will be processed again. So we have an
|
|
// 80% efficiency (in the worst case). In practice we expect an
|
|
// 85% to 90% efficiency.
|
|
}
|
|
}
|
|
utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
|
|
return utf16_output - start;
|
|
}
|
|
|
|
} // namespace utf8_to_utf16
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
|
|
/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
|
|
|
|
|
|
namespace simdutf {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace utf8_to_utf16 {
|
|
using namespace simd;
|
|
|
|
|
|
/**
 * Classifies each (previous byte, current byte) pair with three 16-entry
 * nibble lookups and ANDs the results: an error bit survives only when the
 * high nibble of the previous byte, the low nibble of the previous byte and
 * the high nibble of the current byte all allow that error.
 *
 * @param input  current bytes
 * @param prev1  the same bytes shifted by one position (byte i holds the
 *               byte preceding input[i])
 * @return per-byte error flags using the bit assignments below
 */
simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Error bits in bit order. The patterns show <byte 1> <byte 2>.
  constexpr const uint8_t TOO_SHORT      = 1<<0; // 11______ 0_______  (lead byte followed by ASCII)
                                                 // 11______ 11______  (lead byte followed by lead byte)
  constexpr const uint8_t TOO_LONG       = 1<<1; // 0_______ 10______  (ASCII followed by continuation)
  constexpr const uint8_t OVERLONG_3     = 1<<2; // 11100000 100_____
  constexpr const uint8_t TOO_LARGE      = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
                                                 // 1111011_ 1001____
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
  constexpr const uint8_t SURROGATE      = 1<<4; // 11101101 101_____
  constexpr const uint8_t OVERLONG_2     = 1<<5; // 1100000_ 10______
  constexpr const uint8_t TOO_LARGE_1000 = 1<<6; // 11110101 1000____
                                                 // 1111011_ 1000____
                                                 // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4     = 1<<6; // 11110000 1000____  (shares bit 6 with TOO_LARGE_1000)
  constexpr const uint8_t TWO_CONTS      = 1<<7; // 10______ 10______

  // These errors do not depend on the low nibble of byte 1 ("____" in byte 1),
  // so every low-nibble entry must let them through.
  constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;
  // Low nibbles 0101 and above (except 1101, which also carries SURROGATE).
  constexpr const uint8_t CARRY_TOO_LARGE = CARRY | TOO_LARGE | TOO_LARGE_1000;
  // Flags common to every continuation byte (10______) in byte 2.
  constexpr const uint8_t CONTINUATION = TOO_LONG | OVERLONG_2 | TWO_CONTS;

  // Lookup on the high nibble of byte 1.
  const simd8<uint8_t> prev_high = prev1.shr<4>().lookup_16<uint8_t>(
    // 0_______ ________ <ASCII in byte 1>
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    // 10______ ________ <continuation in byte 1>
    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
    // 1100____ ________ <two byte lead in byte 1>
    TOO_SHORT | OVERLONG_2,
    // 1101____ ________ <two byte lead in byte 1>
    TOO_SHORT,
    // 1110____ ________ <three byte lead in byte 1>
    TOO_SHORT | OVERLONG_3 | SURROGATE,
    // 1111____ ________ <four+ byte lead in byte 1>
    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
  );

  // Lookup on the low nibble of byte 1.
  const simd8<uint8_t> prev_low = (prev1 & 0x0F).lookup_16<uint8_t>(
    // ____0000 ________
    CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
    // ____0001 ________
    CARRY | OVERLONG_2,
    // ____001_ ________
    CARRY,
    CARRY,

    // ____0100 ________
    CARRY | TOO_LARGE,
    // ____0101 ________
    CARRY_TOO_LARGE,
    // ____011_ ________
    CARRY_TOO_LARGE,
    CARRY_TOO_LARGE,

    // ____1___ ________
    CARRY_TOO_LARGE,
    CARRY_TOO_LARGE,
    CARRY_TOO_LARGE,
    CARRY_TOO_LARGE,
    CARRY_TOO_LARGE,
    // ____1101 ________
    CARRY_TOO_LARGE | SURROGATE,
    CARRY_TOO_LARGE,
    CARRY_TOO_LARGE
  );

  // Lookup on the high nibble of byte 2.
  const simd8<uint8_t> cur_high = input.shr<4>().lookup_16<uint8_t>(
    // ________ 0_______ <ASCII in byte 2>
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

    // ________ 1000____
    CONTINUATION | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
    // ________ 1001____
    CONTINUATION | OVERLONG_3 | TOO_LARGE,
    // ________ 101_____
    CONTINUATION | SURROGATE | TOO_LARGE,
    CONTINUATION | SURROGATE | TOO_LARGE,

    // ________ 11______
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
  );

  return (prev_high & prev_low & cur_high);
}
|
|
/**
 * Combines the special-case flags with the multi-byte length check.
 *
 * @param input       current bytes
 * @param prev_input  the preceding vector of bytes, used to look 2 and 3
 *                    bytes back across the vector boundary
 * @param sc          flags produced by check_special_cases for `input`
 * @return `sc` with its 0x80 bit XOR-ed against the 0x80 bit of
 *         must_be_2_3_continuation(prev2, prev3)
 *
 * NOTE(review): 0x80 is the TWO_CONTS bit of check_special_cases, so the XOR
 * presumably cancels TWO_CONTS where a 2nd/3rd continuation byte is required
 * and raises it where one is required but absent -- confirm against
 * must_be_2_3_continuation.
 */
simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
  simd8<uint8_t> two_back = input.prev<2>(prev_input);
  simd8<uint8_t> three_back = input.prev<3>(prev_input);
  simd8<uint8_t> required = simd8<uint8_t>(must_be_2_3_continuation(two_back, three_back));
  // Keep only the top bit so that it lines up with bit 7 of `sc`.
  simd8<uint8_t> required_top_bit = required & uint8_t(0x80);
  return required_top_bit ^ sc;
}
|
|
|
|
|
|
struct validating_transcoder {
|
|
// If this is nonzero, there has been a UTF-8 error.
|
|
simd8<uint8_t> error;
|
|
|
|
validating_transcoder() : error(uint8_t(0)) {}
|
|
//
|
|
// Check whether the current bytes are valid UTF-8.
|
|
//
|
|
simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
|
|
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
|
|
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
|
|
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
|
|
simd8<uint8_t> sc = check_special_cases(input, prev1);
|
|
this->error |= check_multibyte_lengths(input, prev_input, sc);
|
|
}
|
|
|
|
|
|
template <endianness endian>
|
|
simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) {
|
|
size_t pos = 0;
|
|
char16_t* start{utf16_output};
|
|
// In the worst case, we have the haswell kernel which can cause an overflow of
|
|
// 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
|
|
// and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
|
|
// much more than 8 bytes. However, you cannot generally assume that you have valid
|
|
// UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
|
|
// to give us a good margin.
|
|
size_t leading_byte = 0;
|
|
size_t margin = size;
|
|
for(; margin > 0 && leading_byte < 8; margin--) {
|
|
leading_byte += (int8_t(in[margin-1]) > -65);
|
|
}
|
|
// If the input is long enough, then we have that margin-1 is the eight last leading byte.
|
|
const size_t safety_margin = size - margin + 1; // to avoid overruns!
|
|
while(pos + 64 + safety_margin <= size) {
|
|
simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
|
|
if(input.is_ascii()) {
|
|
input.store_ascii_as_utf16<endian>(utf16_output);
|
|
utf16_output += 64;
|
|
pos += 64;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support either two or four chunks per 64-byte block.");
|
|
auto zero = simd8<uint8_t>{uint8_t(0)};
|
|
if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
uint64_t utf8_continuation_mask = input.lt(-65 + 1);
|
|
uint64_t utf8_leading_mask = ~utf8_continuation_mask;
|
|
uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
|
|
// We process in blocks of up to 12 bytes except possibly
|
|
// for fast paths which may process up to 16 bytes. For the
|
|
// slow path to work, we should have at least 12 input bytes left.
|
|
size_t max_starting_point = (pos + 64) - 12;
|
|
// Next loop is going to run at least five times.
|
|
while(pos < max_starting_point) {
|
|
// Performance note: our ability to compute 'consumed' and
|
|
// then shift and recompute is critical. If there is a
|
|
// latency of, say, 4 cycles on getting 'consumed', then
|
|
// the inner loop might have a total latency of about 6 cycles.
|
|
// Yet we process between 6 to 12 inputs bytes, thus we get
|
|
// a speed limit between 1 cycle/byte and 0.5 cycle/byte
|
|
// for this section of the code. Hence, there is a limit
|
|
// to how much we can further increase this latency before
|
|
// it seriously harms performance.
|
|
size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
|
|
utf8_end_of_code_point_mask, utf16_output);
|
|
pos += consumed;
|
|
utf8_end_of_code_point_mask >>= consumed;
|
|
}
|
|
// At this point there may remain between 0 and 12 bytes in the
|
|
// 64-byte block. These bytes will be processed again. So we have an
|
|
// 80% efficiency (in the worst case). In practice we expect an
|
|
// 85% to 90% efficiency.
|
|
}
|
|
}
|
|
if(errors()) { return 0; }
|
|
if(pos < size) {
|
|
size_t howmany = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
|
|
if(howmany == 0) { return 0; }
|
|
utf16_output += howmany;
|
|
}
|
|
return utf16_output - start;
|
|
}
|
|
|
|
template <endianness endian>
|
|
simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) {
|
|
size_t pos = 0;
|
|
char16_t* start{utf16_output};
|
|
// In the worst case, we have the haswell kernel which can cause an overflow of
|
|
// 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
|
|
// and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
|
|
// much more than 8 bytes. However, you cannot generally assume that you have valid
|
|
// UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
|
|
// to give us a good margin.
|
|
size_t leading_byte = 0;
|
|
size_t margin = size;
|
|
for(; margin > 0 && leading_byte < 8; margin--) {
|
|
leading_byte += (int8_t(in[margin-1]) > -65);
|
|
}
|
|
// If the input is long enough, then we have that margin-1 is the eight last leading byte.
|
|
const size_t safety_margin = size - margin + 1; // to avoid overruns!
|
|
while(pos + 64 + safety_margin <= size) {
|
|
simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
|
|
if(input.is_ascii()) {
|
|
input.store_ascii_as_utf16<endian>(utf16_output);
|
|
utf16_output += 64;
|
|
pos += 64;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support either two or four chunks per 64-byte block.");
|
|
auto zero = simd8<uint8_t>{uint8_t(0)};
|
|
if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
if (errors()) {
|
|
// rewind_and_convert_with_errors will seek a potential error from in+pos onward,
|
|
// with the ability to go back up to pos bytes, and read size-pos bytes forward.
|
|
result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
|
|
res.count += pos;
|
|
return res;
|
|
}
|
|
uint64_t utf8_continuation_mask = input.lt(-65 + 1);
|
|
uint64_t utf8_leading_mask = ~utf8_continuation_mask;
|
|
uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
|
|
// We process in blocks of up to 12 bytes except possibly
|
|
// for fast paths which may process up to 16 bytes. For the
|
|
// slow path to work, we should have at least 12 input bytes left.
|
|
size_t max_starting_point = (pos + 64) - 12;
|
|
// Next loop is going to run at least five times.
|
|
while(pos < max_starting_point) {
|
|
// Performance note: our ability to compute 'consumed' and
|
|
// then shift and recompute is critical. If there is a
|
|
// latency of, say, 4 cycles on getting 'consumed', then
|
|
// the inner loop might have a total latency of about 6 cycles.
|
|
// Yet we process between 6 to 12 inputs bytes, thus we get
|
|
// a speed limit between 1 cycle/byte and 0.5 cycle/byte
|
|
// for this section of the code. Hence, there is a limit
|
|
// to how much we can further increase this latency before
|
|
// it seriously harms performance.
|
|
size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
|
|
utf8_end_of_code_point_mask, utf16_output);
|
|
pos += consumed;
|
|
utf8_end_of_code_point_mask >>= consumed;
|
|
}
|
|
// At this point there may remain between 0 and 12 bytes in the
|
|
// 64-byte block. These bytes will be processed again. So we have an
|
|
// 80% efficiency (in the worst case). In practice we expect an
|
|
// 85% to 90% efficiency.
|
|
}
|
|
}
|
|
if(errors()) {
|
|
// rewind_and_convert_with_errors will seek a potential error from in+pos onward,
|
|
// with the ability to go back up to pos bytes, and read size-pos bytes forward.
|
|
result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
|
|
res.count += pos;
|
|
return res;
|
|
}
|
|
if(pos < size) {
|
|
// rewind_and_convert_with_errors will seek a potential error from in+pos onward,
|
|
// with the ability to go back up to pos bytes, and read size-pos bytes forward.
|
|
result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
|
|
if (res.error) { // In case of error, we want the error position
|
|
res.count += pos;
|
|
return res;
|
|
} else { // In case of success, we want the number of word written
|
|
utf16_output += res.count;
|
|
}
|
|
}
|
|
return result(error_code::SUCCESS, utf16_output - start);
|
|
}
|
|
|
|
simdutf_really_inline bool errors() const {
|
|
return this->error.any_bits_set_anywhere();
|
|
}
|
|
|
|
}; // struct utf8_checker
|
|
} // utf8_to_utf16 namespace
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
|
|
// transcoding from UTF-8 to UTF-32
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h
|
|
/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
|
|
|
|
namespace simdutf {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace utf8_to_utf32 {
|
|
|
|
using namespace simd;
|
|
|
|
|
|
/**
 * Transcodes UTF-8 to UTF-32 without performing any validation.
 *
 * @param input         UTF-8 bytes
 * @param size          number of bytes in input
 * @param utf32_output  destination buffer for UTF-32 code points
 * @return number of char32_t values written
 *
 * The SIMD loop only runs while at least 64 + 16 bytes remain (the 16-byte
 * margin is there to avoid overruns); whatever is left is handed to the
 * scalar converter.
 */
simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
    char32_t* utf32_output) noexcept {
  char32_t* const output_begin = utf32_output;
  const size_t safety_margin = 16; // to avoid overruns!
  size_t pos = 0;

  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> block(reinterpret_cast<const int8_t *>(input + pos));

    if (block.is_ascii()) {
      // Fast path: 64 ASCII bytes become 64 UTF-32 code points.
      block.store_ascii_as_utf32(utf32_output);
      utf32_output += 64;
      pos += 64;
      continue;
    }

    // -65 is 0b10111111 in two's complement, so the largest possible
    // continuation byte; every byte below -65 + 1 is a continuation byte.
    const uint64_t continuation_mask = block.lt(-65 + 1);
    // Leading bytes are the complement; shifting right by one marks the end
    // of each code point.
    uint64_t end_of_code_point_mask = (~continuation_mask) >> 1;

    // Do not start a masked conversion within the last 12 bytes of the block.
    const size_t last_start = (pos + 64) - 12;
    while (pos < last_start) {
      const size_t consumed = convert_masked_utf8_to_utf32(input + pos,
                                end_of_code_point_mask, utf32_output);
      pos += consumed;
      end_of_code_point_mask >>= consumed;
    }
  }

  // Scalar tail.
  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
  return utf32_output - output_begin;
}
|
|
|
|
|
|
} // namespace utf8_to_utf32
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
|
|
/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
|
|
|
|
|
|
namespace simdutf {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace utf8_to_utf32 {
|
|
using namespace simd;
|
|
|
|
|
|
/**
 * Classifies each (previous byte, current byte) pair with three 16-entry
 * nibble lookups and ANDs the results: an error bit survives only when the
 * high nibble of the previous byte, the low nibble of the previous byte and
 * the high nibble of the current byte all allow that error.
 *
 * @param input  current bytes
 * @param prev1  the same bytes shifted by one position (byte i holds the
 *               byte preceding input[i])
 * @return per-byte error flags using the bit assignments below
 */
simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Error bits in bit order. The patterns show <byte 1> <byte 2>.
  constexpr const uint8_t TOO_SHORT      = 1<<0; // 11______ 0_______  (lead byte followed by ASCII)
                                                 // 11______ 11______  (lead byte followed by lead byte)
  constexpr const uint8_t TOO_LONG       = 1<<1; // 0_______ 10______  (ASCII followed by continuation)
  constexpr const uint8_t OVERLONG_3     = 1<<2; // 11100000 100_____
  constexpr const uint8_t TOO_LARGE      = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
                                                 // 1111011_ 1001____
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
  constexpr const uint8_t SURROGATE      = 1<<4; // 11101101 101_____
  constexpr const uint8_t OVERLONG_2     = 1<<5; // 1100000_ 10______
  constexpr const uint8_t TOO_LARGE_1000 = 1<<6; // 11110101 1000____
                                                 // 1111011_ 1000____
                                                 // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4     = 1<<6; // 11110000 1000____  (shares bit 6 with TOO_LARGE_1000)
  constexpr const uint8_t TWO_CONTS      = 1<<7; // 10______ 10______

  // These errors do not depend on the low nibble of byte 1 ("____" in byte 1),
  // so every low-nibble entry must let them through.
  constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;
  // Low nibbles 0101 and above (except 1101, which also carries SURROGATE).
  constexpr const uint8_t CARRY_TOO_LARGE = CARRY | TOO_LARGE | TOO_LARGE_1000;
  // Flags common to every continuation byte (10______) in byte 2.
  constexpr const uint8_t CONTINUATION = TOO_LONG | OVERLONG_2 | TWO_CONTS;

  // Lookup on the high nibble of byte 1.
  const simd8<uint8_t> prev_high = prev1.shr<4>().lookup_16<uint8_t>(
    // 0_______ ________ <ASCII in byte 1>
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    // 10______ ________ <continuation in byte 1>
    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
    // 1100____ ________ <two byte lead in byte 1>
    TOO_SHORT | OVERLONG_2,
    // 1101____ ________ <two byte lead in byte 1>
    TOO_SHORT,
    // 1110____ ________ <three byte lead in byte 1>
    TOO_SHORT | OVERLONG_3 | SURROGATE,
    // 1111____ ________ <four+ byte lead in byte 1>
    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
  );

  // Lookup on the low nibble of byte 1.
  const simd8<uint8_t> prev_low = (prev1 & 0x0F).lookup_16<uint8_t>(
    // ____0000 ________
    CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
    // ____0001 ________
    CARRY | OVERLONG_2,
    // ____001_ ________
    CARRY,
    CARRY,

    // ____0100 ________
    CARRY | TOO_LARGE,
    // ____0101 ________
    CARRY_TOO_LARGE,
    // ____011_ ________
    CARRY_TOO_LARGE,
    CARRY_TOO_LARGE,

    // ____1___ ________
    CARRY_TOO_LARGE,
    CARRY_TOO_LARGE,
    CARRY_TOO_LARGE,
    CARRY_TOO_LARGE,
    CARRY_TOO_LARGE,
    // ____1101 ________
    CARRY_TOO_LARGE | SURROGATE,
    CARRY_TOO_LARGE,
    CARRY_TOO_LARGE
  );

  // Lookup on the high nibble of byte 2.
  const simd8<uint8_t> cur_high = input.shr<4>().lookup_16<uint8_t>(
    // ________ 0_______ <ASCII in byte 2>
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

    // ________ 1000____
    CONTINUATION | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
    // ________ 1001____
    CONTINUATION | OVERLONG_3 | TOO_LARGE,
    // ________ 101_____
    CONTINUATION | SURROGATE | TOO_LARGE,
    CONTINUATION | SURROGATE | TOO_LARGE,

    // ________ 11______
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
  );

  return (prev_high & prev_low & cur_high);
}
|
|
/**
 * Combines the special-case flags with the multi-byte length check.
 *
 * @param input       current bytes
 * @param prev_input  the preceding vector of bytes, used to look 2 and 3
 *                    bytes back across the vector boundary
 * @param sc          flags produced by check_special_cases for `input`
 * @return `sc` with its 0x80 bit XOR-ed against the 0x80 bit of
 *         must_be_2_3_continuation(prev2, prev3)
 *
 * NOTE(review): 0x80 is the TWO_CONTS bit of check_special_cases, so the XOR
 * presumably cancels TWO_CONTS where a 2nd/3rd continuation byte is required
 * and raises it where one is required but absent -- confirm against
 * must_be_2_3_continuation.
 */
simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
  simd8<uint8_t> two_back = input.prev<2>(prev_input);
  simd8<uint8_t> three_back = input.prev<3>(prev_input);
  simd8<uint8_t> required = simd8<uint8_t>(must_be_2_3_continuation(two_back, three_back));
  // Keep only the top bit so that it lines up with bit 7 of `sc`.
  simd8<uint8_t> required_top_bit = required & uint8_t(0x80);
  return required_top_bit ^ sc;
}
|
|
|
|
|
|
struct validating_transcoder {
|
|
// If this is nonzero, there has been a UTF-8 error.
|
|
simd8<uint8_t> error;
|
|
|
|
validating_transcoder() : error(uint8_t(0)) {}
|
|
//
|
|
// Check whether the current bytes are valid UTF-8.
|
|
//
|
|
simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
|
|
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
|
|
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
|
|
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
|
|
simd8<uint8_t> sc = check_special_cases(input, prev1);
|
|
this->error |= check_multibyte_lengths(input, prev_input, sc);
|
|
}
|
|
|
|
|
|
|
|
simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) {
|
|
size_t pos = 0;
|
|
char32_t* start{utf32_output};
|
|
// In the worst case, we have the haswell kernel which can cause an overflow of
|
|
// 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
|
|
// and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
|
|
// much more than 8 bytes. However, you cannot generally assume that you have valid
|
|
// UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
|
|
// to give us a good margin.
|
|
size_t leading_byte = 0;
|
|
size_t margin = size;
|
|
for(; margin > 0 && leading_byte < 4; margin--) {
|
|
leading_byte += (int8_t(in[margin-1]) > -65);
|
|
}
|
|
// If the input is long enough, then we have that margin-1 is the fourth last leading byte.
|
|
const size_t safety_margin = size - margin + 1; // to avoid overruns!
|
|
while(pos + 64 + safety_margin <= size) {
|
|
simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
|
|
if(input.is_ascii()) {
|
|
input.store_ascii_as_utf32(utf32_output);
|
|
utf32_output += 64;
|
|
pos += 64;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support either two or four chunks per 64-byte block.");
|
|
auto zero = simd8<uint8_t>{uint8_t(0)};
|
|
if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
uint64_t utf8_continuation_mask = input.lt(-65 + 1);
|
|
uint64_t utf8_leading_mask = ~utf8_continuation_mask;
|
|
uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
|
|
// We process in blocks of up to 12 bytes except possibly
|
|
// for fast paths which may process up to 16 bytes. For the
|
|
// slow path to work, we should have at least 12 input bytes left.
|
|
size_t max_starting_point = (pos + 64) - 12;
|
|
// Next loop is going to run at least five times.
|
|
while(pos < max_starting_point) {
|
|
// Performance note: our ability to compute 'consumed' and
|
|
// then shift and recompute is critical. If there is a
|
|
// latency of, say, 4 cycles on getting 'consumed', then
|
|
// the inner loop might have a total latency of about 6 cycles.
|
|
// Yet we process between 6 to 12 inputs bytes, thus we get
|
|
// a speed limit between 1 cycle/byte and 0.5 cycle/byte
|
|
// for this section of the code. Hence, there is a limit
|
|
// to how much we can further increase this latency before
|
|
// it seriously harms performance.
|
|
size_t consumed = convert_masked_utf8_to_utf32(in + pos,
|
|
utf8_end_of_code_point_mask, utf32_output);
|
|
pos += consumed;
|
|
utf8_end_of_code_point_mask >>= consumed;
|
|
}
|
|
// At this point there may remain between 0 and 12 bytes in the
|
|
// 64-byte block. These bytes will be processed again. So we have an
|
|
// 80% efficiency (in the worst case). In practice we expect an
|
|
// 85% to 90% efficiency.
|
|
}
|
|
}
|
|
if(errors()) { return 0; }
|
|
if(pos < size) {
|
|
size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
|
|
if(howmany == 0) { return 0; }
|
|
utf32_output += howmany;
|
|
}
|
|
return utf32_output - start;
|
|
}
|
|
|
|
simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) {
|
|
size_t pos = 0;
|
|
char32_t* start{utf32_output};
|
|
// In the worst case, we have the haswell kernel which can cause an overflow of
|
|
// 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
|
|
// and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
|
|
// much more than 8 bytes. However, you cannot generally assume that you have valid
|
|
// UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
|
|
// to give us a good margin.
|
|
size_t leading_byte = 0;
|
|
size_t margin = size;
|
|
for(; margin > 0 && leading_byte < 4; margin--) {
|
|
leading_byte += (int8_t(in[margin-1]) > -65);
|
|
}
|
|
// If the input is long enough, then we have that margin-1 is the fourth last leading byte.
|
|
const size_t safety_margin = size - margin + 1; // to avoid overruns!
|
|
while(pos + 64 + safety_margin <= size) {
|
|
simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
|
|
if(input.is_ascii()) {
|
|
input.store_ascii_as_utf32(utf32_output);
|
|
utf32_output += 64;
|
|
pos += 64;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support either two or four chunks per 64-byte block.");
|
|
auto zero = simd8<uint8_t>{uint8_t(0)};
|
|
if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
if (errors()) {
|
|
result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
|
|
res.count += pos;
|
|
return res;
|
|
}
|
|
uint64_t utf8_continuation_mask = input.lt(-65 + 1);
|
|
uint64_t utf8_leading_mask = ~utf8_continuation_mask;
|
|
uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
|
|
// We process in blocks of up to 12 bytes except possibly
|
|
// for fast paths which may process up to 16 bytes. For the
|
|
// slow path to work, we should have at least 12 input bytes left.
|
|
size_t max_starting_point = (pos + 64) - 12;
|
|
// Next loop is going to run at least five times.
|
|
while(pos < max_starting_point) {
|
|
// Performance note: our ability to compute 'consumed' and
|
|
// then shift and recompute is critical. If there is a
|
|
// latency of, say, 4 cycles on getting 'consumed', then
|
|
// the inner loop might have a total latency of about 6 cycles.
|
|
// Yet we process between 6 to 12 inputs bytes, thus we get
|
|
// a speed limit between 1 cycle/byte and 0.5 cycle/byte
|
|
// for this section of the code. Hence, there is a limit
|
|
// to how much we can further increase this latency before
|
|
// it seriously harms performance.
|
|
size_t consumed = convert_masked_utf8_to_utf32(in + pos,
|
|
utf8_end_of_code_point_mask, utf32_output);
|
|
pos += consumed;
|
|
utf8_end_of_code_point_mask >>= consumed;
|
|
}
|
|
// At this point there may remain between 0 and 12 bytes in the
|
|
// 64-byte block. These bytes will be processed again. So we have an
|
|
// 80% efficiency (in the worst case). In practice we expect an
|
|
// 85% to 90% efficiency.
|
|
}
|
|
}
|
|
if(errors()) {
|
|
result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
|
|
res.count += pos;
|
|
return res;
|
|
}
|
|
if(pos < size) {
|
|
result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
|
|
if (res.error) { // In case of error, we want the error position
|
|
res.count += pos;
|
|
return res;
|
|
} else { // In case of success, we want the number of word written
|
|
utf32_output += res.count;
|
|
}
|
|
}
|
|
return result(error_code::SUCCESS, utf32_output - start);
|
|
}
|
|
|
|
// Returns true when any bit is set anywhere in this checker's `error` member.
simdutf_really_inline bool errors() const {
  const bool any_error_bit = this->error.any_bits_set_anywhere();
  return any_error_bit;
}
|
|
|
|
}; // struct utf8_checker
|
|
} // utf8_to_utf32 namespace
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
|
|
// other functions
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8.h
|
|
/* begin file src/generic/utf8.h */
|
|
|
|
namespace simdutf {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace utf8 {
|
|
|
|
using namespace simd;
|
|
|
|
// Counts the UTF-8 code points in `in[0..size)`.
// Full 64-byte blocks are handled here by counting the bytes that are not
// continuation bytes; whatever is left over is delegated to
// scalar::utf8::count_code_points.
simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
  size_t total = 0;
  size_t offset = 0;
  while (offset + 64 <= size) {
    simd8x64<int8_t> block(reinterpret_cast<const int8_t *>(in + offset));
    // Viewed as signed bytes, the lanes below -64 are the continuation bytes.
    const uint64_t continuation_bits = block.lt(-65 + 1);
    total += 64 - count_ones(continuation_bits);
    offset += 64;
  }
  // Fewer than 64 bytes remain at this point.
  return total + scalar::utf8::count_code_points(in + offset, size - offset);
}
|
|
|
|
|
|
// Computes the number of UTF-16 code units needed to hold the UTF-8 input
// `in[0..size)`.
// Full 64-byte blocks are processed here; the remaining tail is delegated to
// scalar::utf8::utf16_length_from_utf8.
simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
  size_t pos = 0;
  size_t count = 0;
  // This algorithm could no doubt be improved!
  for(;pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    const uint64_t utf8_continuation_mask = input.lt(-65 + 1);
    // We count one word for anything that is not a continuation (so
    // leading bytes).
    count += 64 - count_ones(utf8_continuation_mask);
    // Bytes >= 240 get one additional word each. The mask is kept unsigned,
    // like the continuation mask above, since it is only ever popcounted
    // (assumes gteq_unsigned yields a 64-bit lane mask -- TODO confirm).
    const uint64_t utf8_4byte = input.gteq_unsigned(240);
    count += count_ones(utf8_4byte);
  }
  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
}
|
|
|
|
|
|
// The UTF-32 length of a UTF-8 input is its code point count, so this simply
// forwards to count_code_points.
simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) {
  const size_t code_points = count_code_points(in, size);
  return code_points;
}
|
|
} // utf8 namespace
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf16.h
|
|
/* begin file src/generic/utf16.h */
|
|
namespace simdutf {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace utf16 {
|
|
|
|
template <endianness big_endian>
|
|
simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) {
|
|
size_t pos = 0;
|
|
size_t count = 0;
|
|
for(;pos + 32 <= size; pos += 32) {
|
|
simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
|
|
if (!match_system(big_endian)) input.swap_bytes();
|
|
uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
|
|
count += count_ones(not_pair) / 2;
|
|
}
|
|
return count + scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
|
|
}
|
|
|
|
template <endianness big_endian>
|
|
simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) {
|
|
size_t pos = 0;
|
|
size_t count = 0;
|
|
// This algorithm could no doubt be improved!
|
|
for(;pos + 32 <= size; pos += 32) {
|
|
simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
|
|
if (!match_system(big_endian)) input.swap_bytes();
|
|
uint64_t ascii_mask = input.lteq(0x7F);
|
|
uint64_t twobyte_mask = input.lteq(0x7FF);
|
|
uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
|
|
|
|
size_t ascii_count = count_ones(ascii_mask) / 2;
|
|
size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2;
|
|
size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2;
|
|
size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
|
|
count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
|
|
}
|
|
return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos, size - pos);
|
|
}
|
|
|
|
template <endianness big_endian>
|
|
simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) {
|
|
return count_code_points<big_endian>(in, size);
|
|
}
|
|
|
|
// Writes to `output` a copy of `in[0..size)` with the two bytes of every
// 16-bit unit swapped. Blocks of 32 units are handled here; the remaining
// units are delegated to scalar::utf16::change_endianness_utf16.
simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) {
  size_t done = 0;
  for (; done + 32 <= size; done += 32) {
    simd16x32<uint16_t> block(reinterpret_cast<const uint16_t *>(in + done));
    block.swap_bytes();
    block.store(reinterpret_cast<uint16_t *>(output + done));
  }
  scalar::utf16::change_endianness_utf16(in + done, size - done, output + done);
}
|
|
|
|
} // utf16
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf16.h */
|
|
|
|
//
|
|
// Implementation-specific overrides
|
|
//
|
|
namespace simdutf {
|
|
namespace ppc64 {
|
|
|
|
simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
|
|
// If there is a BOM, then we trust it.
|
|
auto bom_encoding = simdutf::BOM::check_bom(input, length);
|
|
if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
|
|
int out = 0;
|
|
if(validate_utf8(input, length)) { out |= encoding_type::UTF8; }
|
|
if((length % 2) == 0) {
|
|
if(validate_utf16(reinterpret_cast<const char16_t*>(input), length/2)) { out |= encoding_type::UTF16_LE; }
|
|
}
|
|
if((length % 4) == 0) {
|
|
if(validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) { out |= encoding_type::UTF32_LE; }
|
|
}
|
|
|
|
return out;
|
|
}
|
|
|
|
simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
|
|
return ppc64::utf8_validation::generic_validate_utf8(buf,len);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
|
|
return ppc64::utf8_validation::generic_validate_utf8_with_errors(buf,len);
|
|
}
|
|
|
|
simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
|
|
return ppc64::utf8_validation::generic_validate_ascii(buf,len);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
|
|
return ppc64::utf8_validation::generic_validate_ascii_with_errors(buf,len);
|
|
}
|
|
|
|
simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
|
|
return scalar::utf16::validate<endianness::LITTLE>(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
|
|
return scalar::utf16::validate<endianness::BIG>(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
|
|
return scalar::utf16::validate_with_errors<endianness::LITTLE>(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
|
|
return scalar::utf16::validate_with_errors<endianness::BIG>(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
|
|
return scalar::utf32::validate_with_errors(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused bool implementation::validate_utf32(const char16_t *buf, size_t len) const noexcept {
|
|
return scalar::utf32::validate(buf, len);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
|
|
return 0; // stub
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
|
|
return 0; // stub
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
|
|
return result(error_code::OTHER, 0); // stub
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
|
|
return result(error_code::OTHER, 0); // stub
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
|
|
return 0; // stub
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
|
|
return 0; // stub
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept {
|
|
return 0; // stub
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept {
|
|
return result(error_code::OTHER, 0); // stub
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept {
|
|
return 0; // stub
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
|
|
return scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
|
|
return scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
|
|
return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
|
|
return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
|
|
return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
|
|
return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
|
|
return scalar::utf32_to_utf8::convert(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
|
|
return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
|
|
return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
|
|
return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
|
|
return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
|
|
return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
|
|
return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
|
|
return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
|
|
return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
|
|
return scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len, utf32_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
|
|
return scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len, utf32_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
|
|
return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(buf, len, utf32_output);
|
|
}
|
|
|
|
simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
|
|
return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(buf, len, utf32_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
|
|
return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(buf, len, utf32_output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
|
|
return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len, utf32_output);
|
|
}
|
|
|
|
void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
|
|
scalar::utf16::change_endianness_utf16(input, length, output);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
|
|
return scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
|
|
return scalar::utf16::count_code_points<endianness::BIG>(input, length);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
|
|
return utf8::count_code_points(input, length);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
|
|
return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
|
|
return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
|
|
return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
|
|
return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
|
|
return scalar::utf8::utf16_length_from_utf8(input, length);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
|
|
return scalar::utf32::utf8_length_from_utf32(input, length);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
|
|
return scalar::utf32::utf16_length_from_utf32(input, length);
|
|
}
|
|
|
|
simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
|
|
return scalar::utf8::count_code_points(input, length);
|
|
}
|
|
|
|
} // namespace ppc64
|
|
} // namespace simdutf
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/end.h
|
|
/* begin file src/simdutf/ppc64/end.h */
|
|
/* end file src/simdutf/ppc64/end.h */
|
|
/* end file src/ppc64/implementation.cpp */
|
|
#endif
|
|
#if SIMDUTF_IMPLEMENTATION_WESTMERE
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/implementation.cpp
|
|
/* begin file src/westmere/implementation.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/begin.h
|
|
/* begin file src/simdutf/westmere/begin.h */
|
|
// redefining SIMDUTF_IMPLEMENTATION to "westmere"
|
|
// #define SIMDUTF_IMPLEMENTATION westmere
|
|
|
|
#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
|
|
// nothing needed.
|
|
#else
|
|
SIMDUTF_TARGET_WESTMERE
|
|
#endif
|
|
/* end file src/simdutf/westmere/begin.h */
|
|
namespace simdutf {
|
|
namespace westmere {
|
|
namespace {
|
|
#ifndef SIMDUTF_WESTMERE_H
|
|
#error "westmere.h must be included"
|
|
#endif
|
|
using namespace simd;
|
|
|
|
// Returns whether the OR-reduction of the 64-byte `input` block passes the
// vector's is_ascii() test.
simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
  auto folded = input.reduce_or();
  return folded.is_ascii();
}
|
|
|
|
// Builds a per-lane boolean that is set where prev1 is at least 0b11000000,
// prev2 is at least 0b11100000, or prev3 is at least 0b11110000.
simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  // A saturating subtraction of (threshold - 1) leaves a non-zero lane exactly
  // where the byte reaches the threshold.
  simd8<uint8_t> after_11 = prev1.saturating_sub(0b11000000u-1);   // non-zero only for 11______
  simd8<uint8_t> after_111 = prev2.saturating_sub(0b11100000u-1);  // non-zero only for 111_____
  simd8<uint8_t> after_1111 = prev3.saturating_sub(0b11110000u-1); // non-zero only for 1111____
  simd8<uint8_t> merged = after_11 | after_111 | after_1111;
  // The caller wants an all-ones boolean per lane. No difference exceeds 64,
  // so comparing the lanes as signed bytes is safe.
  return simd8<int8_t>(merged) > int8_t(0);
}
|
|
|
|
// Builds a per-lane boolean that is set where prev2 is at least 0b11100000 or
// prev3 is at least 0b11110000.
simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  // A saturating subtraction of (threshold - 1) leaves a non-zero lane exactly
  // where the byte reaches the threshold.
  simd8<uint8_t> after_111 = prev2.saturating_sub(0b11100000u-1);  // non-zero only for 111_____
  simd8<uint8_t> after_1111 = prev3.saturating_sub(0b11110000u-1); // non-zero only for 1111____
  simd8<uint8_t> merged = after_111 | after_1111;
  // The caller wants an all-ones boolean per lane. The differences are small
  // enough that comparing the lanes as signed bytes is safe.
  return simd8<int8_t>(merged) > int8_t(0);
}
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_detect_encodings.cpp
|
|
/* begin file src/westmere/sse_detect_encodings.cpp */
|
|
template<class checker>
|
|
// len is known to be a multiple of 2 when this is called
|
|
int sse_detect_encodings(const char * buf, size_t len) {
|
|
const char* start = buf;
|
|
const char* end = buf + len;
|
|
|
|
bool is_utf8 = true;
|
|
bool is_utf16 = true;
|
|
bool is_utf32 = true;
|
|
|
|
int out = 0;
|
|
|
|
const auto v_d8 = simd8<uint8_t>::splat(0xd8);
|
|
const auto v_f8 = simd8<uint8_t>::splat(0xf8);
|
|
|
|
__m128i currentmax = _mm_setzero_si128();
|
|
|
|
checker check{};
|
|
|
|
while(buf + 64 <= end) {
|
|
__m128i in = _mm_loadu_si128((__m128i*)buf);
|
|
__m128i secondin = _mm_loadu_si128((__m128i*)buf+1);
|
|
__m128i thirdin = _mm_loadu_si128((__m128i*)buf+2);
|
|
__m128i fourthin = _mm_loadu_si128((__m128i*)buf+3);
|
|
|
|
const auto u0 = simd16<uint16_t>(in);
|
|
const auto u1 = simd16<uint16_t>(secondin);
|
|
const auto u2 = simd16<uint16_t>(thirdin);
|
|
const auto u3 = simd16<uint16_t>(fourthin);
|
|
|
|
const auto v0 = u0.shr<8>();
|
|
const auto v1 = u1.shr<8>();
|
|
const auto v2 = u2.shr<8>();
|
|
const auto v3 = u3.shr<8>();
|
|
|
|
const auto in16 = simd16<uint16_t>::pack(v0, v1);
|
|
const auto nextin16 = simd16<uint16_t>::pack(v2, v3);
|
|
|
|
const auto surrogates_wordmask0 = (in16 & v_f8) == v_d8;
|
|
const auto surrogates_wordmask1 = (nextin16 & v_f8) == v_d8;
|
|
uint16_t surrogates_bitmask0 = static_cast<uint16_t>(surrogates_wordmask0.to_bitmask());
|
|
uint16_t surrogates_bitmask1 = static_cast<uint16_t>(surrogates_wordmask1.to_bitmask());
|
|
|
|
// Check for surrogates
|
|
if (surrogates_bitmask0 != 0x0 || surrogates_bitmask1 != 0x0) {
|
|
// Cannot be UTF8
|
|
is_utf8 = false;
|
|
// Can still be either UTF-16LE or UTF-32 depending on the positions of the surrogates
|
|
// To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word.
|
|
// On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant
|
|
// bytes of a 32-bit word since they always come in pairs in UTF-16LE.
|
|
// Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit words.
|
|
|
|
if (((surrogates_bitmask0 | surrogates_bitmask1) & 0xaaaa) != 0) {
|
|
is_utf32 = false;
|
|
// Code from sse_validate_utf16le.cpp
|
|
// Not efficient, we do not process surrogates_bitmask1
|
|
const char16_t * input = reinterpret_cast<const char16_t*>(buf);
|
|
const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len/2;
|
|
|
|
const auto v_fc = simd8<uint8_t>::splat(0xfc);
|
|
const auto v_dc = simd8<uint8_t>::splat(0xdc);
|
|
|
|
const uint16_t V0 = static_cast<uint16_t>(~surrogates_bitmask0);
|
|
|
|
const auto vH0 = (in16 & v_fc) == v_dc;
|
|
const uint16_t H0 = static_cast<uint16_t>(vH0.to_bitmask());
|
|
|
|
const uint16_t L0 = static_cast<uint16_t>(~H0 & surrogates_bitmask0);
|
|
|
|
const uint16_t a0 = static_cast<uint16_t>(L0 & (H0 >> 1));
|
|
|
|
const uint16_t b0 = static_cast<uint16_t>(a0 << 1);
|
|
|
|
const uint16_t c0 = static_cast<uint16_t>(V0 | a0 | b0);
|
|
|
|
if (c0 == 0xffff) {
|
|
input += 16;
|
|
} else if (c0 == 0x7fff) {
|
|
input += 15;
|
|
} else {
|
|
is_utf16 = false;
|
|
break;
|
|
}
|
|
|
|
while (input + simd16<uint16_t>::SIZE * 2 < end16) {
|
|
const auto in0 = simd16<uint16_t>(input);
|
|
const auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
|
|
|
|
const auto t0 = in0.shr<8>();
|
|
const auto t1 = in1.shr<8>();
|
|
|
|
const auto in_16 = simd16<uint16_t>::pack(t0, t1);
|
|
|
|
const auto surrogates_wordmask = (in_16 & v_f8) == v_d8;
|
|
const uint16_t surrogates_bitmask = static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
|
|
if (surrogates_bitmask == 0x0) {
|
|
input += 16;
|
|
} else {
|
|
const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
|
|
|
|
const auto vH = (in_16 & v_fc) == v_dc;
|
|
const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
|
|
|
|
const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
|
|
|
|
const uint16_t a = static_cast<uint16_t>(L & (H >> 1));
|
|
|
|
const uint16_t b = static_cast<uint16_t>(a << 1);
|
|
|
|
const uint16_t c = static_cast<uint16_t>(V | a | b);
|
|
|
|
if (c == 0xffff) {
|
|
input += 16;
|
|
} else if (c == 0x7fff) {
|
|
input += 15;
|
|
} else {
|
|
is_utf16 = false;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
is_utf16 = false;
|
|
// Check for UTF-32
|
|
if (len % 4 == 0) {
|
|
const char32_t * input = reinterpret_cast<const char32_t*>(buf);
|
|
const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len/4;
|
|
|
|
// Must start checking for surrogates
|
|
__m128i currentoffsetmax = _mm_setzero_si128();
|
|
const __m128i offset = _mm_set1_epi32(0xffff2000);
|
|
const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff);
|
|
|
|
currentmax = _mm_max_epu32(in, currentmax);
|
|
currentmax = _mm_max_epu32(secondin, currentmax);
|
|
currentmax = _mm_max_epu32(thirdin, currentmax);
|
|
currentmax = _mm_max_epu32(fourthin, currentmax);
|
|
|
|
currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax);
|
|
currentoffsetmax = _mm_max_epu32(_mm_add_epi32(secondin, offset), currentoffsetmax);
|
|
currentoffsetmax = _mm_max_epu32(_mm_add_epi32(thirdin, offset), currentoffsetmax);
|
|
currentoffsetmax = _mm_max_epu32(_mm_add_epi32(fourthin, offset), currentoffsetmax);
|
|
|
|
while (input + 4 < end32) {
|
|
const __m128i in32 = _mm_loadu_si128((__m128i *)input);
|
|
currentmax = _mm_max_epu32(in32,currentmax);
|
|
currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in32, offset), currentoffsetmax);
|
|
input += 4;
|
|
}
|
|
|
|
__m128i forbidden_words = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
|
|
if(_mm_testz_si128(forbidden_words, forbidden_words) == 0) {
|
|
is_utf32 = false;
|
|
}
|
|
} else {
|
|
is_utf32 = false;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
// If no surrogate, validate under other encodings as well
|
|
|
|
// UTF-32 validation
|
|
currentmax = _mm_max_epu32(in, currentmax);
|
|
currentmax = _mm_max_epu32(secondin, currentmax);
|
|
currentmax = _mm_max_epu32(thirdin, currentmax);
|
|
currentmax = _mm_max_epu32(fourthin, currentmax);
|
|
|
|
// UTF-8 validation
|
|
// Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h
|
|
simd::simd8x64<uint8_t> in8(in, secondin, thirdin, fourthin);
|
|
check.check_next_input(in8);
|
|
|
|
buf += 64;
|
|
}
|
|
|
|
// Check which encodings are possible
|
|
|
|
if (is_utf8) {
|
|
if (static_cast<size_t>(buf - start) != len) {
|
|
uint8_t block[64]{};
|
|
std::memset(block, 0x20, 64);
|
|
std::memcpy(block, buf, len - (buf - start));
|
|
simd::simd8x64<uint8_t> in(block);
|
|
check.check_next_input(in);
|
|
}
|
|
if (!check.errors()) {
|
|
out |= simdutf::encoding_type::UTF8;
|
|
}
|
|
}
|
|
|
|
if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start))/2)) {
|
|
out |= simdutf::encoding_type::UTF16_LE;
|
|
}
|
|
|
|
if (is_utf32 && (len % 4 == 0)) {
|
|
const __m128i standardmax = _mm_set1_epi32(0x10ffff);
|
|
__m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
|
|
if (_mm_testz_si128(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start))/4)) {
|
|
out |= simdutf::encoding_type::UTF32_LE;
|
|
}
|
|
}
|
|
|
|
return out;
|
|
}
|
|
/* end file src/westmere/sse_detect_encodings.cpp */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_validate_utf16.cpp
|
|
/* begin file src/westmere/sse_validate_utf16.cpp */
|
|
/*
|
|
In UTF-16 words in range 0xD800 to 0xDFFF have special meaning.
|
|
|
|
In a vectorized algorithm we want to examine the most significant
|
|
nibble in order to select a fast path. If none of highest nibbles
|
|
are 0xD (13), then we are sure that the UTF-16 chunk in a vector
|
|
register is valid.
|
|
|
|
Let us analyze what we need to check if the nibble is 0xD. The
|
|
value of the preceding nibble determines what we have:
|
|
|
|
0xd000 .. 0xd7ff - a valid word
|
|
0xd800 .. 0xdbff - low surrogate
|
|
0xdc00 .. 0xdfff - high surrogate
|
|
|
|
Other constraints we have to consider:
|
|
- there must not be two consecutive low surrogates (0xd800 .. 0xdbff)
|
|
- there must not be two consecutive high surrogates (0xdc00 .. 0xdfff)
|
|
- there must not be sole low surrogate nor high surrogate
|
|
|
|
We're going to build three bitmasks based on the 3rd nibble:
|
|
- V = valid word,
|
|
- L = low surrogate (0xd800 .. 0xdbff)
|
|
- H = high surrogate (0xdc00 .. 0xdfff)
|
|
|
|
0 1 2 3 4 5 6 7 <--- word index
|
|
[ V | L | H | L | H | V | V | L ]
|
|
1 0 0 0 0 1 1 0 - V = valid masks
|
|
0 1 0 1 0 0 0 1 - L = low surrogate
|
|
0 0 1 0 1 0 0 0 - H high surrogate
|
|
|
|
|
|
1 0 0 0 0 1 1 0 V = valid masks
|
|
0 1 0 1 0 0 0 0 a = L & (H >> 1)
|
|
0 0 1 0 1 0 0 0 b = a << 1
|
|
1 1 1 1 1 1 1 0 c = V | a | b
|
|
^
|
|
the last bit can be zero, we just consume 7 words
|
|
and recheck this word in the next iteration
|
|
*/
|
|
|
|
/* Returns:
|
|
- pointer to the last unprocessed character (a scalar fallback should check the rest);
|
|
- nullptr if an error was detected.
|
|
*/
|
|
template <endianness big_endian>
|
|
const char16_t* sse_validate_utf16(const char16_t* input, size_t size) {
|
|
const char16_t* end = input + size;
|
|
|
|
const auto v_d8 = simd8<uint8_t>::splat(0xd8);
|
|
const auto v_f8 = simd8<uint8_t>::splat(0xf8);
|
|
const auto v_fc = simd8<uint8_t>::splat(0xfc);
|
|
const auto v_dc = simd8<uint8_t>::splat(0xdc);
|
|
|
|
while (input + simd16<uint16_t>::SIZE * 2 < end) {
|
|
// 0. Load data: since the validation takes into account only higher
|
|
// byte of each word, we compress the two vectors into one which
|
|
// consists only the higher bytes.
|
|
auto in0 = simd16<uint16_t>(input);
|
|
auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
|
|
if (big_endian) {
|
|
in0 = in0.swap_bytes();
|
|
in1 = in1.swap_bytes();
|
|
}
|
|
|
|
const auto t0 = in0.shr<8>();
|
|
const auto t1 = in1.shr<8>();
|
|
|
|
const auto in = simd16<uint16_t>::pack(t0, t1);
|
|
|
|
// 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
|
|
const auto surrogates_wordmask = (in & v_f8) == v_d8;
|
|
const uint16_t surrogates_bitmask = static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
|
|
if (surrogates_bitmask == 0x0000) {
|
|
input += 16;
|
|
} else {
|
|
// 2. We have some surrogates that have to be distinguished:
|
|
// - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
|
|
// - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
|
|
//
|
|
// Fact: high surrogate has 11th bit set (3rd bit in the higher word)
|
|
|
|
// V - non-surrogate words
|
|
// V = not surrogates_wordmask
|
|
const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
|
|
|
|
// H - word-mask for high surrogates: the six highest bits are 0b1101'11
|
|
const auto vH = (in & v_fc) == v_dc;
|
|
const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
|
|
|
|
// L - word mask for low surrogates
|
|
// L = not H and surrogates_wordmask
|
|
const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
|
|
|
|
const uint16_t a = static_cast<uint16_t>(L & (H >> 1)); // A low surrogate must be followed by high one.
|
|
// (A low surrogate placed in the 7th register's word
|
|
// is an exception we handle.)
|
|
const uint16_t b = static_cast<uint16_t>(a << 1); // Just mark that the opinput - startite fact is hold,
|
|
// thanks to that we have only two masks for valid case.
|
|
const uint16_t c = static_cast<uint16_t>(V | a | b); // Combine all the masks into the final one.
|
|
|
|
if (c == 0xffff) {
|
|
// The whole input register contains valid UTF-16, i.e.,
|
|
// either single words or proper surrogate pairs.
|
|
input += 16;
|
|
} else if (c == 0x7fff) {
|
|
// The 15 lower words of the input register contains valid UTF-16.
|
|
// The 15th word may be either a low or high surrogate. It the next
|
|
// iteration we 1) check if the low surrogate is followed by a high
|
|
// one, 2) reject sole high surrogate.
|
|
input += 15;
|
|
} else {
|
|
return nullptr;
|
|
}
|
|
}
|
|
}
|
|
|
|
return input;
|
|
}
|
|
|
|
|
|
template <endianness big_endian>
|
|
const result sse_validate_utf16_with_errors(const char16_t* input, size_t size) {
|
|
const char16_t* start = input;
|
|
const char16_t* end = input + size;
|
|
|
|
const auto v_d8 = simd8<uint8_t>::splat(0xd8);
|
|
const auto v_f8 = simd8<uint8_t>::splat(0xf8);
|
|
const auto v_fc = simd8<uint8_t>::splat(0xfc);
|
|
const auto v_dc = simd8<uint8_t>::splat(0xdc);
|
|
|
|
while (input + simd16<uint16_t>::SIZE * 2 < end) {
|
|
// 0. Load data: since the validation takes into account only higher
|
|
// byte of each word, we compress the two vectors into one which
|
|
// consists only the higher bytes.
|
|
auto in0 = simd16<uint16_t>(input);
|
|
auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
|
|
|
|
if (big_endian) {
|
|
in0 = in0.swap_bytes();
|
|
in1 = in1.swap_bytes();
|
|
}
|
|
|
|
const auto t0 = in0.shr<8>();
|
|
const auto t1 = in1.shr<8>();
|
|
|
|
const auto in = simd16<uint16_t>::pack(t0, t1);
|
|
|
|
// 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
|
|
const auto surrogates_wordmask = (in & v_f8) == v_d8;
|
|
const uint16_t surrogates_bitmask = static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
|
|
if (surrogates_bitmask == 0x0000) {
|
|
input += 16;
|
|
} else {
|
|
// 2. We have some surrogates that have to be distinguished:
|
|
// - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
|
|
// - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
|
|
//
|
|
// Fact: high surrogate has 11th bit set (3rd bit in the higher word)
|
|
|
|
// V - non-surrogate words
|
|
// V = not surrogates_wordmask
|
|
const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
|
|
|
|
// H - word-mask for high surrogates: the six highest bits are 0b1101'11
|
|
const auto vH = (in & v_fc) == v_dc;
|
|
const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
|
|
|
|
// L - word mask for low surrogates
|
|
// L = not H and surrogates_wordmask
|
|
const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
|
|
|
|
const uint16_t a = static_cast<uint16_t>(L & (H >> 1)); // A low surrogate must be followed by high one.
|
|
// (A low surrogate placed in the 7th register's word
|
|
// is an exception we handle.)
|
|
const uint16_t b = static_cast<uint16_t>(a << 1); // Just mark that the opinput - startite fact is hold,
|
|
// thanks to that we have only two masks for valid case.
|
|
const uint16_t c = static_cast<uint16_t>(V | a | b); // Combine all the masks into the final one.
|
|
|
|
if (c == 0xffff) {
|
|
// The whole input register contains valid UTF-16, i.e.,
|
|
// either single words or proper surrogate pairs.
|
|
input += 16;
|
|
} else if (c == 0x7fff) {
|
|
// The 15 lower words of the input register contains valid UTF-16.
|
|
// The 15th word may be either a low or high surrogate. It the next
|
|
// iteration we 1) check if the low surrogate is followed by a high
|
|
// one, 2) reject sole high surrogate.
|
|
input += 15;
|
|
} else {
|
|
return result(error_code::SURROGATE, input - start);
|
|
}
|
|
}
|
|
}
|
|
|
|
return result(error_code::SUCCESS, input - start);
|
|
}
|
|
/* end file src/westmere/sse_validate_utf16.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_validate_utf32le.cpp
|
|
/* begin file src/westmere/sse_validate_utf32le.cpp */
|
|
/* Returns:
|
|
- pointer to the last unprocessed character (a scalar fallback should check the rest);
|
|
- nullptr if an error was detected.
|
|
*/
|
|
const char32_t* sse_validate_utf32le(const char32_t* input, size_t size) {
|
|
const char32_t* end = input + size;
|
|
|
|
const __m128i standardmax = _mm_set1_epi32(0x10ffff);
|
|
const __m128i offset = _mm_set1_epi32(0xffff2000);
|
|
const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff);
|
|
__m128i currentmax = _mm_setzero_si128();
|
|
__m128i currentoffsetmax = _mm_setzero_si128();
|
|
|
|
while (input + 4 < end) {
|
|
const __m128i in = _mm_loadu_si128((__m128i *)input);
|
|
currentmax = _mm_max_epu32(in,currentmax);
|
|
currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax);
|
|
input += 4;
|
|
}
|
|
__m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
|
|
if(_mm_test_all_zeros(is_zero, is_zero) == 0) {
|
|
return nullptr;
|
|
}
|
|
|
|
is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
|
|
if(_mm_test_all_zeros(is_zero, is_zero) == 0) {
|
|
return nullptr;
|
|
}
|
|
|
|
return input;
|
|
}
|
|
|
|
|
|
/**
 * Validates UTF-32 input four values at a time and reports where it stopped.
 *
 * The same two running unsigned maxima as in the pointer-returning variant are
 * kept, but they are inspected after every block so that the failing block can
 * be reported:
 * - the maximum of the raw values must not exceed 0x10ffff;
 * - the maximum of (value + 0xffff2000) modulo 2^32 must not exceed 0xfffff7ff;
 *   the addition moves the surrogate range 0xd800..0xdfff to
 *   0xfffff800..0xffffffff.
 *
 * @param input  first value to validate.
 * @param size   number of values readable from input.
 * @return result(error_code::TOO_LARGE, k) or result(error_code::SURROGATE, k)
 *         where k is the offset of the first 4-value block in which such a value
 *         was seen (the range check is made first); otherwise
 *         result(error_code::SUCCESS, k) where k is the offset at which the
 *         vector loop stopped. The loop runs only while more than four values
 *         remain, so values from offset k onward have not been examined.
 */
const result sse_validate_utf32le_with_errors(const char32_t* input, size_t size) {
    const char32_t* start = input;
    const char32_t* end = input + size;

    const __m128i standardmax = _mm_set1_epi32(0x10ffff);
    const __m128i offset = _mm_set1_epi32(static_cast<int>(0xffff2000));
    const __m128i standardoffsetmax = _mm_set1_epi32(static_cast<int>(0xfffff7ff));
    __m128i currentmax = _mm_setzero_si128();
    __m128i currentoffsetmax = _mm_setzero_si128();

    // Compare the remaining length instead of forming `input + 4`: for inputs
    // shorter than four values (including a null pointer with size 0) that
    // pointer would be out of range, which is undefined behavior.
    while (static_cast<size_t>(end - input) > 4) {
        const __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input));
        currentmax = _mm_max_epu32(in, currentmax);
        currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax);

        // max(x, limit) ^ limit is zero in every lane exactly when x <= limit there.
        __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
        if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
            return result(error_code::TOO_LARGE, input - start);
        }

        is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
        if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
            return result(error_code::SURROGATE, input - start);
        }
        input += 4;
    }

    return result(error_code::SUCCESS, input - start);
}
|
|
/* end file src/westmere/sse_validate_utf32le.cpp */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf8_to_utf16.cpp
|
|
/* begin file src/westmere/sse_convert_utf8_to_utf16.cpp */
|
|
// depends on "tables/utf8_to_utf16_tables.h"
|
|
|
|
|
|
// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the
|
|
// end of the code points. Only the least significant 12 bits of the mask
|
|
// are accessed.
|
|
// It returns how many bytes were consumed (up to 12).
|
|
template <endianness big_endian>
|
|
size_t convert_masked_utf8_to_utf16(const char *input,
|
|
uint64_t utf8_end_of_code_point_mask,
|
|
char16_t *&utf16_output) {
|
|
// we use an approach where we try to process up to 12 input bytes.
|
|
// Why 12 input bytes and not 16? Because we are concerned with the size of
|
|
// the lookup tables. Also 12 is nicely divisible by two and three.
|
|
//
|
|
//
|
|
// Optimization note: our main path below is load-latency dependent. Thus it is maybe
|
|
// beneficial to have fast paths that depend on branch prediction but have less latency.
|
|
// This results in more instructions but, potentially, also higher speeds.
|
|
//
|
|
// We first try a few fast paths.
|
|
const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
const __m128i in = _mm_loadu_si128((__m128i *)input);
|
|
const uint16_t input_utf8_end_of_code_point_mask =
|
|
utf8_end_of_code_point_mask & 0xfff;
|
|
if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
|
|
// We process the data in chunks of 16 bytes.
|
|
__m128i ascii_first = _mm_cvtepu8_epi16(in);
|
|
__m128i ascii_second = _mm_cvtepu8_epi16(_mm_srli_si128(in,8));
|
|
if (big_endian) {
|
|
ascii_first = _mm_shuffle_epi8(ascii_first, swap);
|
|
ascii_second = _mm_shuffle_epi8(ascii_second, swap);
|
|
}
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output), ascii_first);
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + 8), ascii_second);
|
|
utf16_output += 16; // We wrote 16 16-bit characters.
|
|
return 16; // We consumed 16 bytes.
|
|
}
|
|
if(((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) {
|
|
// We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
|
|
// There is probably a more efficient sequence, but the following might do.
|
|
const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
const __m128i perm = _mm_shuffle_epi8(in, sh);
|
|
const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
|
|
const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
|
|
__m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
|
|
if (big_endian) composed = _mm_shuffle_epi8(composed, swap);
|
|
_mm_storeu_si128((__m128i *)utf16_output, composed);
|
|
utf16_output += 8; // We wrote 16 bytes, 8 code points.
|
|
return 16;
|
|
}
|
|
if(input_utf8_end_of_code_point_mask == 0x924) {
|
|
// We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
|
|
// There is probably a more efficient sequence, but the following might do.
|
|
const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
|
|
const __m128i perm = _mm_shuffle_epi8(in, sh);
|
|
const __m128i ascii =
|
|
_mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
|
|
const __m128i middlebyte =
|
|
_mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
|
|
const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
|
|
const __m128i highbyte =
|
|
_mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
|
|
const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
|
|
const __m128i composed =
|
|
_mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
|
|
__m128i composed_repacked = _mm_packus_epi32(composed, composed);
|
|
if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
|
|
_mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
|
|
utf16_output += 4;
|
|
return 12;
|
|
}
|
|
/// We do not have a fast path available, so we fallback.
|
|
|
|
const uint8_t idx =
|
|
tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
|
|
const uint8_t consumed =
|
|
tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
|
|
if (idx < 64) {
|
|
// SIX (6) input code-words
|
|
// this is a relatively easy scenario
|
|
// we process SIX (6) input code-words. The max length in bytes of six code
|
|
// words spanning between 1 and 2 bytes each is 12 bytes. On processors
|
|
// where pdep/pext is fast, we might be able to use a small lookup table.
|
|
const __m128i sh =
|
|
_mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
|
|
const __m128i perm = _mm_shuffle_epi8(in, sh);
|
|
const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
|
|
const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
|
|
__m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
|
|
if (big_endian) composed = _mm_shuffle_epi8(composed, swap);
|
|
_mm_storeu_si128((__m128i *)utf16_output, composed);
|
|
utf16_output += 6; // We wrote 12 bytes, 6 code points.
|
|
} else if (idx < 145) {
|
|
// FOUR (4) input code-words
|
|
const __m128i sh =
|
|
_mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
|
|
const __m128i perm = _mm_shuffle_epi8(in, sh);
|
|
const __m128i ascii =
|
|
_mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
|
|
const __m128i middlebyte =
|
|
_mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
|
|
const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
|
|
const __m128i highbyte =
|
|
_mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
|
|
const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
|
|
const __m128i composed =
|
|
_mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
|
|
__m128i composed_repacked = _mm_packus_epi32(composed, composed);
|
|
if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
|
|
_mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
|
|
utf16_output += 4;
|
|
} else if (idx < 209) {
|
|
// TWO (2) input code-words
|
|
//////////////
|
|
// There might be garbage inputs where a leading byte mascarades as a four-byte
|
|
// leading byte (by being followed by 3 continuation byte), but is not greater than
|
|
// 0xf0. This could trigger a buffer overflow if we only counted leading
|
|
// bytes of the form 0xf0 as generating surrogate pairs, without further UTF-8 validation.
|
|
// Thus we must be careful to ensure that only leading bytes at least as large as 0xf0 generate surrogate pairs.
|
|
// We do as at the cost of an extra mask.
|
|
/////////////
|
|
const __m128i sh =
|
|
_mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
|
|
const __m128i perm = _mm_shuffle_epi8(in, sh);
|
|
const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
|
|
const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
|
|
const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
|
|
__m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
|
|
// correct for spurious high bit
|
|
const __m128i correct =
|
|
_mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
|
|
middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
|
|
const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
|
|
// We deliberately carry the leading four bits in highbyte if they are present,
|
|
// we remove them later when computing hightenbits.
|
|
const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000));
|
|
const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
|
|
// When we need to generate a surrogate pair (leading byte > 0xF0), then
|
|
// the corresponding 32-bit value in 'composed' will be greater than
|
|
// > (0xff00000>>6) or > 0x3c00000. This can be used later to identify the
|
|
// location of the surrogate pairs.
|
|
const __m128i composed =
|
|
_mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
|
|
_mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
|
|
const __m128i composedminus =
|
|
_mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
|
|
const __m128i lowtenbits =
|
|
_mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
|
|
// Notice the 0x3ff mask:
|
|
const __m128i hightenbits = _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff));
|
|
const __m128i lowtenbitsadd =
|
|
_mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
|
|
const __m128i hightenbitsadd =
|
|
_mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
|
|
const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
|
|
__m128i surrogates =
|
|
_mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
|
|
uint32_t basic_buffer[4];
|
|
uint32_t basic_buffer_swap[4];
|
|
if (big_endian) {
|
|
_mm_storeu_si128((__m128i *)basic_buffer_swap, _mm_shuffle_epi8(composed, swap));
|
|
surrogates = _mm_shuffle_epi8(surrogates, swap);
|
|
}
|
|
_mm_storeu_si128((__m128i *)basic_buffer, composed);
|
|
uint32_t surrogate_buffer[4];
|
|
_mm_storeu_si128((__m128i *)surrogate_buffer, surrogates);
|
|
for (size_t i = 0; i < 3; i++) {
|
|
if(basic_buffer[i] > 0x3c00000) {
|
|
utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
|
|
utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
|
|
utf16_output += 2;
|
|
} else {
|
|
utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
|
|
utf16_output++;
|
|
}
|
|
}
|
|
} else {
|
|
// here we know that there is an error but we do not handle errors
|
|
}
|
|
return consumed;
|
|
}
|
|
/* end file src/westmere/sse_convert_utf8_to_utf16.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf8_to_utf32.cpp
|
|
/* begin file src/westmere/sse_convert_utf8_to_utf32.cpp */
|
|
// depends on "tables/utf8_to_utf16_tables.h"
|
|
|
|
|
|
// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
|
|
// end of the code points. Only the least significant 12 bits of the mask
|
|
// are accessed.
|
|
// It returns how many bytes were consumed (up to 12).
|
|
size_t convert_masked_utf8_to_utf32(const char *input,
|
|
uint64_t utf8_end_of_code_point_mask,
|
|
char32_t *&utf32_output) {
|
|
// we use an approach where we try to process up to 12 input bytes.
|
|
// Why 12 input bytes and not 16? Because we are concerned with the size of
|
|
// the lookup tables. Also 12 is nicely divisible by two and three.
|
|
//
|
|
//
|
|
// Optimization note: our main path below is load-latency dependent. Thus it is maybe
|
|
// beneficial to have fast paths that depend on branch prediction but have less latency.
|
|
// This results in more instructions but, potentially, also higher speeds.
|
|
//
|
|
// We first try a few fast paths.
|
|
const __m128i in = _mm_loadu_si128((__m128i *)input);
|
|
const uint16_t input_utf8_end_of_code_point_mask =
|
|
utf8_end_of_code_point_mask & 0xfff;
|
|
if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
|
|
// We process the data in chunks of 16 bytes.
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu8_epi32(in));
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu8_epi32(_mm_srli_si128(in,4)));
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+8), _mm_cvtepu8_epi32(_mm_srli_si128(in,8)));
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+12), _mm_cvtepu8_epi32(_mm_srli_si128(in,12)));
|
|
utf32_output += 16; // We wrote 16 32-bit characters.
|
|
return 16; // We consumed 16 bytes.
|
|
}
|
|
if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
|
|
// We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words.
|
|
// There is probably a more efficient sequence, but the following might do.
|
|
const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
const __m128i perm = _mm_shuffle_epi8(in, sh);
|
|
const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
|
|
const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
|
|
const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(composed));
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(composed,8)));
|
|
utf32_output += 8; // We wrote 32 bytes, 8 code points.
|
|
return 16;
|
|
}
|
|
if(input_utf8_end_of_code_point_mask == 0x924) {
|
|
// We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words.
|
|
// There is probably a more efficient sequence, but the following might do.
|
|
const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
|
|
const __m128i perm = _mm_shuffle_epi8(in, sh);
|
|
const __m128i ascii =
|
|
_mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
|
|
const __m128i middlebyte =
|
|
_mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
|
|
const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
|
|
const __m128i highbyte =
|
|
_mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
|
|
const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
|
|
const __m128i composed =
|
|
_mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
|
|
_mm_storeu_si128((__m128i *)utf32_output, composed);
|
|
utf32_output += 4;
|
|
return 12;
|
|
}
|
|
/// We do not have a fast path available, so we fallback.
|
|
|
|
const uint8_t idx =
|
|
tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
|
|
const uint8_t consumed =
|
|
tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
|
|
if (idx < 64) {
|
|
// SIX (6) input code-words
|
|
// this is a relatively easy scenario
|
|
// we process SIX (6) input code-words. The max length in bytes of six code
|
|
// words spanning between 1 and 2 bytes each is 12 bytes. On processors
|
|
// where pdep/pext is fast, we might be able to use a small lookup table.
|
|
const __m128i sh =
|
|
_mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
|
|
const __m128i perm = _mm_shuffle_epi8(in, sh);
|
|
const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
|
|
const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
|
|
const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(composed));
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(composed,8)));
|
|
utf32_output += 6; // We wrote 12 bytes, 6 code points.
|
|
} else if (idx < 145) {
|
|
// FOUR (4) input code-words
|
|
const __m128i sh =
|
|
_mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
|
|
const __m128i perm = _mm_shuffle_epi8(in, sh);
|
|
const __m128i ascii =
|
|
_mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
|
|
const __m128i middlebyte =
|
|
_mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
|
|
const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
|
|
const __m128i highbyte =
|
|
_mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
|
|
const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
|
|
const __m128i composed =
|
|
_mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
|
|
_mm_storeu_si128((__m128i *)utf32_output, composed);
|
|
utf32_output += 4;
|
|
} else if (idx < 209) {
|
|
// TWO (2) input code-words
|
|
const __m128i sh =
|
|
_mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
|
|
const __m128i perm = _mm_shuffle_epi8(in, sh);
|
|
const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
|
|
const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
|
|
const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
|
|
__m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
|
|
// correct for spurious high bit
|
|
const __m128i correct =
|
|
_mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
|
|
middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
|
|
const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
|
|
const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
|
|
const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
|
|
const __m128i composed =
|
|
_mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
|
|
_mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
|
|
_mm_storeu_si128((__m128i *)utf32_output, composed);
|
|
utf32_output += 3;
|
|
} else {
|
|
// here we know that there is an error but we do not handle errors
|
|
}
|
|
return consumed;
|
|
}
|
|
/* end file src/westmere/sse_convert_utf8_to_utf32.cpp */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf16_to_utf8.cpp
|
|
/* begin file src/westmere/sse_convert_utf16_to_utf8.cpp */
|
|
/*
|
|
The vectorized algorithm works on single SSE register i.e., it
|
|
loads eight 16-bit words.
|
|
|
|
We consider three cases:
|
|
1. an input register contains no surrogates and each value
|
|
is in range 0x0000 .. 0x07ff.
|
|
2. an input register contains no surrogates and values are
|
|
in range 0x0000 .. 0xffff.
|
|
3. an input register contains surrogates --- i.e. codepoints
|
|
can have 16 or 32 bits.
|
|
|
|
Ad 1.
|
|
|
|
When values are less than 0x0800, it means that a 16-bit word
|
|
can be converted into: 1) single UTF8 byte (when it's an ASCII
|
|
char) or 2) two UTF8 bytes.
|
|
|
|
For this case we do only some shuffle to obtain these 2-byte
|
|
codes and finally compress the whole SSE register with a single
|
|
shuffle.
|
|
|
|
We need 256-entry lookup table to get a compression pattern
|
|
and the number of output bytes in the compressed vector register.
|
|
Each entry occupies 17 bytes.
|
|
|
|
Ad 2.
|
|
|
|
When values fit in 16-bit words, but are above 0x07ff, then
|
|
a single word may produce one, two or three UTF8 bytes.
|
|
|
|
We prepare data for all these three cases in two registers.
|
|
The first register contains lower two UTF8 bytes (used in all
|
|
cases), while the second one contains just the third byte for
|
|
the three-UTF8-bytes case.
|
|
|
|
Finally these two registers are interleaved forming eight-element
|
|
array of 32-bit values. The array spans two SSE registers.
|
|
The bytes from the registers are compressed using two shuffles.
|
|
|
|
We need 256-entry lookup table to get a compression pattern
|
|
and the number of output bytes in the compressed vector register.
|
|
Each entry occupies 17 bytes.
|
|
|
|
|
|
To summarize:
|
|
- We need two 256-entry tables that have 8704 bytes in total.
|
|
*/
|
|
|
|
/*
|
|
Returns a pair: the first unprocessed byte from buf and utf8_output
|
|
A scalar routine should carry on the conversion of the tail.
|
|
*/
|
|
template <endianness big_endian>
|
|
std::pair<const char16_t*, char*> sse_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) {
|
|
|
|
const char16_t* end = buf + len;
|
|
|
|
const __m128i v_0000 = _mm_setzero_si128();
|
|
const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
|
|
const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
|
|
const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080);
|
|
const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
|
|
|
|
while (buf + 16 + safety_margin <= end) {
|
|
__m128i in = _mm_loadu_si128((__m128i*)buf);
|
|
if (big_endian) {
|
|
const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
in = _mm_shuffle_epi8(in, swap);
|
|
}
|
|
// a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
|
|
const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
|
|
if(_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
|
|
__m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
|
|
if (big_endian) {
|
|
const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
nextin = _mm_shuffle_epi8(nextin, swap);
|
|
}
|
|
if(!_mm_testz_si128(nextin, v_ff80)) {
|
|
// 1. pack the bytes
|
|
// obviously suboptimal.
|
|
const __m128i utf8_packed = _mm_packus_epi16(in,in);
|
|
// 2. store (16 bytes)
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
|
|
// 3. adjust pointers
|
|
buf += 8;
|
|
utf8_output += 8;
|
|
in = nextin;
|
|
} else {
|
|
// 1. pack the bytes
|
|
// obviously suboptimal.
|
|
const __m128i utf8_packed = _mm_packus_epi16(in,nextin);
|
|
// 2. store (16 bytes)
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
|
|
// 3. adjust pointers
|
|
buf += 16;
|
|
utf8_output += 16;
|
|
continue; // we are done for this round!
|
|
}
|
|
}
|
|
|
|
// no bits set above 7th bit
|
|
const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
|
|
const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
|
|
|
|
// no bits set above 11th bit
|
|
const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
|
|
const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
|
|
|
|
if (one_or_two_bytes_bitmask == 0xffff) {
|
|
// 1. prepare 2-byte values
|
|
// input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
|
|
// expected output : [110a|aaaa|10bb|bbbb] x 8
|
|
const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
|
|
const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
|
|
|
|
// t0 = [000a|aaaa|bbbb|bb00]
|
|
const __m128i t0 = _mm_slli_epi16(in, 2);
|
|
// t1 = [000a|aaaa|0000|0000]
|
|
const __m128i t1 = _mm_and_si128(t0, v_1f00);
|
|
// t2 = [0000|0000|00bb|bbbb]
|
|
const __m128i t2 = _mm_and_si128(in, v_003f);
|
|
// t3 = [000a|aaaa|00bb|bbbb]
|
|
const __m128i t3 = _mm_or_si128(t1, t2);
|
|
// t4 = [110a|aaaa|10bb|bbbb]
|
|
const __m128i t4 = _mm_or_si128(t3, v_c080);
|
|
|
|
// 2. merge ASCII and 2-byte codewords
|
|
const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in, one_byte_bytemask);
|
|
|
|
// 3. prepare bitmask for 8-bit lookup
|
|
// one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
|
|
const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
|
|
const uint16_t m1 = static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
|
|
const uint8_t m2 = static_cast<uint8_t>((m0 | m1) & 0xff); // m2 = hdgcfbea
|
|
// 4. pack the bytes
|
|
const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
|
|
const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
|
|
const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
|
|
|
|
// 5. store bytes
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
|
|
|
|
// 6. adjust pointers
|
|
buf += 8;
|
|
utf8_output += row[0];
|
|
continue;
|
|
|
|
}
|
|
|
|
// 1. Check if there are any surrogate word in the input chunk.
|
|
// We have also deal with situation when there is a surrogate word
|
|
// at the end of a chunk.
|
|
const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
|
|
|
|
// bitmask = 0x0000 if there are no surrogates
|
|
// = 0xc000 if the last word is a surrogate
|
|
const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
|
|
// It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
|
|
// it is likely an uncommon occurrence.
|
|
if (surrogates_bitmask == 0x0000) {
|
|
// case: words from register produce either 1, 2 or 3 UTF-8 bytes
|
|
const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
|
|
|
|
/* In this branch we handle three cases:
|
|
1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte
|
|
2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes
|
|
3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
|
|
|
|
We expand the input word (16-bit) into two words (32-bit), thus
|
|
we have room for four bytes. However, we need five distinct bit
|
|
layouts. Note that the last byte in cases #2 and #3 is the same.
|
|
|
|
We precompute byte 1 for case #1 and the common byte for cases #2 & #3
|
|
in register t2.
|
|
|
|
We precompute byte 1 for case #3 and -- **conditionally** -- precompute
|
|
either byte 1 for case #2 or byte 2 for case #3. Note that they
|
|
differ by exactly one bit.
|
|
|
|
Finally from these two words we build proper UTF-8 sequence, taking
|
|
into account the case (i.e, the number of bytes to write).
|
|
*/
|
|
/**
|
|
* Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
|
|
* t2 => [0ccc|cccc] [10cc|cccc]
|
|
* s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
|
|
*/
|
|
#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
|
|
// [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
|
|
const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
|
|
// [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
|
|
const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
|
|
// [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
|
|
const __m128i t2 = _mm_or_si128 (t1, simdutf_vec(0b1000000000000000));
|
|
|
|
// [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
|
|
const __m128i s0 = _mm_srli_epi16(in, 4);
|
|
// [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
|
|
const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
|
|
// [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
|
|
const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
|
|
// [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
|
|
const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
|
|
const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
|
|
const __m128i s4 = _mm_xor_si128(s3, m0);
|
|
#undef simdutf_vec
|
|
|
|
// 4. expand words 16-bit => 32-bit
|
|
const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
|
|
const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
|
|
|
|
// 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
|
|
const uint16_t mask = (one_byte_bitmask & 0x5555) |
|
|
(one_or_two_bytes_bitmask & 0xaaaa);
|
|
if(mask == 0) {
|
|
// We only have three-byte words. Use fast path.
|
|
const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
|
|
const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
|
|
const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_0);
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_1);
|
|
utf8_output += 12;
|
|
buf += 8;
|
|
continue;
|
|
}
|
|
const uint8_t mask0 = uint8_t(mask);
|
|
|
|
const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
|
|
const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
|
|
const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
|
|
|
|
const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
|
|
|
|
const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
|
|
const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
|
|
const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
|
|
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_0);
|
|
utf8_output += row0[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_1);
|
|
utf8_output += row1[0];
|
|
|
|
buf += 8;
|
|
// surrogate pair(s) in a register
|
|
} else {
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// in the presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
|
|
if((word & 0xFF80)==0) {
|
|
*utf8_output++ = char(word);
|
|
} else if((word & 0xF800)==0) {
|
|
*utf8_output++ = char((word>>6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else if((word &0xF800 ) != 0xD800) {
|
|
*utf8_output++ = char((word>>12) | 0b11100000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else {
|
|
// must be a surrogate pair
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
|
|
k++;
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, utf8_output); }
|
|
uint32_t value = (diff << 10) + diff2 + 0x10000;
|
|
*utf8_output++ = char((value>>18) | 0b11110000);
|
|
*utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((value & 0b111111) | 0b10000000);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
|
|
return std::make_pair(buf, utf8_output);
|
|
}
|
|
|
|
|
|
/*
|
|
Returns a pair: a result struct and utf8_output.
|
|
If there is an error, the count field of the result is the position of the error.
|
|
Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
|
|
A scalar routing should carry on the conversion of the tail if needed.
|
|
*/
|
|
template <endianness big_endian>
|
|
std::pair<result, char*> sse_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) {
|
|
const char16_t* start = buf;
|
|
const char16_t* end = buf + len;
|
|
|
|
const __m128i v_0000 = _mm_setzero_si128();
|
|
const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
|
|
const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
|
|
const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080);
|
|
const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
|
|
|
|
while (buf + 16 + safety_margin <= end) {
|
|
__m128i in = _mm_loadu_si128((__m128i*)buf);
|
|
if (big_endian) {
|
|
const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
in = _mm_shuffle_epi8(in, swap);
|
|
}
|
|
// a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
|
|
const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
|
|
if(_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
|
|
__m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
|
|
if (big_endian) {
|
|
const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
nextin = _mm_shuffle_epi8(nextin, swap);
|
|
}
|
|
if(!_mm_testz_si128(nextin, v_ff80)) {
|
|
// 1. pack the bytes
|
|
// obviously suboptimal.
|
|
const __m128i utf8_packed = _mm_packus_epi16(in,in);
|
|
// 2. store (16 bytes)
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
|
|
// 3. adjust pointers
|
|
buf += 8;
|
|
utf8_output += 8;
|
|
in = nextin;
|
|
} else {
|
|
// 1. pack the bytes
|
|
// obviously suboptimal.
|
|
const __m128i utf8_packed = _mm_packus_epi16(in,nextin);
|
|
// 2. store (16 bytes)
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
|
|
// 3. adjust pointers
|
|
buf += 16;
|
|
utf8_output += 16;
|
|
continue; // we are done for this round!
|
|
}
|
|
}
|
|
|
|
// no bits set above 7th bit
|
|
const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
|
|
const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
|
|
|
|
// no bits set above 11th bit
|
|
const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
|
|
const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
|
|
|
|
if (one_or_two_bytes_bitmask == 0xffff) {
|
|
// 1. prepare 2-byte values
|
|
// input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
|
|
// expected output : [110a|aaaa|10bb|bbbb] x 8
|
|
const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
|
|
const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
|
|
|
|
// t0 = [000a|aaaa|bbbb|bb00]
|
|
const __m128i t0 = _mm_slli_epi16(in, 2);
|
|
// t1 = [000a|aaaa|0000|0000]
|
|
const __m128i t1 = _mm_and_si128(t0, v_1f00);
|
|
// t2 = [0000|0000|00bb|bbbb]
|
|
const __m128i t2 = _mm_and_si128(in, v_003f);
|
|
// t3 = [000a|aaaa|00bb|bbbb]
|
|
const __m128i t3 = _mm_or_si128(t1, t2);
|
|
// t4 = [110a|aaaa|10bb|bbbb]
|
|
const __m128i t4 = _mm_or_si128(t3, v_c080);
|
|
|
|
// 2. merge ASCII and 2-byte codewords
|
|
const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in, one_byte_bytemask);
|
|
|
|
// 3. prepare bitmask for 8-bit lookup
|
|
// one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
|
|
const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
|
|
const uint16_t m1 = static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
|
|
const uint8_t m2 = static_cast<uint8_t>((m0 | m1) & 0xff); // m2 = hdgcfbea
|
|
// 4. pack the bytes
|
|
const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
|
|
const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
|
|
const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
|
|
|
|
// 5. store bytes
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
|
|
|
|
// 6. adjust pointers
|
|
buf += 8;
|
|
utf8_output += row[0];
|
|
continue;
|
|
|
|
}
|
|
|
|
// 1. Check if there are any surrogate word in the input chunk.
|
|
// We have also deal with situation when there is a surrogate word
|
|
// at the end of a chunk.
|
|
const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
|
|
|
|
// bitmask = 0x0000 if there are no surrogates
|
|
// = 0xc000 if the last word is a surrogate
|
|
const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
|
|
// It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
|
|
// it is likely an uncommon occurrence.
|
|
if (surrogates_bitmask == 0x0000) {
|
|
// case: words from register produce either 1, 2 or 3 UTF-8 bytes
|
|
const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
|
|
|
|
/* In this branch we handle three cases:
|
|
1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte
|
|
2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes
|
|
3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
|
|
|
|
We expand the input word (16-bit) into two words (32-bit), thus
|
|
we have room for four bytes. However, we need five distinct bit
|
|
layouts. Note that the last byte in cases #2 and #3 is the same.
|
|
|
|
We precompute byte 1 for case #1 and the common byte for cases #2 & #3
|
|
in register t2.
|
|
|
|
We precompute byte 1 for case #3 and -- **conditionally** -- precompute
|
|
either byte 1 for case #2 or byte 2 for case #3. Note that they
|
|
differ by exactly one bit.
|
|
|
|
Finally from these two words we build proper UTF-8 sequence, taking
|
|
into account the case (i.e, the number of bytes to write).
|
|
*/
|
|
/**
|
|
* Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
|
|
* t2 => [0ccc|cccc] [10cc|cccc]
|
|
* s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
|
|
*/
|
|
#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
|
|
// [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
|
|
const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
|
|
// [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
|
|
const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
|
|
// [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
|
|
const __m128i t2 = _mm_or_si128 (t1, simdutf_vec(0b1000000000000000));
|
|
|
|
// [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
|
|
const __m128i s0 = _mm_srli_epi16(in, 4);
|
|
// [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
|
|
const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
|
|
// [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
|
|
const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
|
|
// [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
|
|
const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
|
|
const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
|
|
const __m128i s4 = _mm_xor_si128(s3, m0);
|
|
#undef simdutf_vec
|
|
|
|
// 4. expand words 16-bit => 32-bit
|
|
const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
|
|
const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
|
|
|
|
// 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
|
|
const uint16_t mask = (one_byte_bitmask & 0x5555) |
|
|
(one_or_two_bytes_bitmask & 0xaaaa);
|
|
if(mask == 0) {
|
|
// We only have three-byte words. Use fast path.
|
|
const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
|
|
const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
|
|
const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_0);
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_1);
|
|
utf8_output += 12;
|
|
buf += 8;
|
|
continue;
|
|
}
|
|
const uint8_t mask0 = uint8_t(mask);
|
|
|
|
const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
|
|
const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
|
|
const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
|
|
|
|
const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
|
|
|
|
const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
|
|
const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
|
|
const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
|
|
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_0);
|
|
utf8_output += row0[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_1);
|
|
utf8_output += row1[0];
|
|
|
|
buf += 8;
|
|
// surrogate pair(s) in a register
|
|
} else {
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// in the presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
|
|
if((word & 0xFF80)==0) {
|
|
*utf8_output++ = char(word);
|
|
} else if((word & 0xF800)==0) {
|
|
*utf8_output++ = char((word>>6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else if((word &0xF800 ) != 0xD800) {
|
|
*utf8_output++ = char((word>>12) | 0b11100000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else {
|
|
// must be a surrogate pair
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
|
|
k++;
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf8_output); }
|
|
uint32_t value = (diff << 10) + diff2 + 0x10000;
|
|
*utf8_output++ = char((value>>18) | 0b11110000);
|
|
*utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((value & 0b111111) | 0b10000000);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
|
|
return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
|
|
}
|
|
/* end file src/westmere/sse_convert_utf16_to_utf8.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf16_to_utf32.cpp
|
|
/* begin file src/westmere/sse_convert_utf16_to_utf32.cpp */
|
|
/*
|
|
The vectorized algorithm works on single SSE register i.e., it
|
|
loads eight 16-bit words.
|
|
|
|
We consider three cases:
|
|
1. an input register contains no surrogates and each value
|
|
is in range 0x0000 .. 0x07ff.
|
|
  2. an input register contains no surrogates and values are
     in range 0x0000 .. 0xffff.
|
|
3. an input register contains surrogates --- i.e. codepoints
|
|
can have 16 or 32 bits.
|
|
|
|
Ad 1.
|
|
|
|
  When values are less than 0x0800, it means that a 16-bit word
|
|
can be converted into: 1) single UTF8 byte (when it's an ASCII
|
|
char) or 2) two UTF8 bytes.
|
|
|
|
For this case we do only some shuffle to obtain these 2-byte
|
|
codes and finally compress the whole SSE register with a single
|
|
shuffle.
|
|
|
|
We need 256-entry lookup table to get a compression pattern
|
|
and the number of output bytes in the compressed vector register.
|
|
Each entry occupies 17 bytes.
|
|
|
|
Ad 2.
|
|
|
|
When values fit in 16-bit words, but are above 0x07ff, then
|
|
a single word may produce one, two or three UTF8 bytes.
|
|
|
|
We prepare data for all these three cases in two registers.
|
|
The first register contains lower two UTF8 bytes (used in all
|
|
cases), while the second one contains just the third byte for
|
|
the three-UTF8-bytes case.
|
|
|
|
Finally these two registers are interleaved forming eight-element
|
|
array of 32-bit values. The array spans two SSE registers.
|
|
The bytes from the registers are compressed using two shuffles.
|
|
|
|
We need 256-entry lookup table to get a compression pattern
|
|
and the number of output bytes in the compressed vector register.
|
|
Each entry occupies 17 bytes.
|
|
|
|
|
|
To summarize:
|
|
- We need two 256-entry tables that have 8704 bytes in total.
|
|
*/
|
|
|
|
/*
|
|
Returns a pair: the first unprocessed byte from buf and utf8_output
|
|
A scalar routing should carry on the conversion of the tail.
|
|
*/
|
|
template <endianness big_endian>
|
|
std::pair<const char16_t*, char32_t*> sse_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) {
|
|
const char16_t* end = buf + len;
|
|
|
|
const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
|
|
const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
|
|
|
|
while (buf + 16 <= end) {
|
|
__m128i in = _mm_loadu_si128((__m128i*)buf);
|
|
|
|
if (big_endian) {
|
|
const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
in = _mm_shuffle_epi8(in, swap);
|
|
}
|
|
|
|
// 1. Check if there are any surrogate word in the input chunk.
|
|
// We have also deal with situation when there is a surrogate word
|
|
// at the end of a chunk.
|
|
const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
|
|
|
|
// bitmask = 0x0000 if there are no surrogates
|
|
// = 0xc000 if the last word is a surrogate
|
|
const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
|
|
// It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
|
|
// it is likely an uncommon occurrence.
|
|
if (surrogates_bitmask == 0x0000) {
|
|
// case: no surrogate pair, extend 16-bit words to 32-bit words
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(in));
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(in,8)));
|
|
utf32_output += 8;
|
|
buf += 8;
|
|
// surrogate pair(s) in a register
|
|
} else {
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// in the presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
|
|
if((word &0xF800 ) != 0xD800) {
|
|
*utf32_output++ = char32_t(word);
|
|
} else {
|
|
// must be a surrogate pair
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
|
|
k++;
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, utf32_output); }
|
|
uint32_t value = (diff << 10) + diff2 + 0x10000;
|
|
*utf32_output++ = char32_t(value);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
return std::make_pair(buf, utf32_output);
|
|
}
|
|
|
|
|
|
/*
|
|
Returns a pair: a result struct and utf8_output.
|
|
If there is an error, the count field of the result is the position of the error.
|
|
Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
|
|
A scalar routing should carry on the conversion of the tail if needed.
|
|
*/
|
|
template <endianness big_endian>
|
|
std::pair<result, char32_t*> sse_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) {
|
|
const char16_t* start = buf;
|
|
const char16_t* end = buf + len;
|
|
|
|
const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
|
|
const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
|
|
|
|
while (buf + 16 <= end) {
|
|
__m128i in = _mm_loadu_si128((__m128i*)buf);
|
|
|
|
if (big_endian) {
|
|
const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
in = _mm_shuffle_epi8(in, swap);
|
|
}
|
|
|
|
// 1. Check if there are any surrogate word in the input chunk.
|
|
// We have also deal with situation when there is a surrogate word
|
|
// at the end of a chunk.
|
|
const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
|
|
|
|
// bitmask = 0x0000 if there are no surrogates
|
|
// = 0xc000 if the last word is a surrogate
|
|
const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
|
|
// It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
|
|
// it is likely an uncommon occurrence.
|
|
if (surrogates_bitmask == 0x0000) {
|
|
// case: no surrogate pair, extend 16-bit words to 32-bit words
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(in));
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(in,8)));
|
|
utf32_output += 8;
|
|
buf += 8;
|
|
// surrogate pair(s) in a register
|
|
} else {
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// in the presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
|
|
if((word &0xF800 ) != 0xD800) {
|
|
*utf32_output++ = char32_t(word);
|
|
} else {
|
|
// must be a surrogate pair
|
|
uint16_t diff = uint16_t(word - 0xD800);
|
|
uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
|
|
k++;
|
|
uint16_t diff2 = uint16_t(next_word - 0xDC00);
|
|
if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf32_output); }
|
|
uint32_t value = (diff << 10) + diff2 + 0x10000;
|
|
*utf32_output++ = char32_t(value);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
|
|
}
|
|
/* end file src/westmere/sse_convert_utf16_to_utf32.cpp */
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf32_to_utf8.cpp
|
|
/* begin file src/westmere/sse_convert_utf32_to_utf8.cpp */
|
|
std::pair<const char32_t*, char*> sse_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) {
|
|
const char32_t* end = buf + len;
|
|
|
|
const __m128i v_0000 = _mm_setzero_si128();
|
|
const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
|
|
const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080);
|
|
const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);
|
|
const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
|
|
const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff);
|
|
__m128i running_max = _mm_setzero_si128();
|
|
__m128i forbidden_bytemask = _mm_setzero_si128();
|
|
const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
|
|
|
|
while (buf + 16 + safety_margin <= end) {
|
|
// We load two 16 bytes registers for a total of 32 bytes or 16 characters.
|
|
__m128i in = _mm_loadu_si128((__m128i*)buf);
|
|
__m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
|
|
running_max = _mm_max_epu32(_mm_max_epu32(in, running_max), nextin);
|
|
|
|
// Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
|
|
__m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff), _mm_and_si128(nextin, v_7fffffff));
|
|
|
|
// Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
|
|
|
|
// Check for ASCII fast path
|
|
if(_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!!
|
|
// We eagerly load another 32 bytes, hoping that they will be ASCII too.
|
|
// The intuition is that we try to collect 16 ASCII characters which requires
|
|
// a total of 64 bytes of input. If we fail, we just pass thirdin and fourthin
|
|
// as our new inputs.
|
|
__m128i thirdin = _mm_loadu_si128((__m128i*)buf+2);
|
|
__m128i fourthin = _mm_loadu_si128((__m128i*)buf+3);
|
|
running_max = _mm_max_epu32(_mm_max_epu32(thirdin, running_max), fourthin);
|
|
__m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff));
|
|
if(!_mm_testz_si128(nextin_16, v_ff80)) {
|
|
// 1. pack the bytes
|
|
// obviously suboptimal.
|
|
const __m128i utf8_packed = _mm_packus_epi16(in_16,in_16);
|
|
// 2. store (16 bytes)
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
|
|
// 3. adjust pointers
|
|
buf += 8;
|
|
utf8_output += 8;
|
|
// Proceed with next input
|
|
in_16 = nextin_16;
|
|
// We need to update in and nextin because they are used later.
|
|
in = thirdin;
|
|
nextin = fourthin;
|
|
} else {
|
|
// 1. pack the bytes
|
|
const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
|
|
// 2. store (16 bytes)
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
|
|
// 3. adjust pointers
|
|
buf += 16;
|
|
utf8_output += 16;
|
|
continue; // we are done for this round!
|
|
}
|
|
}
|
|
|
|
// no bits set above 7th bit
|
|
const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000);
|
|
const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
|
|
|
|
// no bits set above 11th bit
|
|
const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
|
|
const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
|
|
|
|
if (one_or_two_bytes_bitmask == 0xffff) {
|
|
// case: all words either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes)
|
|
// 1. prepare 2-byte values
|
|
// input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
|
|
// expected output : [110a|aaaa|10bb|bbbb] x 8
|
|
const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
|
|
const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
|
|
|
|
// t0 = [000a|aaaa|bbbb|bb00]
|
|
const __m128i t0 = _mm_slli_epi16(in_16, 2);
|
|
// t1 = [000a|aaaa|0000|0000]
|
|
const __m128i t1 = _mm_and_si128(t0, v_1f00);
|
|
// t2 = [0000|0000|00bb|bbbb]
|
|
const __m128i t2 = _mm_and_si128(in_16, v_003f);
|
|
// t3 = [000a|aaaa|00bb|bbbb]
|
|
const __m128i t3 = _mm_or_si128(t1, t2);
|
|
// t4 = [110a|aaaa|10bb|bbbb]
|
|
const __m128i t4 = _mm_or_si128(t3, v_c080);
|
|
|
|
// 2. merge ASCII and 2-byte codewords
|
|
const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
|
|
|
|
// 3. prepare bitmask for 8-bit lookup
|
|
// one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
|
|
const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
|
|
const uint16_t m1 = static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
|
|
const uint8_t m2 = static_cast<uint8_t>((m0 | m1) & 0xff); // m2 = hdgcfbea
|
|
// 4. pack the bytes
|
|
const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
|
|
const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
|
|
const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
|
|
|
|
// 5. store bytes
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
|
|
|
|
// 6. adjust pointers
|
|
buf += 8;
|
|
utf8_output += row[0];
|
|
continue;
|
|
}
|
|
|
|
// Check for overflow in packing
|
|
|
|
const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
|
|
const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
|
|
if (saturation_bitmask == 0xffff) {
|
|
// case: words from register produce either 1, 2 or 3 UTF-8 bytes
|
|
const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
|
|
forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800));
|
|
|
|
const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
|
|
|
|
/* In this branch we handle three cases:
|
|
1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte
|
|
2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes
|
|
3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
|
|
|
|
We expand the input word (16-bit) into two words (32-bit), thus
|
|
we have room for four bytes. However, we need five distinct bit
|
|
layouts. Note that the last byte in cases #2 and #3 is the same.
|
|
|
|
We precompute byte 1 for case #1 and the common byte for cases #2 & #3
|
|
in register t2.
|
|
|
|
We precompute byte 1 for case #3 and -- **conditionally** -- precompute
|
|
either byte 1 for case #2 or byte 2 for case #3. Note that they
|
|
differ by exactly one bit.
|
|
|
|
Finally from these two words we build proper UTF-8 sequence, taking
|
|
into account the case (i.e, the number of bytes to write).
|
|
*/
|
|
/**
|
|
* Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
|
|
* t2 => [0ccc|cccc] [10cc|cccc]
|
|
* s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
|
|
*/
|
|
#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
|
|
// [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
|
|
const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
|
|
// [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
|
|
const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
|
|
// [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
|
|
const __m128i t2 = _mm_or_si128 (t1, simdutf_vec(0b1000000000000000));
|
|
|
|
// [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
|
|
const __m128i s0 = _mm_srli_epi16(in_16, 4);
|
|
// [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
|
|
const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
|
|
// [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
|
|
const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
|
|
// [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
|
|
const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
|
|
const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
|
|
const __m128i s4 = _mm_xor_si128(s3, m0);
|
|
#undef simdutf_vec
|
|
|
|
// 4. expand words 16-bit => 32-bit
|
|
const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
|
|
const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
|
|
|
|
// 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
|
|
const uint16_t mask = (one_byte_bitmask & 0x5555) |
|
|
(one_or_two_bytes_bitmask & 0xaaaa);
|
|
if(mask == 0) {
|
|
// We only have three-byte words. Use fast path.
|
|
const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
|
|
const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
|
|
const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_0);
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_1);
|
|
utf8_output += 12;
|
|
buf += 8;
|
|
continue;
|
|
}
|
|
const uint8_t mask0 = uint8_t(mask);
|
|
|
|
const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
|
|
const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
|
|
const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
|
|
|
|
const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
|
|
|
|
const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
|
|
const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
|
|
const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
|
|
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_0);
|
|
utf8_output += row0[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_1);
|
|
utf8_output += row1[0];
|
|
|
|
buf += 8;
|
|
} else {
|
|
// case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// in the presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint32_t word = buf[k];
|
|
if((word & 0xFFFFFF80)==0) {
|
|
*utf8_output++ = char(word);
|
|
} else if((word & 0xFFFFF800)==0) {
|
|
*utf8_output++ = char((word>>6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else if((word &0xFFFF0000 )==0) {
|
|
if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); }
|
|
*utf8_output++ = char((word>>12) | 0b11100000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else {
|
|
if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); }
|
|
*utf8_output++ = char((word>>18) | 0b11110000);
|
|
*utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
|
|
// check for invalid input
|
|
const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
|
|
if(static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(_mm_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffff) {
|
|
return std::make_pair(nullptr, utf8_output);
|
|
}
|
|
|
|
if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); }
|
|
|
|
return std::make_pair(buf, utf8_output);
|
|
}
|
|
|
|
|
|
std::pair<result, char*> sse_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
|
|
|
|
const char32_t* end = buf + len;
|
|
const char32_t* start = buf;
|
|
|
|
const __m128i v_0000 = _mm_setzero_si128();
|
|
const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
|
|
const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080);
|
|
const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);
|
|
const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
|
|
const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff);
|
|
const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
|
|
|
|
const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
|
|
|
|
while (buf + 16 + safety_margin <= end) {
|
|
// We load two 16 bytes registers for a total of 32 bytes or 16 characters.
|
|
__m128i in = _mm_loadu_si128((__m128i*)buf);
|
|
__m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
|
|
|
|
// Check for too large input
|
|
__m128i max_input = _mm_max_epu32(_mm_max_epu32(in, nextin), v_10ffff);
|
|
if(static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(max_input, v_10ffff))) != 0xffff) {
|
|
return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
|
|
}
|
|
|
|
// Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
|
|
__m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff), _mm_and_si128(nextin, v_7fffffff));
|
|
|
|
// Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
|
|
|
|
// Check for ASCII fast path
|
|
if(_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!!
|
|
// We eagerly load another 32 bytes, hoping that they will be ASCII too.
|
|
// The intuition is that we try to collect 16 ASCII characters which requires
|
|
// a total of 64 bytes of input. If we fail, we just pass thirdin and fourthin
|
|
// as our new inputs.
|
|
__m128i thirdin = _mm_loadu_si128((__m128i*)buf+2);
|
|
__m128i fourthin = _mm_loadu_si128((__m128i*)buf+3);
|
|
__m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff));
|
|
if(!_mm_testz_si128(nextin_16, v_ff80)) {
|
|
// 1. pack the bytes
|
|
// obviously suboptimal.
|
|
const __m128i utf8_packed = _mm_packus_epi16(in_16,in_16);
|
|
// 2. store (16 bytes)
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
|
|
// 3. adjust pointers
|
|
buf += 8;
|
|
utf8_output += 8;
|
|
// Proceed with next input
|
|
in_16 = nextin_16;
|
|
__m128i next_max_input = _mm_max_epu32(_mm_max_epu32(thirdin, fourthin), v_10ffff);
|
|
if(static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(next_max_input, v_10ffff))) != 0xffff) {
|
|
return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
|
|
}
|
|
// We need to update in and nextin because they are used later.
|
|
in = thirdin;
|
|
nextin = fourthin;
|
|
} else {
|
|
// 1. pack the bytes
|
|
const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
|
|
// 2. store (16 bytes)
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
|
|
// 3. adjust pointers
|
|
buf += 16;
|
|
utf8_output += 16;
|
|
continue; // we are done for this round!
|
|
}
|
|
}
|
|
|
|
// no bits set above 7th bit
|
|
const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000);
|
|
const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
|
|
|
|
// no bits set above 11th bit
|
|
const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
|
|
const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
|
|
|
|
if (one_or_two_bytes_bitmask == 0xffff) {
|
|
// case: all words either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes)
|
|
// 1. prepare 2-byte values
|
|
// input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
|
|
// expected output : [110a|aaaa|10bb|bbbb] x 8
|
|
const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
|
|
const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
|
|
|
|
// t0 = [000a|aaaa|bbbb|bb00]
|
|
const __m128i t0 = _mm_slli_epi16(in_16, 2);
|
|
// t1 = [000a|aaaa|0000|0000]
|
|
const __m128i t1 = _mm_and_si128(t0, v_1f00);
|
|
// t2 = [0000|0000|00bb|bbbb]
|
|
const __m128i t2 = _mm_and_si128(in_16, v_003f);
|
|
// t3 = [000a|aaaa|00bb|bbbb]
|
|
const __m128i t3 = _mm_or_si128(t1, t2);
|
|
// t4 = [110a|aaaa|10bb|bbbb]
|
|
const __m128i t4 = _mm_or_si128(t3, v_c080);
|
|
|
|
// 2. merge ASCII and 2-byte codewords
|
|
const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
|
|
|
|
// 3. prepare bitmask for 8-bit lookup
|
|
// one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
|
|
const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
|
|
const uint16_t m1 = static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
|
|
const uint8_t m2 = static_cast<uint8_t>((m0 | m1) & 0xff); // m2 = hdgcfbea
|
|
// 4. pack the bytes
|
|
const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
|
|
const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
|
|
const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
|
|
|
|
// 5. store bytes
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
|
|
|
|
// 6. adjust pointers
|
|
buf += 8;
|
|
utf8_output += row[0];
|
|
continue;
|
|
}
|
|
|
|
|
|
// Check for overflow in packing
|
|
const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
|
|
const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
|
|
|
|
if (saturation_bitmask == 0xffff) {
|
|
// case: words from register produce either 1, 2 or 3 UTF-8 bytes
|
|
|
|
// Check for illegal surrogate words
|
|
const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
|
|
const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800);
|
|
if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
|
|
return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
|
|
}
|
|
|
|
const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
|
|
0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
|
|
|
|
/* In this branch we handle three cases:
|
|
1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte
|
|
2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes
|
|
3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
|
|
|
|
We expand the input word (16-bit) into two words (32-bit), thus
|
|
we have room for four bytes. However, we need five distinct bit
|
|
layouts. Note that the last byte in cases #2 and #3 is the same.
|
|
|
|
We precompute byte 1 for case #1 and the common byte for cases #2 & #3
|
|
in register t2.
|
|
|
|
We precompute byte 1 for case #3 and -- **conditionally** -- precompute
|
|
either byte 1 for case #2 or byte 2 for case #3. Note that they
|
|
differ by exactly one bit.
|
|
|
|
Finally from these two words we build proper UTF-8 sequence, taking
|
|
into account the case (i.e, the number of bytes to write).
|
|
*/
|
|
/**
|
|
* Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
|
|
* t2 => [0ccc|cccc] [10cc|cccc]
|
|
* s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
|
|
*/
|
|
#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
|
|
// [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
|
|
const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
|
|
// [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
|
|
const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
|
|
// [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
|
|
const __m128i t2 = _mm_or_si128 (t1, simdutf_vec(0b1000000000000000));
|
|
|
|
// [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
|
|
const __m128i s0 = _mm_srli_epi16(in_16, 4);
|
|
// [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
|
|
const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
|
|
// [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
|
|
const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
|
|
// [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
|
|
const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
|
|
const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
|
|
const __m128i s4 = _mm_xor_si128(s3, m0);
|
|
#undef simdutf_vec
|
|
|
|
// 4. expand words 16-bit => 32-bit
|
|
const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
|
|
const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
|
|
|
|
// 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
|
|
const uint16_t mask = (one_byte_bitmask & 0x5555) |
|
|
(one_or_two_bytes_bitmask & 0xaaaa);
|
|
if(mask == 0) {
|
|
// We only have three-byte words. Use fast path.
|
|
const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
|
|
const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
|
|
const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_0);
|
|
utf8_output += 12;
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_1);
|
|
utf8_output += 12;
|
|
buf += 8;
|
|
continue;
|
|
}
|
|
const uint8_t mask0 = uint8_t(mask);
|
|
|
|
const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
|
|
const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
|
|
const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
|
|
|
|
const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
|
|
|
|
const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
|
|
const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
|
|
const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
|
|
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_0);
|
|
utf8_output += row0[0];
|
|
_mm_storeu_si128((__m128i*)utf8_output, utf8_1);
|
|
utf8_output += row1[0];
|
|
|
|
buf += 8;
|
|
} else {
|
|
// case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes
|
|
// Let us do a scalar fallback.
|
|
// It may seem wasteful to use scalar code, but being efficient with SIMD
|
|
// in the presence of surrogate pairs may require non-trivial tables.
|
|
size_t forward = 15;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint32_t word = buf[k];
|
|
if((word & 0xFFFFFF80)==0) {
|
|
*utf8_output++ = char(word);
|
|
} else if((word & 0xFFFFF800)==0) {
|
|
*utf8_output++ = char((word>>6) | 0b11000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else if((word &0xFFFF0000 )==0) {
|
|
if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); }
|
|
*utf8_output++ = char((word>>12) | 0b11100000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
} else {
|
|
if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf- start + k), utf8_output); }
|
|
*utf8_output++ = char((word>>18) | 0b11110000);
|
|
*utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
|
|
*utf8_output++ = char((word & 0b111111) | 0b10000000);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
} // while
|
|
|
|
return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
|
|
}
|
|
/* end file src/westmere/sse_convert_utf32_to_utf8.cpp */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf32_to_utf16.cpp
|
|
/* begin file src/westmere/sse_convert_utf32_to_utf16.cpp */
|
|
template <endianness big_endian>
|
|
std::pair<const char32_t*, char16_t*> sse_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) {
|
|
|
|
const char32_t* end = buf + len;
|
|
|
|
const __m128i v_0000 = _mm_setzero_si128();
|
|
const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
|
|
__m128i forbidden_bytemask = _mm_setzero_si128();
|
|
|
|
while (buf + 8 <= end) {
|
|
__m128i in = _mm_loadu_si128((__m128i*)buf);
|
|
__m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
|
|
const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
|
|
const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
|
|
|
|
// Check if no bits set above 16th
|
|
if (saturation_bitmask == 0xffff) {
|
|
// Pack UTF-32 to UTF-16
|
|
__m128i utf16_packed = _mm_packus_epi32(in, nextin);
|
|
|
|
const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
|
|
const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
|
|
forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800));
|
|
|
|
if (big_endian) {
|
|
const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
|
|
}
|
|
|
|
_mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
|
|
utf16_output += 8;
|
|
buf += 8;
|
|
} else {
|
|
size_t forward = 7;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint32_t word = buf[k];
|
|
if((word & 0xFFFF0000)==0) {
|
|
// will not generate a surrogate pair
|
|
if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); }
|
|
*utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
|
|
} else {
|
|
// will generate a surrogate pair
|
|
if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); }
|
|
word -= 0x10000;
|
|
uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
|
|
uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
|
|
if (big_endian) {
|
|
high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
|
|
low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
|
|
}
|
|
*utf16_output++ = char16_t(high_surrogate);
|
|
*utf16_output++ = char16_t(low_surrogate);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
}
|
|
|
|
// check for invalid input
|
|
if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); }
|
|
|
|
return std::make_pair(buf, utf16_output);
|
|
}
|
|
|
|
|
|
template <endianness big_endian>
|
|
std::pair<result, char16_t*> sse_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) {
|
|
const char32_t* start = buf;
|
|
const char32_t* end = buf + len;
|
|
|
|
const __m128i v_0000 = _mm_setzero_si128();
|
|
const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
|
|
|
|
while (buf + 8 <= end) {
|
|
__m128i in = _mm_loadu_si128((__m128i*)buf);
|
|
__m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
|
|
const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
|
|
const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
|
|
|
|
// Check if no bits set above 16th
|
|
if (saturation_bitmask == 0xffff) {
|
|
// Pack UTF-32 to UTF-16
|
|
__m128i utf16_packed = _mm_packus_epi32(in, nextin);
|
|
|
|
const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
|
|
const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
|
|
const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800);
|
|
if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
|
|
return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output);
|
|
}
|
|
|
|
if (big_endian) {
|
|
const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
|
|
utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
|
|
}
|
|
|
|
_mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
|
|
utf16_output += 8;
|
|
buf += 8;
|
|
} else {
|
|
size_t forward = 7;
|
|
size_t k = 0;
|
|
if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
|
|
for(; k < forward; k++) {
|
|
uint32_t word = buf[k];
|
|
if((word & 0xFFFF0000)==0) {
|
|
// will not generate a surrogate pair
|
|
if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); }
|
|
*utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
|
|
} else {
|
|
// will generate a surrogate pair
|
|
if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); }
|
|
word -= 0x10000;
|
|
uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
|
|
uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
|
|
if (big_endian) {
|
|
high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
|
|
low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
|
|
}
|
|
*utf16_output++ = char16_t(high_surrogate);
|
|
*utf16_output++ = char16_t(low_surrogate);
|
|
}
|
|
}
|
|
buf += k;
|
|
}
|
|
}
|
|
|
|
return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
|
|
}
|
|
/* end file src/westmere/sse_convert_utf32_to_utf16.cpp */
|
|
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdutf
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h
|
|
/* begin file src/generic/buf_block_reader.h */
|
|
namespace simdutf {
|
|
namespace westmere {
|
|
namespace {
|
|
|
|
// Walks through a buffer in block-sized increments, loading the last part with spaces
|
|
template<size_t STEP_SIZE>
|
|
struct buf_block_reader {
|
|
public:
|
|
simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
|
|
simdutf_really_inline size_t block_index();
|
|
simdutf_really_inline bool has_full_block() const;
|
|
simdutf_really_inline const uint8_t *full_block() const;
|
|
/**
|
|
* Get the last block, padded with spaces.
|
|
*
|
|
* There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
|
|
* function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
|
|
* will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
|
|
*
|
|
* @return the number of effective characters in the last block.
|
|
*/
|
|
simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
|
|
simdutf_really_inline void advance();
|
|
private:
|
|
const uint8_t *buf;
|
|
const size_t len;
|
|
const size_t lenminusstep;
|
|
size_t idx;
|
|
};
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
simdutf_unused static char * format_input_text_64(const uint8_t *text) {
|
|
static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
|
|
for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
|
|
buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
|
|
}
|
|
buf[sizeof(simd8x64<uint8_t>)] = '\0';
|
|
return buf;
|
|
}
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
|
|
static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
|
|
in.store(reinterpret_cast<uint8_t*>(buf));
|
|
for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
|
|
if (buf[i] < ' ') { buf[i] = '_'; }
|
|
}
|
|
buf[sizeof(simd8x64<uint8_t>)] = '\0';
|
|
return buf;
|
|
}
|
|
|
|
/**
 * Renders a 64-bit mask as 64 characters, least significant bit first:
 * 'X' for a set bit, ' ' for a clear bit.
 *
 * @param mask  the bitmask to render
 * @return pointer to a NUL-terminated static buffer, overwritten on every call
 */
simdutf_unused static char * format_mask(uint64_t mask) {
  // Static array instead of a heap allocation: there is no allocation failure
  // to handle and nothing to free.
  static char buf[64 + 1];
  for (size_t i=0; i<64; i++) {
    // Shift a 64-bit one: shifting size_t(1) by 32 or more is undefined where
    // size_t is 32 bits wide.
    buf[i] = (mask & (uint64_t(1) << i)) ? 'X' : ' ';
  }
  buf[64] = '\0';
  return buf;
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
|
|
return idx < lenminusstep;
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
|
|
return &buf[idx];
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
|
|
if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
|
|
std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
|
|
std::memcpy(dst, buf + idx, len - idx);
|
|
return len - idx;
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
|
|
idx += STEP_SIZE;
|
|
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdutf
|
|
/* end file src/generic/buf_block_reader.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
|
|
/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
|
|
namespace simdutf {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace utf8_validation {
|
|
|
|
using namespace simd;
|
|
|
|
/**
 * Classifies each (previous byte, current byte) pair with three 16-entry
 * nibble lookups: the high nibble of the previous byte, the low nibble of the
 * previous byte, and the high nibble of the current byte. An error bit
 * survives only if all three lookups agree on it.
 *
 * @param input  current bytes
 * @param prev1  the same bytes shifted by one position (byte preceding each input byte)
 * @return per-byte error flags (bit meanings listed below)
 */
simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Error bits, listed by bit position. Patterns are "byte 1  byte 2".
  constexpr const uint8_t TOO_SHORT      = 1<<0; // 11______ 0_______
                                                 // 11______ 11______
                                                 // (lead byte/ASCII followed by lead byte/ASCII)
  constexpr const uint8_t TOO_LONG       = 1<<1; // 0_______ 10______  (ASCII followed by continuation)
  constexpr const uint8_t OVERLONG_3     = 1<<2; // 11100000 100_____
  constexpr const uint8_t TOO_LARGE      = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
                                                 // 1111011_ 1001____
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
  constexpr const uint8_t SURROGATE      = 1<<4; // 11101101 101_____
  constexpr const uint8_t OVERLONG_2     = 1<<5; // 1100000_ 10______
  constexpr const uint8_t TOO_LARGE_1000 = 1<<6; // 11110101 1000____
                                                 // 1111011_ 1000____
                                                 // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4     = 1<<6; // 11110000 1000____  (shares bit 6 with TOO_LARGE_1000)
  constexpr const uint8_t TWO_CONTS      = 1<<7; // 10______ 10______

  // Lookup on the high nibble of byte 1.
  const simd8<uint8_t> prev_high_flags = prev1.shr<4>().lookup_16<uint8_t>(
    // 0_______ ________ <ASCII in byte 1>
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    // 10______ ________ <continuation in byte 1>
    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
    // 1100____ ________ <two byte lead in byte 1>
    TOO_SHORT | OVERLONG_2,
    // 1101____ ________ <two byte lead in byte 1>
    TOO_SHORT,
    // 1110____ ________ <three byte lead in byte 1>
    TOO_SHORT | OVERLONG_3 | SURROGATE,
    // 1111____ ________ <four+ byte lead in byte 1>
    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
  );

  // These errors do not depend on the low nibble of byte 1, so every entry of
  // the low-nibble table lets them through.
  constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;
  // Entry shared by most low nibbles from 0101 upward.
  constexpr const uint8_t CARRY_LARGE = CARRY | TOO_LARGE | TOO_LARGE_1000;

  // Lookup on the low nibble of byte 1.
  const simd8<uint8_t> prev_low_flags = (prev1 & 0x0F).lookup_16<uint8_t>(
    // ____0000 ________
    CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
    // ____0001 ________
    CARRY | OVERLONG_2,
    // ____001_ ________
    CARRY,
    CARRY,

    // ____0100 ________
    CARRY | TOO_LARGE,
    // ____0101 ________
    CARRY_LARGE,
    // ____011_ ________
    CARRY_LARGE,
    CARRY_LARGE,

    // ____1___ ________ (1000 through 1100)
    CARRY_LARGE,
    CARRY_LARGE,
    CARRY_LARGE,
    CARRY_LARGE,
    CARRY_LARGE,
    // ____1101 ________
    CARRY_LARGE | SURROGATE,
    // ____111_ ________
    CARRY_LARGE,
    CARRY_LARGE
  );

  // Bits common to every continuation value of byte 2.
  constexpr const uint8_t CONT = TOO_LONG | OVERLONG_2 | TWO_CONTS;

  // Lookup on the high nibble of byte 2.
  const simd8<uint8_t> cur_high_flags = input.shr<4>().lookup_16<uint8_t>(
    // ________ 0_______ <ASCII in byte 2>
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

    // ________ 1000____
    CONT | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
    // ________ 1001____
    CONT | OVERLONG_3 | TOO_LARGE,
    // ________ 101_____
    CONT | SURROGATE | TOO_LARGE,
    CONT | SURROGATE | TOO_LARGE,

    // ________ 11______
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
  );

  return (prev_high_flags & prev_low_flags & cur_high_flags);
}
|
|
/**
 * Combines the special-case flags with the continuation requirement derived
 * from the bytes two and three positions back.
 *
 * Only the 0x80 bit of must_be_2_3_continuation's result is kept and XORed
 * into sc; 0x80 is the same bit value as TWO_CONTS in check_special_cases.
 * NOTE(review): presumably this cancels TWO_CONTS where a second or third
 * continuation is required and raises it where one is missing -- confirm
 * against must_be_2_3_continuation.
 *
 * @param input       current bytes
 * @param prev_input  previous chunk, used to fetch the bytes before input
 * @param sc          flags returned by check_special_cases for input
 * @return per-byte error flags
 */
simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
  const simd8<uint8_t> two_back = input.prev<2>(prev_input);
  const simd8<uint8_t> three_back = input.prev<3>(prev_input);
  const simd8<uint8_t> required = simd8<uint8_t>(must_be_2_3_continuation(two_back, three_back));
  const simd8<uint8_t> required_high_bit = required & uint8_t(0x80);
  return required_high_bit ^ sc;
}
|
|
|
|
//
|
|
// Return nonzero if there are incomplete multibyte characters at the end of the block:
|
|
// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
|
|
//
|
|
simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
|
|
// If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
|
|
// ... 1111____ 111_____ 11______
|
|
static const uint8_t max_array[32] = {
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
|
|
};
|
|
const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
|
|
return input.gt_bits(max_value);
|
|
}
|
|
|
|
struct utf8_checker {
|
|
// If this is nonzero, there has been a UTF-8 error.
|
|
simd8<uint8_t> error;
|
|
// The last input we received
|
|
simd8<uint8_t> prev_input_block;
|
|
// Whether the last input we received was incomplete (used for ASCII fast path)
|
|
simd8<uint8_t> prev_incomplete;
|
|
|
|
//
|
|
// Check whether the current bytes are valid UTF-8.
|
|
//
|
|
simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
|
|
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
|
|
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
|
|
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
|
|
simd8<uint8_t> sc = check_special_cases(input, prev1);
|
|
this->error |= check_multibyte_lengths(input, prev_input, sc);
|
|
}
|
|
|
|
// The only problem that can happen at EOF is that a multibyte character is too short
|
|
// or a byte value too large in the last bytes: check_special_cases only checks for bytes
|
|
// too large in the first of two bytes.
|
|
simdutf_really_inline void check_eof() {
|
|
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
|
|
// possibly finish them.
|
|
this->error |= this->prev_incomplete;
|
|
}
|
|
|
|
simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
|
|
if(simdutf_likely(is_ascii(input))) {
|
|
this->error |= this->prev_incomplete;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support either two or four chunks per 64-byte block.");
|
|
if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
|
|
this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
|
|
|
|
}
|
|
}
|
|
|
|
// do not forget to call check_eof!
|
|
simdutf_really_inline bool errors() const {
|
|
return this->error.any_bits_set_anywhere();
|
|
}
|
|
|
|
}; // struct utf8_checker
|
|
} // namespace utf8_validation
|
|
|
|
using utf8_validation::utf8_checker;
|
|
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
|
|
/* begin file src/generic/utf8_validation/utf8_validator.h */
|
|
namespace simdutf {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace utf8_validation {
|
|
|
|
/**
|
|
* Validates that the string is actual UTF-8.
|
|
*/
|
|
template<class checker>
|
|
bool generic_validate_utf8(const uint8_t * input, size_t length) {
|
|
checker c{};
|
|
buf_block_reader<64> reader(input, length);
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
c.check_eof();
|
|
return !c.errors();
|
|
}
|
|
|
|
bool generic_validate_utf8(const char * input, size_t length) {
|
|
return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
|
|
}
|
|
|
|
/**
|
|
* Validates that the string is actual UTF-8 and stops on errors.
|
|
*/
|
|
template<class checker>
|
|
result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
|
|
checker c{};
|
|
buf_block_reader<64> reader(input, length);
|
|
size_t count{0};
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
c.check_next_input(in);
|
|
if(c.errors()) {
|
|
if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
|
|
result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input), reinterpret_cast<const char*>(input + count), length - count);
|
|
res.count += count;
|
|
return res;
|
|
}
|
|
reader.advance();
|
|
count += 64;
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
c.check_eof();
|
|
if (c.errors()) {
|
|
if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
|
|
result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input), reinterpret_cast<const char*>(input) + count, length - count);
|
|
res.count += count;
|
|
return res;
|
|
} else {
|
|
return result(error_code::SUCCESS, length);
|
|
}
|
|
}
|
|
|
|
// Overload for `char` buffers: reinterprets the bytes as uint8_t and runs the
// templated error-reporting validator with utf8_checker.
result generic_validate_utf8_with_errors(const char * input, size_t length) {
  const uint8_t * bytes = reinterpret_cast<const uint8_t *>(input);
  return generic_validate_utf8_with_errors<utf8_checker>(bytes, length);
}
|
|
|
|
template<class checker>
|
|
bool generic_validate_ascii(const uint8_t * input, size_t length) {
|
|
buf_block_reader<64> reader(input, length);
|
|
uint8_t blocks[64]{};
|
|
simd::simd8x64<uint8_t> running_or(blocks);
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
running_or |= in;
|
|
reader.advance();
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
running_or |= in;
|
|
return running_or.is_ascii();
|
|
}
|
|
|
|
bool generic_validate_ascii(const char * input, size_t length) {
|
|
return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
|
|
}
|
|
|
|
template<class checker>
|
|
result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
|
|
buf_block_reader<64> reader(input, length);
|
|
size_t count{0};
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
if (!in.is_ascii()) {
|
|
result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
|
|
return result(res.error, count + res.count);
|
|
}
|
|
reader.advance();
|
|
|
|
count += 64;
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
if (!in.is_ascii()) {
|
|
result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
|
|
return result(res.error, count + res.count);
|
|
} else {
|
|
return result(error_code::SUCCESS, length);
|
|
}
|
|
}
|
|
|
|
// Overload for `char` buffers: reinterprets the bytes as uint8_t and runs the
// templated error-reporting ASCII validator.
result generic_validate_ascii_with_errors(const char * input, size_t length) {
  const uint8_t * bytes = reinterpret_cast<const uint8_t *>(input);
  return generic_validate_ascii_with_errors<utf8_checker>(bytes, length);
}
|
|
|
|
} // namespace utf8_validation
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_validation/utf8_validator.h */
|
|
// transcoding from UTF-8 to UTF-16
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
|
|
/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
|
|
|
|
|
|
namespace simdutf {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace utf8_to_utf16 {
|
|
|
|
using namespace simd;
|
|
|
|
template <endianness endian>
|
|
simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
|
|
char16_t* utf16_output) noexcept {
|
|
// The implementation is not specific to haswell and should be moved to the generic directory.
|
|
size_t pos = 0;
|
|
char16_t* start{utf16_output};
|
|
const size_t safety_margin = 16; // to avoid overruns!
|
|
while(pos + 64 + safety_margin <= size) {
|
|
// this loop could be unrolled further. For example, we could process the mask
|
|
// far more than 64 bytes.
|
|
simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
|
|
if(in.is_ascii()) {
|
|
in.store_ascii_as_utf16<endian>(utf16_output);
|
|
utf16_output += 64;
|
|
pos += 64;
|
|
} else {
|
|
// Slow path. We hope that the compiler will recognize that this is a slow path.
|
|
// Anything that is not a continuation mask is a 'leading byte', that is, the
|
|
// start of a new code point.
|
|
uint64_t utf8_continuation_mask = in.lt(-65 + 1);
|
|
// -65 is 0b10111111 in two-complement's, so largest possible continuation byte
|
|
uint64_t utf8_leading_mask = ~utf8_continuation_mask;
|
|
// The *start* of code points is not so useful, rather, we want the *end* of code points.
|
|
uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
|
|
// We process in blocks of up to 12 bytes except possibly
|
|
// for fast paths which may process up to 16 bytes. For the
|
|
// slow path to work, we should have at least 12 input bytes left.
|
|
size_t max_starting_point = (pos + 64) - 12;
|
|
// Next loop is going to run at least five times when using solely
|
|
// the slow/regular path, and at least four times if there are fast paths.
|
|
while(pos < max_starting_point) {
|
|
// Performance note: our ability to compute 'consumed' and
|
|
// then shift and recompute is critical. If there is a
|
|
// latency of, say, 4 cycles on getting 'consumed', then
|
|
// the inner loop might have a total latency of about 6 cycles.
|
|
// Yet we process between 6 to 12 inputs bytes, thus we get
|
|
// a speed limit between 1 cycle/byte and 0.5 cycle/byte
|
|
// for this section of the code. Hence, there is a limit
|
|
// to how much we can further increase this latency before
|
|
// it seriously harms performance.
|
|
//
|
|
// Thus we may allow convert_masked_utf8_to_utf16 to process
|
|
// more bytes at a time under a fast-path mode where 16 bytes
|
|
// are consumed at once (e.g., when encountering ASCII).
|
|
size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
|
|
utf8_end_of_code_point_mask, utf16_output);
|
|
pos += consumed;
|
|
utf8_end_of_code_point_mask >>= consumed;
|
|
}
|
|
// At this point there may remain between 0 and 12 bytes in the
|
|
// 64-byte block. These bytes will be processed again. So we have an
|
|
// 80% efficiency (in the worst case). In practice we expect an
|
|
// 85% to 90% efficiency.
|
|
}
|
|
}
|
|
utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
|
|
return utf16_output - start;
|
|
}
|
|
|
|
} // namespace utf8_to_utf16
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
|
|
/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
|
|
|
|
|
|
namespace simdutf {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace utf8_to_utf16 {
|
|
using namespace simd;
|
|
|
|
|
|
// Classifies every (previous byte, current byte) pair with three 16-entry
// nibble lookups and ANDs the results: a bit survives only when the high
// nibble of byte 1, the low nibble of byte 1 and the high nibble of byte 2
// all agree on the same error class.
//
// @param input  current bytes (byte 2 of each pair)
// @param prev1  the same bytes shifted by one position (byte 1 of each pair)
// @return per-byte error-class bits
simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // One bit per error class; the patterns show "byte 1 byte 2".
  constexpr const uint8_t TOO_SHORT      = 1<<0; // 11______ 0_______
                                                 // 11______ 11______
  constexpr const uint8_t TOO_LONG       = 1<<1; // 0_______ 10______
  constexpr const uint8_t OVERLONG_3     = 1<<2; // 11100000 100_____
  constexpr const uint8_t TOO_LARGE      = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
                                                 // 1111011_ 1001____
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
  constexpr const uint8_t SURROGATE      = 1<<4; // 11101101 101_____
  constexpr const uint8_t OVERLONG_2     = 1<<5; // 1100000_ 10______
  constexpr const uint8_t TOO_LARGE_1000 = 1<<6; // 11110101 1000____
                                                 // 1111011_ 1000____
                                                 // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4     = 1<<6; // 11110000 1000____
                                                 // (shares bit 6 with TOO_LARGE_1000)
  constexpr const uint8_t TWO_CONTS      = 1<<7; // 10______ 10______

  // Classes that do not depend on the low nibble of byte 1.
  constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;
  // Low-nibble entry shared by every value from 0101 upward.
  constexpr const uint8_t CARRY_LARGE = CARRY | TOO_LARGE | TOO_LARGE_1000;
  // Classes common to every continuation byte in byte 2.
  constexpr const uint8_t CONT_COMMON = TOO_LONG | OVERLONG_2 | TWO_CONTS;

  // Table indexed by the high nibble of byte 1.
  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
    // 0_______ ________ <ASCII in byte 1>
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    // 10______ ________ <continuation in byte 1>
    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
    // 1100____ ________ <two byte lead in byte 1>
    TOO_SHORT | OVERLONG_2,
    // 1101____ ________ <two byte lead in byte 1>
    TOO_SHORT,
    // 1110____ ________ <three byte lead in byte 1>
    TOO_SHORT | OVERLONG_3 | SURROGATE,
    // 1111____ ________ <four+ byte lead in byte 1>
    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
  );

  // Table indexed by the low nibble of byte 1.
  const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
    // ____0000 ________
    CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
    // ____0001 ________
    CARRY | OVERLONG_2,
    // ____001_ ________
    CARRY, CARRY,
    // ____0100 ________
    CARRY | TOO_LARGE,
    // ____0101 ________
    CARRY_LARGE,
    // ____011_ ________
    CARRY_LARGE, CARRY_LARGE,
    // ____1000 .. ____1100 ________
    CARRY_LARGE, CARRY_LARGE, CARRY_LARGE, CARRY_LARGE, CARRY_LARGE,
    // ____1101 ________
    CARRY_LARGE | SURROGATE,
    // ____111_ ________
    CARRY_LARGE, CARRY_LARGE
  );

  // Table indexed by the high nibble of byte 2.
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
    // ________ 0_______ <ASCII in byte 2>
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    // ________ 1000____
    CONT_COMMON | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
    // ________ 1001____
    CONT_COMMON | OVERLONG_3 | TOO_LARGE,
    // ________ 101_____
    CONT_COMMON | SURROGATE | TOO_LARGE,
    CONT_COMMON | SURROGATE | TOO_LARGE,
    // ________ 11______
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
  );

  return (byte_1_high & byte_1_low & byte_2_high);
}
|
|
// Combines the special-case bits `sc` with the positions at which
// must_be_2_3_continuation() (defined elsewhere) reports that a 2nd or 3rd
// continuation byte is required: only bit 7 (0x80) of that result is kept
// and XOR-ed with `sc`.
simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
  // The input shifted by two and by three bytes, pulling from prev_input.
  const simd8<uint8_t> two_back = input.prev<2>(prev_input);
  const simd8<uint8_t> three_back = input.prev<3>(prev_input);
  const simd8<uint8_t> needs_continuation = simd8<uint8_t>(must_be_2_3_continuation(two_back, three_back));
  const simd8<uint8_t> needs_continuation_80 = needs_continuation & uint8_t(0x80);
  return needs_continuation_80 ^ sc;
}
|
|
|
|
|
|
struct validating_transcoder {
|
|
// If this is nonzero, there has been a UTF-8 error.
|
|
simd8<uint8_t> error;
|
|
|
|
validating_transcoder() : error(uint8_t(0)) {}
|
|
//
|
|
// Check whether the current bytes are valid UTF-8.
|
|
//
|
|
simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
|
|
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
|
|
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
|
|
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
|
|
simd8<uint8_t> sc = check_special_cases(input, prev1);
|
|
this->error |= check_multibyte_lengths(input, prev_input, sc);
|
|
}
|
|
|
|
|
|
template <endianness endian>
|
|
simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) {
|
|
size_t pos = 0;
|
|
char16_t* start{utf16_output};
|
|
// In the worst case, we have the haswell kernel which can cause an overflow of
|
|
// 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
|
|
// and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
|
|
// much more than 8 bytes. However, you cannot generally assume that you have valid
|
|
// UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
|
|
// to give us a good margin.
|
|
size_t leading_byte = 0;
|
|
size_t margin = size;
|
|
for(; margin > 0 && leading_byte < 8; margin--) {
|
|
leading_byte += (int8_t(in[margin-1]) > -65);
|
|
}
|
|
// If the input is long enough, then we have that margin-1 is the eight last leading byte.
|
|
const size_t safety_margin = size - margin + 1; // to avoid overruns!
|
|
while(pos + 64 + safety_margin <= size) {
|
|
simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
|
|
if(input.is_ascii()) {
|
|
input.store_ascii_as_utf16<endian>(utf16_output);
|
|
utf16_output += 64;
|
|
pos += 64;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support either two or four chunks per 64-byte block.");
|
|
auto zero = simd8<uint8_t>{uint8_t(0)};
|
|
if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
uint64_t utf8_continuation_mask = input.lt(-65 + 1);
|
|
uint64_t utf8_leading_mask = ~utf8_continuation_mask;
|
|
uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
|
|
// We process in blocks of up to 12 bytes except possibly
|
|
// for fast paths which may process up to 16 bytes. For the
|
|
// slow path to work, we should have at least 12 input bytes left.
|
|
size_t max_starting_point = (pos + 64) - 12;
|
|
// Next loop is going to run at least five times.
|
|
while(pos < max_starting_point) {
|
|
// Performance note: our ability to compute 'consumed' and
|
|
// then shift and recompute is critical. If there is a
|
|
// latency of, say, 4 cycles on getting 'consumed', then
|
|
// the inner loop might have a total latency of about 6 cycles.
|
|
// Yet we process between 6 to 12 inputs bytes, thus we get
|
|
// a speed limit between 1 cycle/byte and 0.5 cycle/byte
|
|
// for this section of the code. Hence, there is a limit
|
|
// to how much we can further increase this latency before
|
|
// it seriously harms performance.
|
|
size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
|
|
utf8_end_of_code_point_mask, utf16_output);
|
|
pos += consumed;
|
|
utf8_end_of_code_point_mask >>= consumed;
|
|
}
|
|
// At this point there may remain between 0 and 12 bytes in the
|
|
// 64-byte block. These bytes will be processed again. So we have an
|
|
// 80% efficiency (in the worst case). In practice we expect an
|
|
// 85% to 90% efficiency.
|
|
}
|
|
}
|
|
if(errors()) { return 0; }
|
|
if(pos < size) {
|
|
size_t howmany = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
|
|
if(howmany == 0) { return 0; }
|
|
utf16_output += howmany;
|
|
}
|
|
return utf16_output - start;
|
|
}
|
|
|
|
template <endianness endian>
|
|
simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) {
|
|
size_t pos = 0;
|
|
char16_t* start{utf16_output};
|
|
// In the worst case, we have the haswell kernel which can cause an overflow of
|
|
// 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
|
|
// and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
|
|
// much more than 8 bytes. However, you cannot generally assume that you have valid
|
|
// UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
|
|
// to give us a good margin.
|
|
size_t leading_byte = 0;
|
|
size_t margin = size;
|
|
for(; margin > 0 && leading_byte < 8; margin--) {
|
|
leading_byte += (int8_t(in[margin-1]) > -65);
|
|
}
|
|
// If the input is long enough, then we have that margin-1 is the eight last leading byte.
|
|
const size_t safety_margin = size - margin + 1; // to avoid overruns!
|
|
while(pos + 64 + safety_margin <= size) {
|
|
simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
|
|
if(input.is_ascii()) {
|
|
input.store_ascii_as_utf16<endian>(utf16_output);
|
|
utf16_output += 64;
|
|
pos += 64;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support either two or four chunks per 64-byte block.");
|
|
auto zero = simd8<uint8_t>{uint8_t(0)};
|
|
if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], zero);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
if (errors()) {
|
|
// rewind_and_convert_with_errors will seek a potential error from in+pos onward,
|
|
// with the ability to go back up to pos bytes, and read size-pos bytes forward.
|
|
result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
|
|
res.count += pos;
|
|
return res;
|
|
}
|
|
uint64_t utf8_continuation_mask = input.lt(-65 + 1);
|
|
uint64_t utf8_leading_mask = ~utf8_continuation_mask;
|
|
uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
|
|
// We process in blocks of up to 12 bytes except possibly
|
|
// for fast paths which may process up to 16 bytes. For the
|
|
// slow path to work, we should have at least 12 input bytes left.
|
|
size_t max_starting_point = (pos + 64) - 12;
|
|
// Next loop is going to run at least five times.
|
|
while(pos < max_starting_point) {
|
|
// Performance note: our ability to compute 'consumed' and
|
|
// then shift and recompute is critical. If there is a
|
|
// latency of, say, 4 cycles on getting 'consumed', then
|
|
// the inner loop might have a total latency of about 6 cycles.
|
|
// Yet we process between 6 to 12 inputs bytes, thus we get
|
|
// a speed limit between 1 cycle/byte and 0.5 cycle/byte
|
|
// for this section of the code. Hence, there is a limit
|
|
// to how much we can further increase this latency before
|
|
// it seriously harms performance.
|
|
size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
|
|
utf8_end_of_code_point_mask, utf16_output);
|
|
pos += consumed;
|
|
utf8_end_of_code_point_mask >>= consumed;
|
|
}
|
|
// At this point there may remain between 0 and 12 bytes in the
|
|
// 64-byte block. These bytes will be processed again. So we have an
|
|
// 80% efficiency (in the worst case). In practice we expect an
|
|
// 85% to 90% efficiency.
|
|
}
|
|
}
|
|
if(errors()) {
|
|
// rewind_and_convert_with_errors will seek a potential error from in+pos onward,
|
|
// with the ability to go back up to pos bytes, and read size-pos bytes forward.
|
|
result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
|
|
res.count += pos;
|
|
return res;
|
|
}
|
|
if(pos < size) {
|
|
// rewind_and_convert_with_errors will seek a potential error from in+pos onward,
|
|
// with the ability to go back up to pos bytes, and read size-pos bytes forward.
|
|
result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
|
|
if (res.error) { // In case of error, we want the error position
|
|
res.count += pos;
|
|
return res;
|
|
} else { // In case of success, we want the number of word written
|
|
utf16_output += res.count;
|
|
}
|
|
}
|
|
return result(error_code::SUCCESS, utf16_output - start);
|
|
}
|
|
|
|
simdutf_really_inline bool errors() const {
|
|
return this->error.any_bits_set_anywhere();
|
|
}
|
|
|
|
}; // struct utf8_checker
|
|
} // utf8_to_utf16 namespace
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
|
|
// transcoding from UTF-8 to UTF-32
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h
|
|
/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
|
|
|
|
namespace simdutf {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace utf8_to_utf32 {
|
|
|
|
using namespace simd;
|
|
|
|
|
|
// Transcodes UTF-8 that the caller guarantees to be valid into UTF-32.
// No validation is performed here.
//
// @param input         UTF-8 bytes
// @param size          number of bytes at `input`
// @param utf32_output  destination buffer
// @return number of char32_t units written
simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
    char32_t* utf32_output) noexcept {
  char32_t* const out_begin{utf32_output};
  size_t cursor = 0;
  const size_t safety_margin = 16; // to avoid overruns!

  while (cursor + 64 + safety_margin <= size) {
    simd8x64<int8_t> block(reinterpret_cast<const int8_t *>(input + cursor));

    if (block.is_ascii()) {
      // Fast path: 64 ASCII bytes become 64 UTF-32 units.
      block.store_ascii_as_utf32(utf32_output);
      utf32_output += 64;
      cursor += 64;
      continue;
    }

    // As a signed byte, -65 is 0b10111111, the largest possible continuation
    // byte, so "< -64" selects exactly the continuation bytes.
    uint64_t continuation_bits = block.lt(-65 + 1);
    uint64_t leading_bits = ~continuation_bits;
    // Shifting the leading-byte mask by one marks the ends of code points.
    uint64_t end_of_code_point_bits = leading_bits>>1;

    // Stop 12 bytes before the end of the block; the rest is read again on
    // the next iteration.
    const size_t last_start = (cursor + 64) - 12;
    // utf32_output is not advanced here; presumably the callee advances it
    // (it is not visible in this section) - TODO confirm.
    while (cursor < last_start) {
      size_t used = convert_masked_utf8_to_utf32(input + cursor,
                        end_of_code_point_bits, utf32_output);
      cursor += used;
      end_of_code_point_bits >>= used;
    }
  }

  // The scalar routine finishes whatever the vector loop left over.
  utf32_output += scalar::utf8_to_utf32::convert_valid(input + cursor, size - cursor, utf32_output);
  return utf32_output - out_begin;
}
|
|
|
|
|
|
} // namespace utf8_to_utf32
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
|
|
/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
|
|
|
|
|
|
namespace simdutf {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace utf8_to_utf32 {
|
|
using namespace simd;
|
|
|
|
|
|
// Classifies every (previous byte, current byte) pair with three 16-entry
// nibble lookups and ANDs the results: a bit survives only when the high
// nibble of byte 1, the low nibble of byte 1 and the high nibble of byte 2
// all agree on the same error class.
//
// @param input  current bytes (byte 2 of each pair)
// @param prev1  the same bytes shifted by one position (byte 1 of each pair)
// @return per-byte error-class bits
simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // One bit per error class; the patterns show "byte 1 byte 2".
  constexpr const uint8_t TOO_SHORT      = 1<<0; // 11______ 0_______
                                                 // 11______ 11______
  constexpr const uint8_t TOO_LONG       = 1<<1; // 0_______ 10______
  constexpr const uint8_t OVERLONG_3     = 1<<2; // 11100000 100_____
  constexpr const uint8_t TOO_LARGE      = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
                                                 // 1111011_ 1001____
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
  constexpr const uint8_t SURROGATE      = 1<<4; // 11101101 101_____
  constexpr const uint8_t OVERLONG_2     = 1<<5; // 1100000_ 10______
  constexpr const uint8_t TOO_LARGE_1000 = 1<<6; // 11110101 1000____
                                                 // 1111011_ 1000____
                                                 // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4     = 1<<6; // 11110000 1000____
                                                 // (shares bit 6 with TOO_LARGE_1000)
  constexpr const uint8_t TWO_CONTS      = 1<<7; // 10______ 10______

  // Classes that do not depend on the low nibble of byte 1.
  constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;
  // Low-nibble entry shared by every value from 0101 upward.
  constexpr const uint8_t CARRY_LARGE = CARRY | TOO_LARGE | TOO_LARGE_1000;
  // Classes common to every continuation byte in byte 2.
  constexpr const uint8_t CONT_COMMON = TOO_LONG | OVERLONG_2 | TWO_CONTS;

  // Table indexed by the high nibble of byte 1.
  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
    // 0_______ ________ <ASCII in byte 1>
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    // 10______ ________ <continuation in byte 1>
    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
    // 1100____ ________ <two byte lead in byte 1>
    TOO_SHORT | OVERLONG_2,
    // 1101____ ________ <two byte lead in byte 1>
    TOO_SHORT,
    // 1110____ ________ <three byte lead in byte 1>
    TOO_SHORT | OVERLONG_3 | SURROGATE,
    // 1111____ ________ <four+ byte lead in byte 1>
    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
  );

  // Table indexed by the low nibble of byte 1.
  const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
    // ____0000 ________
    CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
    // ____0001 ________
    CARRY | OVERLONG_2,
    // ____001_ ________
    CARRY, CARRY,
    // ____0100 ________
    CARRY | TOO_LARGE,
    // ____0101 ________
    CARRY_LARGE,
    // ____011_ ________
    CARRY_LARGE, CARRY_LARGE,
    // ____1000 .. ____1100 ________
    CARRY_LARGE, CARRY_LARGE, CARRY_LARGE, CARRY_LARGE, CARRY_LARGE,
    // ____1101 ________
    CARRY_LARGE | SURROGATE,
    // ____111_ ________
    CARRY_LARGE, CARRY_LARGE
  );

  // Table indexed by the high nibble of byte 2.
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
    // ________ 0_______ <ASCII in byte 2>
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    // ________ 1000____
    CONT_COMMON | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
    // ________ 1001____
    CONT_COMMON | OVERLONG_3 | TOO_LARGE,
    // ________ 101_____
    CONT_COMMON | SURROGATE | TOO_LARGE,
    CONT_COMMON | SURROGATE | TOO_LARGE,
    // ________ 11______
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
  );

  return (byte_1_high & byte_1_low & byte_2_high);
}
|
|
// Combines the special-case bits `sc` with the positions at which
// must_be_2_3_continuation() (defined elsewhere) reports that a 2nd or 3rd
// continuation byte is required: only bit 7 (0x80) of that result is kept
// and XOR-ed with `sc`.
simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
  // The input shifted by two and by three bytes, pulling from prev_input.
  const simd8<uint8_t> two_back = input.prev<2>(prev_input);
  const simd8<uint8_t> three_back = input.prev<3>(prev_input);
  const simd8<uint8_t> needs_continuation = simd8<uint8_t>(must_be_2_3_continuation(two_back, three_back));
  const simd8<uint8_t> needs_continuation_80 = needs_continuation & uint8_t(0x80);
  return needs_continuation_80 ^ sc;
}
|
|
|
|
|
|
struct validating_transcoder {
|
|
// If this is nonzero, there has been a UTF-8 error.
|
|
simd8<uint8_t> error;
|
|
|
|
// Starts with `error` initialized from uint8_t(0), i.e. no error recorded yet.
validating_transcoder() : error(uint8_t(0)) {}
|
|
//
|
|
// Check whether the current bytes are valid UTF-8.
|
|
//
|
|
simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
|
|
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
|
|
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
|
|
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
|
|
simd8<uint8_t> sc = check_special_cases(input, prev1);
|
|
this->error |= check_multibyte_lengths(input, prev_input, sc);
|
|
}
|
|
|
|
|
|
|
|
// Validates and transcodes `size` bytes at `in` into `utf32_output`.
// Returns the number of char32_t units written, or 0 when a validation
// error was recorded or the scalar tail conversion returned 0.
simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) {
  char32_t* const out_begin{utf32_output};
  size_t pos = 0;

  // convert_masked_utf8_to_utf32 may write beyond what it reports (the
  // original note cites an 8-byte overflow for the haswell kernel). Since
  // the input cannot be assumed valid, walk back from the end until four
  // non-continuation bytes have been seen and keep the vector loop away
  // from that tail.
  size_t leads_seen = 0;
  size_t tail_start = size;
  while (tail_start > 0 && leads_seen < 4) {
    leads_seen += (int8_t(in[tail_start-1]) > -65);
    tail_start--;
  }
  // If the input is long enough, tail_start-1 is the fourth-last leading byte.
  const size_t safety_margin = size - tail_start + 1; // to avoid overruns!

  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> block(reinterpret_cast<const int8_t *>(in + pos));

    if (block.is_ascii()) {
      // Fast path: 64 ASCII bytes become 64 UTF-32 units.
      block.store_ascii_as_utf32(utf32_output);
      utf32_output += 64;
      pos += 64;
      continue;
    }

    // The chunk checks are spelled out by hand: a for-loop is reportedly
    // not good enough under Visual Studio.
    static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
        "We support either two or four chunks per 64-byte block.");
    auto zero = simd8<uint8_t>{uint8_t(0)};
    // Each block is checked on its own: the first chunk has zeros as its
    // predecessor.
    this->check_utf8_bytes(block.chunks[0], zero);
    this->check_utf8_bytes(block.chunks[1], block.chunks[0]);
    if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
      this->check_utf8_bytes(block.chunks[2], block.chunks[1]);
      this->check_utf8_bytes(block.chunks[3], block.chunks[2]);
    }

    // Signed bytes below -64 are continuation bytes; everything else starts
    // a code point, and shifting by one marks code point ends.
    uint64_t continuation_bits = block.lt(-65 + 1);
    uint64_t leading_bits = ~continuation_bits;
    uint64_t end_of_code_point_bits = leading_bits>>1;

    // Blocks of up to 12 bytes are processed at a time (fast paths may take
    // up to 16), so at least 12 input bytes must remain per step.
    const size_t last_start = (pos + 64) - 12;

    // Runs at least five times. The latency of obtaining `used` bounds the
    // throughput of this loop, so the callee must stay cheap.
    while (pos < last_start) {
      size_t used = convert_masked_utf8_to_utf32(in + pos,
                        end_of_code_point_bits, utf32_output);
      pos += used;
      end_of_code_point_bits >>= used;
    }
    // Between 0 and 12 bytes of the 64-byte block may remain; they are
    // processed again (80% efficiency in the worst case).
  }

  if (errors()) { return 0; }

  if (pos < size) {
    // Scalar conversion of the tail; 0 signals failure.
    size_t tail_units = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
    if (tail_units == 0) { return 0; }
    utf32_output += tail_units;
  }
  return utf32_output - out_begin;
}
|
|
|
|
// Validates and transcodes the UTF-8 bytes in [in, in + size) into UTF-32
// values stored at utf32_output.
//
// On success the returned result carries error_code::SUCCESS and the number
// of char32_t values written. On failure the result comes from
// scalar::utf8_to_utf32::rewind_and_convert_with_errors, with its count
// increased by the input offset at which that scalar pass was started.
simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) {
  char32_t* const output_begin = utf32_output;
  size_t offset = 0;

  // The haswell kernel is the worst case: convert_masked_utf8_to_utf32 may
  // overflow by 8 bytes. Skipping the last 16 input bytes would be enough for
  // valid data, but validity cannot be assumed here, so we instead walk back
  // from the end until four non-continuation bytes have been seen and keep
  // everything from there on out of the vectorized loop.
  size_t lead_bytes_seen = 0;
  size_t tail_start = size;
  while (tail_start > 0 && lead_bytes_seen < 4) {
    if (int8_t(in[tail_start - 1]) > -65) { lead_bytes_seen++; }
    tail_start--;
  }
  // For long enough input, tail_start - 1 indexes the fourth-last leading byte.
  const size_t reserved_tail = size - tail_start + 1; // guards against overruns

  while (offset + 64 + reserved_tail <= size) {
    simd8x64<int8_t> block(reinterpret_cast<const int8_t *>(in + offset));

    // Pure ASCII block: widen all 64 bytes in one go.
    if (block.is_ascii()) {
      block.store_ascii_as_utf32(utf32_output);
      utf32_output += 64;
      offset += 64;
      continue;
    }

    // The chunk checks are written out by hand: a for-loop is not good enough
    // under Visual Studio.
    static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
        "We support either two or four chunks per 64-byte block.");
    auto zero = simd8<uint8_t>{uint8_t(0)};
    this->check_utf8_bytes(block.chunks[0], zero);
    this->check_utf8_bytes(block.chunks[1], block.chunks[0]);
    if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
      this->check_utf8_bytes(block.chunks[2], block.chunks[1]);
      this->check_utf8_bytes(block.chunks[3], block.chunks[2]);
    }

    if (errors()) {
      // Let the scalar routine locate the error, then shift its count by the
      // offset at which it was started.
      result failure = scalar::utf8_to_utf32::rewind_and_convert_with_errors(offset, in + offset, size - offset, utf32_output);
      failure.count += offset;
      return failure;
    }

    const uint64_t continuation_bits = block.lt(-65 + 1);
    uint64_t code_point_end_bits = (~continuation_bits) >> 1;

    // Blocks of up to 12 bytes are processed at a time (fast paths may take
    // up to 16), so the slow path needs at least 12 input bytes left in the
    // 64-byte block. The loop below therefore runs at least five times.
    const size_t last_safe_start = offset + (64 - 12);
    while (offset < last_safe_start) {
      // Performance note: the latency of obtaining 'advanced' and then
      // shifting the mask is critical. With about 4 cycles to get 'advanced',
      // one iteration costs about 6 cycles for 6 to 12 input bytes, i.e.
      // between 1 and 0.5 cycle/byte. There is little room to add latency
      // here without seriously harming performance.
      const size_t advanced = convert_masked_utf8_to_utf32(in + offset,
          code_point_end_bits, utf32_output);
      offset += advanced;
      code_point_end_bits >>= advanced;
    }
    // Between 0 and 12 bytes of the 64-byte block may remain; they are
    // processed again by the next iteration (80% efficiency in the worst
    // case, 85% to 90% expected in practice).
  }

  if (errors()) {
    result failure = scalar::utf8_to_utf32::rewind_and_convert_with_errors(offset, in + offset, size - offset, utf32_output);
    failure.count += offset;
    return failure;
  }

  if (offset < size) {
    result tail = scalar::utf8_to_utf32::rewind_and_convert_with_errors(offset, in + offset, size - offset, utf32_output);
    if (tail.error) {
      // On error the caller wants the error position.
      tail.count += offset;
      return tail;
    }
    // On success the count is the number of values written by the tail pass.
    utf32_output += tail.count;
  }
  return result(error_code::SUCCESS, utf32_output - output_begin);
}
|
|
|
|
// Reports whether any bit of the accumulated `error` member is set.
simdutf_really_inline bool errors() const {
  const bool any_error_bit = this->error.any_bits_set_anywhere();
  return any_error_bit;
}
|
|
|
|
}; // struct validating_transcoder
|
|
} // utf8_to_utf32 namespace
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
|
|
// other functions
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8.h
|
|
/* begin file src/generic/utf8.h */
|
|
|
|
namespace simdutf {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace utf8 {
|
|
|
|
using namespace simd;
|
|
|
|
// Counts the bytes of [in, in + size) that are not UTF-8 continuation bytes.
// Whole 64-byte blocks go through the SIMD path; whatever is left over is
// handed to scalar::utf8::count_code_points.
simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
  size_t total = 0;
  size_t offset = 0;
  while (offset + 64 <= size) {
    simd8x64<int8_t> block(reinterpret_cast<const int8_t *>(in + offset));
    // One bit per byte that compares below -64 as a signed value
    // (the continuation bytes).
    const uint64_t continuation_bits = block.lt(-65 + 1);
    total += 64 - count_ones(continuation_bits);
    offset += 64;
  }
  return total + scalar::utf8::count_code_points(in + offset, size - offset);
}
|
|
|
|
|
|
// Computes how many 16-bit units the UTF-8 input [in, in + size) occupies
// once converted to UTF-16. Whole 64-byte blocks go through the SIMD path;
// the remainder is handed to scalar::utf8::utf16_length_from_utf8.
// This algorithm could no doubt be improved!
simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
  size_t total = 0;
  size_t offset = 0;
  while (offset + 64 <= size) {
    simd8x64<int8_t> block(reinterpret_cast<const int8_t *>(in + offset));
    const uint64_t continuation_bits = block.lt(-65 + 1);
    // One unit for every byte that is not a continuation (i.e. each leading
    // byte) ...
    total += 64 - count_ones(continuation_bits);
    // ... plus one more for every byte >= 240.
    int64_t four_byte_leads = block.gteq_unsigned(240);
    total += count_ones(four_byte_leads);
    offset += 64;
  }
  return total + scalar::utf8::utf16_length_from_utf8(in + offset, size - offset);
}
|
|
|
|
|
|
// The UTF-32 length of a UTF-8 input is taken to be its code point count,
// so this simply forwards to count_code_points.
simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) {
  const size_t code_points = count_code_points(in, size);
  return code_points;
}
|
|
} // utf8 namespace
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf8.h */
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf16.h
|
|
/* begin file src/generic/utf16.h */
|
|
namespace simdutf {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace utf16 {
|
|
|
|
template <endianness big_endian>
|
|
simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) {
|
|
size_t pos = 0;
|
|
size_t count = 0;
|
|
for(;pos + 32 <= size; pos += 32) {
|
|
simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
|
|
if (!match_system(big_endian)) input.swap_bytes();
|
|
uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
|
|
count += count_ones(not_pair) / 2;
|
|
}
|
|
return count + scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
|
|
}
|
|
|
|
template <endianness big_endian>
|
|
simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) {
|
|
size_t pos = 0;
|
|
size_t count = 0;
|
|
// This algorithm could no doubt be improved!
|
|
for(;pos + 32 <= size; pos += 32) {
|
|
simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
|
|
if (!match_system(big_endian)) input.swap_bytes();
|
|
uint64_t ascii_mask = input.lteq(0x7F);
|
|
uint64_t twobyte_mask = input.lteq(0x7FF);
|
|
uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
|
|
|
|
size_t ascii_count = count_ones(ascii_mask) / 2;
|
|
size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2;
|
|
size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2;
|
|
size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
|
|
count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
|
|
}
|
|
return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos, size - pos);
|
|
}
|
|
|
|
template <endianness big_endian>
|
|
simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) {
|
|
return count_code_points<big_endian>(in, size);
|
|
}
|
|
|
|
// Writes to `output` the `size` 16-bit units of `in` with the two bytes of
// every unit swapped. Whole blocks of 32 units go through the SIMD path; the
// remainder is handed to scalar::utf16::change_endianness_utf16.
simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) {
  size_t offset = 0;
  for (; offset + 32 <= size; offset += 32, output += 32) {
    simd16x32<uint16_t> block(reinterpret_cast<const uint16_t *>(in + offset));
    block.swap_bytes();
    block.store(reinterpret_cast<uint16_t *>(output));
  }
  scalar::utf16::change_endianness_utf16(in + offset, size - offset, output);
}
|
|
|
|
} // utf16
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdutf
|
|
/* end file src/generic/utf16.h */
|
|
//
|
|
// Implementation-specific overrides
|
|
//
|
|
|
|
namespace simdutf {
|
|
namespace westmere {
|
|
|
|
// Guesses the encoding(s) of the `length` bytes at `input`.
// A byte-order mark, when present, is trusted. Otherwise even-length input is
// analysed by sse_detect_encodings, while odd-length input is only tested
// for being valid UTF-8.
simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
  const auto from_bom = simdutf::BOM::check_bom(input, length);
  if (from_bom != encoding_type::unspecified) { return from_bom; }

  const bool even_length = (length % 2 == 0);
  if (even_length) {
    return sse_detect_encodings<utf8_validation::utf8_checker>(input, length);
  }
  return implementation::validate_utf8(input, length)
             ? simdutf::encoding_type::UTF8
             : simdutf::encoding_type::unspecified;
}
|
|
|
|
// Returns the verdict of the generic UTF-8 validator for the `len` bytes at `buf`.
simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  const bool is_valid = westmere::utf8_validation::generic_validate_utf8(buf, len);
  return is_valid;
}
|
|
|
|
// Returns the result of the generic error-reporting UTF-8 validator for the
// `len` bytes at `buf`.
simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
  const result outcome = westmere::utf8_validation::generic_validate_utf8_with_errors(buf, len);
  return outcome;
}
|
|
|
|
// Returns the verdict of the generic ASCII validator for the `len` bytes at `buf`.
simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
  const bool is_ascii = westmere::utf8_validation::generic_validate_ascii(buf, len);
  return is_ascii;
}
|
|
|
|
// Returns the result of the generic error-reporting ASCII validator for the
// `len` bytes at `buf`.
simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
  const result outcome = westmere::utf8_validation::generic_validate_ascii_with_errors(buf, len);
  return outcome;
}
|
|
|
|
// Validates `len` little-endian UTF-16 units at `buf`.
// The SSE routine returns the position where it stopped; a null position is
// treated as a failure. Anything after that position is checked by the
// scalar validator.
simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
  const char16_t* resume_at = sse_validate_utf16<endianness::LITTLE>(buf, len);
  if (resume_at == nullptr) { return false; }
  const size_t remaining = len - (resume_at - buf);
  return scalar::utf16::validate<endianness::LITTLE>(resume_at, remaining);
}
|
|
|
|
// Validates `len` big-endian UTF-16 units at `buf`.
// The SSE routine returns the position where it stopped; a null position is
// treated as a failure. Anything after that position is checked by the
// scalar validator.
simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
  const char16_t* resume_at = sse_validate_utf16<endianness::BIG>(buf, len);
  if (resume_at == nullptr) { return false; }
  const size_t remaining = len - (resume_at - buf);
  return scalar::utf16::validate<endianness::BIG>(resume_at, remaining);
}
|
|
|
|
// Validates `len` little-endian UTF-16 units at `buf`, reporting an error code
// and a position.
// If the SSE pass covered the whole input its result is returned unchanged.
// Otherwise the scalar validator runs from where the SSE pass stopped, and
// its count is offset by that position.
simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
  result simd_outcome = sse_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
  const size_t checked = simd_outcome.count;
  if (checked == len) { return simd_outcome; }
  result tail_outcome = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + checked, len - checked);
  return result(tail_outcome.error, checked + tail_outcome.count);
}
|
|
|
|
// Validates `len` big-endian UTF-16 units at `buf`, reporting an error code
// and a position.
// If the SSE pass covered the whole input its result is returned unchanged.
// Otherwise the scalar validator runs from where the SSE pass stopped, and
// its count is offset by that position.
simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
  result simd_outcome = sse_validate_utf16_with_errors<endianness::BIG>(buf, len);
  const size_t checked = simd_outcome.count;
  if (checked == len) { return simd_outcome; }
  result tail_outcome = scalar::utf16::validate_with_errors<endianness::BIG>(buf + checked, len - checked);
  return result(tail_outcome.error, checked + tail_outcome.count);
}
|
|
|
|
// Validates `len` UTF-32 values at `buf`.
// The SSE routine returns the position where it stopped; a null position is
// treated as a failure. Anything after that position is checked by the
// scalar validator.
simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
  const char32_t* resume_at = sse_validate_utf32le(buf, len);
  if (resume_at == nullptr) { return false; }
  const size_t remaining = len - (resume_at - buf);
  return scalar::utf32::validate(resume_at, remaining);
}
|
|
|
|
// Validates `len` UTF-32 values at `buf`, reporting an error code and a position.
// If the SSE pass covered the whole input its result is returned unchanged.
// Otherwise the scalar validator runs from where the SSE pass stopped, and
// its count is offset by that position.
simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
  result simd_outcome = sse_validate_utf32le_with_errors(buf, len);
  const size_t checked = simd_outcome.count;
  if (checked == len) { return simd_outcome; }
  result tail_outcome = scalar::utf32::validate_with_errors(buf + checked, len - checked);
  return result(tail_outcome.error, checked + tail_outcome.count);
}
|
|
|
|
// Converts `len` UTF-8 bytes at `buf` to little-endian UTF-16 at
// `utf16_output` using a fresh validating transcoder, and returns the
// transcoder's result.
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder transcoder;
  const size_t written = transcoder.convert<endianness::LITTLE>(buf, len, utf16_output);
  return written;
}
|
|
|
|
// Converts `len` UTF-8 bytes at `buf` to big-endian UTF-16 at `utf16_output`
// using a fresh validating transcoder, and returns the transcoder's result.
simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder transcoder;
  const size_t written = transcoder.convert<endianness::BIG>(buf, len, utf16_output);
  return written;
}
|
|
|
|
// Error-reporting conversion of `len` UTF-8 bytes at `buf` to little-endian
// UTF-16 at `utf16_output`; returns the validating transcoder's result.
simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder transcoder;
  const result outcome = transcoder.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
  return outcome;
}
|
|
|
|
// Error-reporting conversion of `len` UTF-8 bytes at `buf` to big-endian
// UTF-16 at `utf16_output`; returns the validating transcoder's result.
simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
  utf8_to_utf16::validating_transcoder transcoder;
  const result outcome = transcoder.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
  return outcome;
}
|
|
|
|
|
|
// Converts `size` UTF-8 bytes at `input` to little-endian UTF-16 at
// `utf16_output` through utf8_to_utf16::convert_valid.
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size,
    char16_t* utf16_output) const noexcept {
  const size_t written = utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size, utf16_output);
  return written;
}
|
|
|
|
// Converts `size` UTF-8 bytes at `input` to big-endian UTF-16 at
// `utf16_output` through utf8_to_utf16::convert_valid.
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size,
    char16_t* utf16_output) const noexcept {
  const size_t written = utf8_to_utf16::convert_valid<endianness::BIG>(input, size, utf16_output);
  return written;
}
|
|
|
|
// Converts `len` UTF-8 bytes at `buf` to UTF-32 at `utf32_output` using a
// fresh validating transcoder, and returns the transcoder's result.
simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder transcoder;
  const size_t written = transcoder.convert(buf, len, utf32_output);
  return written;
}
|
|
|
|
// Error-reporting conversion of `len` UTF-8 bytes at `buf` to UTF-32 at
// `utf32_output`; returns the validating transcoder's result.
simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder transcoder;
  const result outcome = transcoder.convert_with_errors(buf, len, utf32_output);
  return outcome;
}
|
|
|
|
// Converts `size` UTF-8 bytes at `input` to UTF-32 at `utf32_output` through
// utf8_to_utf32::convert_valid.
simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
    char32_t* utf32_output) const noexcept {
  const size_t written = utf8_to_utf32::convert_valid(input, size, utf32_output);
  return written;
}
|
|
|
|
// Converts `len` little-endian UTF-16 units at `buf` to UTF-8 at `utf8_output`.
// Returns the number of bytes written, or 0 when the SSE pass reports a null
// input position or the scalar pass over the leftover returns 0.
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  std::pair<const char16_t*, char*> simd_end = sse_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
  const char16_t* next_in = simd_end.first;
  char* next_out = simd_end.second;
  if (next_in == nullptr) { return 0; }

  const size_t simd_written = next_out - utf8_output;
  if (next_in == buf + len) { return simd_written; }

  // The SSE pass stopped early: finish the leftover with the scalar converter.
  const size_t tail_written = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
      next_in, len - (next_in - buf), next_out);
  if (tail_written == 0) { return 0; }
  return simd_written + tail_written;
}
|
|
|
|
// Converts `len` big-endian UTF-16 units at `buf` to UTF-8 at `utf8_output`.
// Returns the number of bytes written, or 0 when the SSE pass reports a null
// input position or the scalar pass over the leftover returns 0.
simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  std::pair<const char16_t*, char*> simd_end = sse_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
  const char16_t* next_in = simd_end.first;
  char* next_out = simd_end.second;
  if (next_in == nullptr) { return 0; }

  const size_t simd_written = next_out - utf8_output;
  if (next_in == buf + len) { return simd_written; }

  // The SSE pass stopped early: finish the leftover with the scalar converter.
  const size_t tail_written = scalar::utf16_to_utf8::convert<endianness::BIG>(
      next_in, len - (next_in - buf), next_out);
  if (tail_written == 0) { return 0; }
  return simd_written + tail_written;
}
|
|
|
|
// Error-reporting conversion of `len` little-endian UTF-16 units at `buf` to
// UTF-8 at `utf8_output`.
// On error the returned count is a position in the input; on success it is
// the number of bytes written to `utf8_output`.
simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  // The SSE routine always reports a position in the input buffer in its
  // count, even when it processed everything.
  std::pair<result, char*> simd_part = westmere::sse_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
  result status = simd_part.first;
  char* out_cursor = simd_part.second;
  // An error found here already carries the right position.
  if (status.error) { return status; }

  const size_t consumed = status.count;
  if (consumed != len) {
    // No error so far, but input remains: let the scalar converter finish.
    result tail = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
        buf + consumed, len - consumed, out_cursor);
    if (tail.error) {
      tail.count += consumed;
      return tail;
    }
    out_cursor += tail.count;
  }
  // Success: report the number of 8-bit units written.
  status.count = out_cursor - utf8_output;
  return status;
}
|
|
|
|
// Error-reporting conversion of `len` big-endian UTF-16 units at `buf` to
// UTF-8 at `utf8_output`.
// On error the returned count is a position in the input; on success it is
// the number of bytes written to `utf8_output`.
simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  // The SSE routine always reports a position in the input buffer in its
  // count, even when it processed everything.
  std::pair<result, char*> simd_part = westmere::sse_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
  result status = simd_part.first;
  char* out_cursor = simd_part.second;
  // An error found here already carries the right position.
  if (status.error) { return status; }

  const size_t consumed = status.count;
  if (consumed != len) {
    // No error so far, but input remains: let the scalar converter finish.
    result tail = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
        buf + consumed, len - consumed, out_cursor);
    if (tail.error) {
      tail.count += consumed;
      return tail;
    }
    out_cursor += tail.count;
  }
  // Success: report the number of 8-bit units written.
  status.count = out_cursor - utf8_output;
  return status;
}
|
|
|
|
// The "valid input" variant reuses the validating little-endian
// UTF-16 -> UTF-8 converter.
simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  const size_t written = this->convert_utf16le_to_utf8(buf, len, utf8_output);
  return written;
}
|
|
|
|
// The "valid input" variant reuses the validating big-endian
// UTF-16 -> UTF-8 converter.
simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
  const size_t written = this->convert_utf16be_to_utf8(buf, len, utf8_output);
  return written;
}
|
|
|
|
// Converts `len` UTF-32 values at `buf` to UTF-8 at `utf8_output`.
// Returns the number of bytes written, or 0 when the SSE pass reports a null
// input position or the scalar pass over the leftover returns 0.
simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
  std::pair<const char32_t*, char*> simd_end = sse_convert_utf32_to_utf8(buf, len, utf8_output);
  const char32_t* next_in = simd_end.first;
  char* next_out = simd_end.second;
  if (next_in == nullptr) { return 0; }

  const size_t simd_written = next_out - utf8_output;
  if (next_in == buf + len) { return simd_written; }

  // The SSE pass stopped early: finish the leftover with the scalar converter.
  const size_t tail_written = scalar::utf32_to_utf8::convert(
      next_in, len - (next_in - buf), next_out);
  if (tail_written == 0) { return 0; }
  return simd_written + tail_written;
}
|
|
|
|
// Error-reporting conversion of `len` UTF-32 values at `buf` to UTF-8 at
// `utf8_output`.
// Whenever the SSE pass did not reach the end of the input, the scalar
// converter is run from the position where it stopped. A scalar error is
// returned with its count offset by that position. Otherwise the count is
// set to the number of bytes written to `utf8_output`.
simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
  // The SSE routine always reports a position in the input buffer in its
  // count, even when it processed everything.
  std::pair<result, char*> simd_part = westmere::sse_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
  result status = simd_part.first;
  char* out_cursor = simd_part.second;

  const size_t consumed = status.count;
  if (consumed != len) {
    result tail = scalar::utf32_to_utf8::convert_with_errors(
        buf + consumed, len - consumed, out_cursor);
    if (tail.error) {
      tail.count += consumed;
      return tail;
    }
    out_cursor += tail.count;
  }
  // Report the number of 8-bit units written.
  status.count = out_cursor - utf8_output;
  return status;
}
|
|
|
|
// Converts `len` little-endian UTF-16 units at `buf` to UTF-32 at `utf32_output`.
// Returns the number of char32_t values written, or 0 when the SSE pass
// reports a null input position or the scalar pass over the leftover returns 0.
simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  std::pair<const char16_t*, char32_t*> simd_end = sse_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
  const char16_t* next_in = simd_end.first;
  char32_t* next_out = simd_end.second;
  if (next_in == nullptr) { return 0; }

  const size_t simd_written = next_out - utf32_output;
  if (next_in == buf + len) { return simd_written; }

  // The SSE pass stopped early: finish the leftover with the scalar converter.
  const size_t tail_written = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
      next_in, len - (next_in - buf), next_out);
  if (tail_written == 0) { return 0; }
  return simd_written + tail_written;
}
|
|
|
|
// Converts `len` big-endian UTF-16 units at `buf` to UTF-32 at `utf32_output`.
// Returns the number of char32_t values written, or 0 when the SSE pass
// reports a null input position or the scalar pass over the leftover returns 0.
simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  std::pair<const char16_t*, char32_t*> simd_end = sse_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
  const char16_t* next_in = simd_end.first;
  char32_t* next_out = simd_end.second;
  if (next_in == nullptr) { return 0; }

  const size_t simd_written = next_out - utf32_output;
  if (next_in == buf + len) { return simd_written; }

  // The SSE pass stopped early: finish the leftover with the scalar converter.
  const size_t tail_written = scalar::utf16_to_utf32::convert<endianness::BIG>(
      next_in, len - (next_in - buf), next_out);
  if (tail_written == 0) { return 0; }
  return simd_written + tail_written;
}
|
|
|
|
// Error-reporting conversion of `len` little-endian UTF-16 units at `buf` to
// UTF-32 at `utf32_output`.
// On error the returned count is a position in the input; on success it is
// the number of char32_t values written to `utf32_output`.
simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  // The SSE routine always reports a position in the input buffer in its
  // count, even when it processed everything.
  std::pair<result, char32_t*> simd_part = westmere::sse_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
  result status = simd_part.first;
  char32_t* out_cursor = simd_part.second;
  // An error found here already carries the right position.
  if (status.error) { return status; }

  const size_t consumed = status.count;
  if (consumed != len) {
    // No error so far, but input remains: let the scalar converter finish.
    result tail = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
        buf + consumed, len - consumed, out_cursor);
    if (tail.error) {
      tail.count += consumed;
      return tail;
    }
    out_cursor += tail.count;
  }
  // Success: report the number of 32-bit units written.
  status.count = out_cursor - utf32_output;
  return status;
}
|
|
|
|
// Error-reporting conversion of `len` big-endian UTF-16 units at `buf` to
// UTF-32 at `utf32_output`.
// On error the returned count is a position in the input; on success it is
// the number of char32_t values written to `utf32_output`.
simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  // The SSE routine always reports a position in the input buffer in its
  // count, even when it processed everything.
  std::pair<result, char32_t*> simd_part = westmere::sse_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
  result status = simd_part.first;
  char32_t* out_cursor = simd_part.second;
  // An error found here already carries the right position.
  if (status.error) { return status; }

  const size_t consumed = status.count;
  if (consumed != len) {
    // No error so far, but input remains: let the scalar converter finish.
    result tail = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
        buf + consumed, len - consumed, out_cursor);
    if (tail.error) {
      tail.count += consumed;
      return tail;
    }
    out_cursor += tail.count;
  }
  // Success: report the number of 32-bit units written.
  status.count = out_cursor - utf32_output;
  return status;
}
|
|
|
|
// The "valid input" variant reuses the validating UTF-32 -> UTF-8 converter.
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
  const size_t written = this->convert_utf32_to_utf8(buf, len, utf8_output);
  return written;
}
|
|
|
|
// Converts `len` UTF-32 values at `buf` to little-endian UTF-16 at `utf16_output`.
// Returns the number of char16_t units written, or 0 when the SSE pass
// reports a null input position or the scalar pass over the leftover returns 0.
simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  std::pair<const char32_t*, char16_t*> simd_end = sse_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
  const char32_t* next_in = simd_end.first;
  char16_t* next_out = simd_end.second;
  if (next_in == nullptr) { return 0; }

  const size_t simd_written = next_out - utf16_output;
  if (next_in == buf + len) { return simd_written; }

  // The SSE pass stopped early: finish the leftover with the scalar converter.
  const size_t tail_written = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
      next_in, len - (next_in - buf), next_out);
  if (tail_written == 0) { return 0; }
  return simd_written + tail_written;
}
|
|
|
|
// Converts `len` UTF-32 values at `buf` to big-endian UTF-16 at `utf16_output`.
// Returns the number of char16_t units written, or 0 when the SSE pass
// reports a null input position or the scalar pass over the leftover returns 0.
simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  std::pair<const char32_t*, char16_t*> simd_end = sse_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
  const char32_t* next_in = simd_end.first;
  char16_t* next_out = simd_end.second;
  if (next_in == nullptr) { return 0; }

  const size_t simd_written = next_out - utf16_output;
  if (next_in == buf + len) { return simd_written; }

  // The SSE pass stopped early: finish the leftover with the scalar converter.
  const size_t tail_written = scalar::utf32_to_utf16::convert<endianness::BIG>(
      next_in, len - (next_in - buf), next_out);
  if (tail_written == 0) { return 0; }
  return simd_written + tail_written;
}
|
|
|
|
// Error-reporting conversion of `len` UTF-32 values at `buf` to little-endian
// UTF-16 at `utf16_output`.
// Whenever the SSE pass did not reach the end of the input, the scalar
// converter is run from the position where it stopped. A scalar error is
// returned with its count offset by that position. Otherwise the count is
// set to the number of char16_t units written to `utf16_output`.
simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  // The SSE routine always reports a position in the input buffer in its
  // count, even when it processed everything.
  std::pair<result, char16_t*> simd_part = westmere::sse_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
  result status = simd_part.first;
  char16_t* out_cursor = simd_part.second;

  const size_t consumed = status.count;
  if (consumed != len) {
    result tail = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
        buf + consumed, len - consumed, out_cursor);
    if (tail.error) {
      tail.count += consumed;
      return tail;
    }
    out_cursor += tail.count;
  }
  // Report the number of 16-bit units written.
  status.count = out_cursor - utf16_output;
  return status;
}
|
|
|
|
// Error-reporting conversion of `len` UTF-32 values at `buf` to big-endian
// UTF-16 at `utf16_output`.
// Whenever the SSE pass did not reach the end of the input, the scalar
// converter is run from the position where it stopped. A scalar error is
// returned with its count offset by that position. Otherwise the count is
// set to the number of char16_t units written to `utf16_output`.
simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  // The SSE routine always reports a position in the input buffer in its
  // count, even when it processed everything.
  std::pair<result, char16_t*> simd_part = westmere::sse_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
  result status = simd_part.first;
  char16_t* out_cursor = simd_part.second;

  const size_t consumed = status.count;
  if (consumed != len) {
    result tail = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
        buf + consumed, len - consumed, out_cursor);
    if (tail.error) {
      tail.count += consumed;
      return tail;
    }
    out_cursor += tail.count;
  }
  // Report the number of 16-bit units written.
  status.count = out_cursor - utf16_output;
  return status;
}
|
|
|
|
// The "valid input" variant reuses the validating UTF-32 -> UTF-16LE converter.
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  const size_t written = this->convert_utf32_to_utf16le(buf, len, utf16_output);
  return written;
}
|
|
|
|
// The "valid input" variant reuses the validating UTF-32 -> UTF-16BE converter.
simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
  const size_t written = this->convert_utf32_to_utf16be(buf, len, utf16_output);
  return written;
}
|
|
|
|
// The "valid input" variant reuses the validating UTF-16LE -> UTF-32 converter.
simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  const size_t written = this->convert_utf16le_to_utf32(buf, len, utf32_output);
  return written;
}
|
|
|
|
// The "valid input" variant reuses the validating UTF-16BE -> UTF-32 converter.
simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
  const size_t written = this->convert_utf16be_to_utf32(buf, len, utf32_output);
  return written;
}
|
|
|
|
// Forwards to the kernel's utf16::change_endianness_utf16, which writes the
// `length` units of `input` to `output`.
void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
  return utf16::change_endianness_utf16(input, length, output);
}
|
|
|
|
// Code point count of `length` little-endian UTF-16 units at `input`.
simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
  const size_t code_points = utf16::count_code_points<endianness::LITTLE>(input, length);
  return code_points;
}
|
|
|
|
// Code point count of `length` big-endian UTF-16 units at `input`.
simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
  const size_t code_points = utf16::count_code_points<endianness::BIG>(input, length);
  return code_points;
}
|
|
|
|
// Code point count of `length` UTF-8 bytes at `input`.
simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
  const size_t code_points = utf8::count_code_points(input, length);
  return code_points;
}
|
|
|
|
// UTF-8 byte length of `length` little-endian UTF-16 units at `input`.
simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
  const size_t utf8_bytes = utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
  return utf8_bytes;
}
|
|
|
|
// UTF-8 byte length of `length` big-endian UTF-16 units at `input`.
simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
  const size_t utf8_bytes = utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
  return utf8_bytes;
}
|
|
|
|
// UTF-32 length of `length` little-endian UTF-16 units at `input`.
simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
  const size_t utf32_units = utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
  return utf32_units;
}
|
|
|
|
// UTF-32 length of `length` big-endian UTF-16 units at `input`.
simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
  const size_t utf32_units = utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
  return utf32_units;
}
|
|
|
|
// UTF-16 length (in 16-bit units) of `length` UTF-8 bytes at `input`.
simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
  const size_t utf16_units = utf8::utf16_length_from_utf8(input, length);
  return utf16_units;
}
|
|
|
|
// Computes how many bytes the `length` UTF-32 values at `input` occupy once
// converted to UTF-8.
//
// Four values are classified per iteration with SSE2: each one is assumed to
// take 4 bytes, and 3, 2 or 1 bytes are then subtracted for values that fit
// in 7, 11 or 16 bits respectively. The leftover (fewer than four values) is
// handed to scalar::utf32::utf8_length_from_utf32.
simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
  const __m128i v_00000000 = _mm_setzero_si128();
  const __m128i v_ffffff80 = _mm_set1_epi32(static_cast<int>(0xffffff80u));
  const __m128i v_fffff800 = _mm_set1_epi32(static_cast<int>(0xfffff800u));
  const __m128i v_ffff0000 = _mm_set1_epi32(static_cast<int>(0xffff0000u));
  size_t pos = 0;
  size_t count = 0;
  for(;pos + 4 <= length; pos += 4) {
    // Const-correct unaligned load (the input is never written to).
    const __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + pos));

    // Lane is all-ones when no bit above the given width is set.
    const __m128i ascii_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffffff80), v_00000000);
    const __m128i one_two_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_fffff800), v_00000000);
    const __m128i two_bytes_bytemask = _mm_xor_si128(one_two_bytes_bytemask, ascii_bytes_bytemask);
    const __m128i one_two_three_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000);
    const __m128i three_bytes_bytemask = _mm_xor_si128(one_two_three_bytes_bytemask, one_two_bytes_bytemask);

    const uint16_t ascii_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(ascii_bytes_bytemask));
    const uint16_t two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(two_bytes_bytemask));
    const uint16_t three_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(three_bytes_bytemask));

    // _mm_movemask_epi8 yields one bit per byte, i.e. four bits per 32-bit
    // lane, hence the division by 4.
    size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4;
    size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4;
    size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4;
    // 16 = 4 values * 4 bytes, minus the savings of the shorter encodings.
    count += 16 - 3*ascii_count - 2*two_bytes_count - three_bytes_count;
  }
  return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
}
|
|
|
|
// Computes how many 16-bit units the `length` UTF-32 values at `input` occupy
// once converted to UTF-16.
//
// Four values are classified per iteration with SSE2: every value takes one
// unit, plus one extra unit when any of its upper 16 bits is set (it then
// needs a surrogate pair). The leftover (fewer than four values) is handed to
// scalar::utf32::utf16_length_from_utf32.
simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
  const __m128i v_00000000 = _mm_setzero_si128();
  const __m128i v_ffff0000 = _mm_set1_epi32(static_cast<int>(0xffff0000u));
  size_t pos = 0;
  size_t count = 0;
  for(;pos + 4 <= length; pos += 4) {
    // Const-correct unaligned load (the input is never written to).
    const __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input + pos));
    // Lane is all-ones when the value fits in 16 bits, i.e. when it does NOT
    // need a surrogate pair.
    const __m128i single_unit_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000);
    const uint16_t single_unit_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(single_unit_bytemask));
    // Four mask bits per lane: the cleared bits, divided by 4, count the
    // values that need a second unit.
    size_t surrogate_count = (16-count_ones(single_unit_bitmask))/4;
    count += 4 + surrogate_count;
  }
  return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
}
|
|
|
|
// UTF-32 length (number of code points) of `length` UTF-8 bytes at `input`.
//
// Uses this kernel's vectorized utf8::utf32_length_from_utf8 rather than the
// scalar counter, matching what count_utf8 does for the same computation.
// Both count the bytes that are not UTF-8 continuation bytes.
simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
  return utf8::utf32_length_from_utf8(input, length);
}
|
|
|
|
} // namespace westmere
|
|
} // namespace simdutf
|
|
|
|
// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/end.h
|
|
/* begin file src/simdutf/westmere/end.h */
|
|
#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
|
|
// nothing needed.
|
|
#else
|
|
SIMDUTF_UNTARGET_REGION
|
|
#endif
|
|
|
|
/* end file src/simdutf/westmere/end.h */
|
|
/* end file src/westmere/implementation.cpp */
|
|
#endif
|
|
|
|
SIMDUTF_POP_DISABLE_WARNINGS
|
|
/* end file src/simdutf.cpp */
|