[libc] Reworked CharacterConverter isComplete into isFull and isEmpty (#144799)
isComplete previously meant different things for different conversion directions. Refactored bytes_processed to bytes_stored which now consistently increments on every push and decrements on pop making both directions more consistent with each other
This commit is contained in:
parent
7157f33c6c
commit
8d6e29d0d3
@ -30,18 +30,20 @@ CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
|
||||
|
||||
void CharacterConverter::clear() {
|
||||
state->partial = 0;
|
||||
state->bytes_processed = 0;
|
||||
state->bytes_stored = 0;
|
||||
state->total_bytes = 0;
|
||||
}
|
||||
|
||||
bool CharacterConverter::isComplete() {
|
||||
return state->bytes_processed == state->total_bytes;
|
||||
bool CharacterConverter::isFull() {
|
||||
return state->bytes_stored == state->total_bytes && state->total_bytes != 0;
|
||||
}
|
||||
|
||||
bool CharacterConverter::isEmpty() { return state->bytes_stored == 0; }
|
||||
|
||||
int CharacterConverter::push(char8_t utf8_byte) {
|
||||
uint8_t num_ones = static_cast<uint8_t>(cpp::countl_one(utf8_byte));
|
||||
// Checking the first byte if first push
|
||||
if (state->bytes_processed == 0) {
|
||||
if (isEmpty()) {
|
||||
// UTF-8 char has 1 byte total
|
||||
if (num_ones == 0) {
|
||||
state->total_bytes = 1;
|
||||
@ -58,21 +60,21 @@ int CharacterConverter::push(char8_t utf8_byte) {
|
||||
}
|
||||
// Invalid first byte
|
||||
else {
|
||||
// bytes_processed and total_bytes will always be 0 here
|
||||
// bytes_stored and total_bytes will always be 0 here
|
||||
state->partial = static_cast<char32_t>(0);
|
||||
return -1;
|
||||
}
|
||||
state->partial = static_cast<char32_t>(utf8_byte);
|
||||
state->bytes_processed++;
|
||||
state->bytes_stored++;
|
||||
return 0;
|
||||
}
|
||||
// Any subsequent push
|
||||
// Adding 6 more bits so need to left shift
|
||||
if (num_ones == 1 && !isComplete()) {
|
||||
if (num_ones == 1 && !isFull()) {
|
||||
char32_t byte = utf8_byte & MASK_ENCODED_BITS;
|
||||
state->partial = state->partial << ENCODED_BITS_PER_UTF8;
|
||||
state->partial |= byte;
|
||||
state->bytes_processed++;
|
||||
state->bytes_stored++;
|
||||
return 0;
|
||||
}
|
||||
// Invalid byte -> reset the state
|
||||
@ -82,11 +84,10 @@ int CharacterConverter::push(char8_t utf8_byte) {
|
||||
|
||||
int CharacterConverter::push(char32_t utf32) {
|
||||
// we can't be partially through a conversion when pushing a utf32 value
|
||||
if (!isComplete())
|
||||
if (!isEmpty())
|
||||
return -1;
|
||||
|
||||
state->partial = utf32;
|
||||
state->bytes_processed = 0;
|
||||
|
||||
// determine number of utf-8 bytes needed to represent this utf32 value
|
||||
constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
|
||||
@ -94,6 +95,7 @@ int CharacterConverter::push(char32_t utf32) {
|
||||
for (uint8_t i = 0; i < NUM_RANGES; i++) {
|
||||
if (state->partial <= MAX_VALUE_PER_UTF8_LEN[i]) {
|
||||
state->total_bytes = i + 1;
|
||||
state->bytes_stored = i + 1;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
@ -107,7 +109,7 @@ int CharacterConverter::push(char32_t utf32) {
|
||||
ErrorOr<char32_t> CharacterConverter::pop_utf32() {
|
||||
// If pop is called too early, do not reset the state, use error to determine
|
||||
// whether enough bytes have been pushed
|
||||
if (!isComplete() || state->bytes_processed == 0)
|
||||
if (!isFull())
|
||||
return Error(-1);
|
||||
char32_t utf32 = state->partial;
|
||||
// reset if successful pop
|
||||
@ -116,7 +118,7 @@ ErrorOr<char32_t> CharacterConverter::pop_utf32() {
|
||||
}
|
||||
|
||||
ErrorOr<char8_t> CharacterConverter::pop_utf8() {
|
||||
if (isComplete())
|
||||
if (isEmpty())
|
||||
return Error(-1);
|
||||
|
||||
constexpr char8_t FIRST_BYTE_HEADERS[] = {0, 0xC0, 0xE0, 0xF0};
|
||||
@ -125,9 +127,8 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
|
||||
char32_t output;
|
||||
|
||||
// Shift to get the next 6 bits from the utf32 encoding
|
||||
const size_t shift_amount =
|
||||
(state->total_bytes - state->bytes_processed - 1) * ENCODED_BITS_PER_UTF8;
|
||||
if (state->bytes_processed == 0) {
|
||||
const size_t shift_amount = (state->bytes_stored - 1) * ENCODED_BITS_PER_UTF8;
|
||||
if (isFull()) {
|
||||
/*
|
||||
Choose the correct set of most significant bits to encode the length
|
||||
of the utf8 sequence. The remaining bits contain the most significant
|
||||
@ -141,7 +142,7 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
|
||||
((state->partial >> shift_amount) & MASK_ENCODED_BITS);
|
||||
}
|
||||
|
||||
state->bytes_processed++;
|
||||
state->bytes_stored--;
|
||||
return static_cast<char8_t>(output);
|
||||
}
|
||||
|
||||
|
@ -26,7 +26,8 @@ public:
|
||||
CharacterConverter(mbstate *mbstate);
|
||||
|
||||
void clear();
|
||||
bool isComplete();
|
||||
bool isFull();
|
||||
bool isEmpty();
|
||||
|
||||
int push(char8_t utf8_byte);
|
||||
int push(char32_t utf32);
|
||||
|
@ -22,10 +22,10 @@ struct mbstate {
|
||||
|
||||
/*
|
||||
Progress towards a conversion
|
||||
For utf8 -> utf32, increases with each CharacterConverter::push(utf8_byte)
|
||||
For utf32 -> utf8, increases with each CharacterConverter::pop_utf8()
|
||||
Increases with each push(...) until it reaches total_bytes
|
||||
Decreases with each pop(...) until it reaches 0
|
||||
*/
|
||||
uint8_t bytes_processed;
|
||||
uint8_t bytes_stored;
|
||||
|
||||
// Total number of bytes that will be needed to represent this character
|
||||
uint8_t total_bytes;
|
||||
|
@ -20,17 +20,19 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
|
||||
// utf8 1-byte encodings are identical to their utf32 representations
|
||||
char32_t utf32_A = 0x41; // 'A'
|
||||
cr.push(utf32_A);
|
||||
ASSERT_TRUE(cr.isFull());
|
||||
auto popped = cr.pop_utf8();
|
||||
ASSERT_TRUE(popped.has_value());
|
||||
ASSERT_EQ(static_cast<char>(popped.value()), 'A');
|
||||
ASSERT_TRUE(cr.isComplete());
|
||||
ASSERT_TRUE(cr.isEmpty());
|
||||
|
||||
char32_t utf32_B = 0x42; // 'B'
|
||||
cr.push(utf32_B);
|
||||
ASSERT_TRUE(cr.isFull());
|
||||
popped = cr.pop_utf8();
|
||||
ASSERT_TRUE(popped.has_value());
|
||||
ASSERT_EQ(static_cast<char>(popped.value()), 'B');
|
||||
ASSERT_TRUE(cr.isComplete());
|
||||
ASSERT_TRUE(cr.isEmpty());
|
||||
|
||||
// should error if we try to pop another utf8 byte out
|
||||
popped = cr.pop_utf8();
|
||||
@ -45,26 +47,28 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
|
||||
// testing utf32: 0xff -> utf8: 0xc3 0xbf
|
||||
char32_t utf32 = 0xff;
|
||||
cr.push(utf32);
|
||||
ASSERT_TRUE(cr.isFull());
|
||||
auto popped = cr.pop_utf8();
|
||||
ASSERT_TRUE(popped.has_value());
|
||||
ASSERT_EQ(static_cast<int>(popped.value()), 0xc3);
|
||||
ASSERT_TRUE(!cr.isComplete());
|
||||
ASSERT_TRUE(!cr.isEmpty());
|
||||
popped = cr.pop_utf8();
|
||||
ASSERT_TRUE(popped.has_value());
|
||||
ASSERT_EQ(static_cast<int>(popped.value()), 0xbf);
|
||||
ASSERT_TRUE(cr.isComplete());
|
||||
ASSERT_TRUE(cr.isEmpty());
|
||||
|
||||
// testing utf32: 0x58e -> utf8: 0xd6 0x8e
|
||||
utf32 = 0x58e;
|
||||
cr.push(utf32);
|
||||
ASSERT_TRUE(cr.isFull());
|
||||
popped = cr.pop_utf8();
|
||||
ASSERT_TRUE(popped.has_value());
|
||||
ASSERT_EQ(static_cast<int>(popped.value()), 0xd6);
|
||||
ASSERT_TRUE(!cr.isComplete());
|
||||
ASSERT_TRUE(!cr.isEmpty());
|
||||
popped = cr.pop_utf8();
|
||||
ASSERT_TRUE(popped.has_value());
|
||||
ASSERT_EQ(static_cast<int>(popped.value()), 0x8e);
|
||||
ASSERT_TRUE(cr.isComplete());
|
||||
ASSERT_TRUE(cr.isEmpty());
|
||||
|
||||
// should error if we try to pop another utf8 byte out
|
||||
popped = cr.pop_utf8();
|
||||
@ -79,34 +83,36 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
|
||||
// testing utf32: 0xac15 -> utf8: 0xea 0xb0 0x95
|
||||
char32_t utf32 = 0xac15;
|
||||
cr.push(utf32);
|
||||
ASSERT_TRUE(cr.isFull());
|
||||
auto popped = cr.pop_utf8();
|
||||
ASSERT_TRUE(popped.has_value());
|
||||
ASSERT_EQ(static_cast<int>(popped.value()), 0xea);
|
||||
ASSERT_TRUE(!cr.isComplete());
|
||||
ASSERT_TRUE(!cr.isEmpty());
|
||||
popped = cr.pop_utf8();
|
||||
ASSERT_TRUE(popped.has_value());
|
||||
ASSERT_EQ(static_cast<int>(popped.value()), 0xb0);
|
||||
ASSERT_TRUE(!cr.isComplete());
|
||||
ASSERT_TRUE(!cr.isEmpty());
|
||||
popped = cr.pop_utf8();
|
||||
ASSERT_TRUE(popped.has_value());
|
||||
ASSERT_EQ(static_cast<int>(popped.value()), 0x95);
|
||||
ASSERT_TRUE(cr.isComplete());
|
||||
ASSERT_TRUE(cr.isEmpty());
|
||||
|
||||
// testing utf32: 0x267b -> utf8: 0xe2 0x99 0xbb
|
||||
utf32 = 0x267b;
|
||||
cr.push(utf32);
|
||||
ASSERT_TRUE(cr.isFull());
|
||||
popped = cr.pop_utf8();
|
||||
ASSERT_TRUE(popped.has_value());
|
||||
ASSERT_EQ(static_cast<int>(popped.value()), 0xe2);
|
||||
ASSERT_TRUE(!cr.isComplete());
|
||||
ASSERT_TRUE(!cr.isEmpty());
|
||||
popped = cr.pop_utf8();
|
||||
ASSERT_TRUE(popped.has_value());
|
||||
ASSERT_EQ(static_cast<int>(popped.value()), 0x99);
|
||||
ASSERT_TRUE(!cr.isComplete());
|
||||
ASSERT_TRUE(!cr.isEmpty());
|
||||
popped = cr.pop_utf8();
|
||||
ASSERT_TRUE(popped.has_value());
|
||||
ASSERT_EQ(static_cast<int>(popped.value()), 0xbb);
|
||||
ASSERT_TRUE(cr.isComplete());
|
||||
ASSERT_TRUE(cr.isEmpty());
|
||||
|
||||
// should error if we try to pop another utf8 byte out
|
||||
popped = cr.pop_utf8();
|
||||
@ -121,42 +127,44 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) {
|
||||
// testing utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1
|
||||
char32_t utf32 = 0x1f921;
|
||||
cr.push(utf32);
|
||||
ASSERT_TRUE(cr.isFull());
|
||||
auto popped = cr.pop_utf8();
|
||||
ASSERT_TRUE(popped.has_value());
|
||||
ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
|
||||
ASSERT_TRUE(!cr.isComplete());
|
||||
ASSERT_TRUE(!cr.isEmpty());
|
||||
popped = cr.pop_utf8();
|
||||
ASSERT_TRUE(popped.has_value());
|
||||
ASSERT_EQ(static_cast<int>(popped.value()), 0x9f);
|
||||
ASSERT_TRUE(!cr.isComplete());
|
||||
ASSERT_TRUE(!cr.isEmpty());
|
||||
popped = cr.pop_utf8();
|
||||
ASSERT_TRUE(popped.has_value());
|
||||
ASSERT_EQ(static_cast<int>(popped.value()), 0xa4);
|
||||
ASSERT_TRUE(!cr.isComplete());
|
||||
ASSERT_TRUE(!cr.isEmpty());
|
||||
popped = cr.pop_utf8();
|
||||
ASSERT_TRUE(popped.has_value());
|
||||
ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
|
||||
ASSERT_TRUE(cr.isComplete());
|
||||
ASSERT_TRUE(cr.isEmpty());
|
||||
|
||||
// testing utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1
|
||||
utf32 = 0x12121;
|
||||
cr.push(utf32);
|
||||
ASSERT_TRUE(cr.isFull());
|
||||
popped = cr.pop_utf8();
|
||||
ASSERT_TRUE(popped.has_value());
|
||||
ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
|
||||
ASSERT_TRUE(!cr.isComplete());
|
||||
ASSERT_TRUE(!cr.isEmpty());
|
||||
popped = cr.pop_utf8();
|
||||
ASSERT_TRUE(popped.has_value());
|
||||
ASSERT_EQ(static_cast<int>(popped.value()), 0x92);
|
||||
ASSERT_TRUE(!cr.isComplete());
|
||||
ASSERT_TRUE(!cr.isEmpty());
|
||||
popped = cr.pop_utf8();
|
||||
ASSERT_TRUE(popped.has_value());
|
||||
ASSERT_EQ(static_cast<int>(popped.value()), 0x84);
|
||||
ASSERT_TRUE(!cr.isComplete());
|
||||
ASSERT_TRUE(!cr.isEmpty());
|
||||
popped = cr.pop_utf8();
|
||||
ASSERT_TRUE(popped.has_value());
|
||||
ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
|
||||
ASSERT_TRUE(cr.isComplete());
|
||||
ASSERT_TRUE(cr.isEmpty());
|
||||
|
||||
// should error if we try to pop another utf8 byte out
|
||||
popped = cr.pop_utf8();
|
||||
|
@ -13,7 +13,7 @@
|
||||
|
||||
TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
|
||||
LIBC_NAMESPACE::internal::mbstate state;
|
||||
state.bytes_processed = 0;
|
||||
state.bytes_stored = 0;
|
||||
state.total_bytes = 0;
|
||||
char ch = 'A';
|
||||
|
||||
@ -28,7 +28,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
|
||||
|
||||
TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
|
||||
LIBC_NAMESPACE::internal::mbstate state;
|
||||
state.bytes_processed = 0;
|
||||
state.bytes_stored = 0;
|
||||
state.total_bytes = 0;
|
||||
const char ch[2] = {static_cast<char>(0xC2),
|
||||
static_cast<char>(0x8E)}; // car symbol
|
||||
@ -44,7 +44,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
|
||||
|
||||
TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
|
||||
LIBC_NAMESPACE::internal::mbstate state;
|
||||
state.bytes_processed = 0;
|
||||
state.bytes_stored = 0;
|
||||
state.total_bytes = 0;
|
||||
const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
|
||||
static_cast<char>(0x91)}; // ∑ sigma symbol
|
||||
@ -61,7 +61,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
|
||||
|
||||
TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
|
||||
LIBC_NAMESPACE::internal::mbstate state;
|
||||
state.bytes_processed = 0;
|
||||
state.bytes_stored = 0;
|
||||
state.total_bytes = 0;
|
||||
const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
|
||||
static_cast<char>(0xA4),
|
||||
@ -80,7 +80,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
|
||||
|
||||
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
|
||||
LIBC_NAMESPACE::internal::mbstate state;
|
||||
state.bytes_processed = 0;
|
||||
state.bytes_stored = 0;
|
||||
state.total_bytes = 0;
|
||||
const char ch = static_cast<char>(0x80); // invalid starting bit sequence
|
||||
|
||||
@ -92,7 +92,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
|
||||
|
||||
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
|
||||
LIBC_NAMESPACE::internal::mbstate state;
|
||||
state.bytes_processed = 0;
|
||||
state.bytes_stored = 0;
|
||||
state.total_bytes = 0;
|
||||
const char ch[4] = {
|
||||
static_cast<char>(0x80), static_cast<char>(0x00), static_cast<char>(0x80),
|
||||
@ -112,7 +112,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
|
||||
|
||||
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
|
||||
LIBC_NAMESPACE::internal::mbstate state;
|
||||
state.bytes_processed = 0;
|
||||
state.bytes_stored = 0;
|
||||
state.total_bytes = 0;
|
||||
// Last byte is invalid since it does not have correct starting sequence.
|
||||
// 0xC0 --> 11000000 starting sequence should be 10xxxxxx
|
||||
@ -132,7 +132,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
|
||||
|
||||
TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
|
||||
LIBC_NAMESPACE::internal::mbstate state;
|
||||
state.bytes_processed = 0;
|
||||
state.bytes_stored = 0;
|
||||
state.total_bytes = 0;
|
||||
const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
|
||||
static_cast<char>(0x80)};
|
||||
@ -153,7 +153,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
|
||||
|
||||
TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
|
||||
LIBC_NAMESPACE::internal::mbstate state;
|
||||
state.bytes_processed = 0;
|
||||
state.bytes_stored = 0;
|
||||
state.total_bytes = 0;
|
||||
const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
|
||||
static_cast<char>(0xC7), static_cast<char>(0x8C)};
|
||||
@ -179,7 +179,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
|
||||
|
||||
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) {
|
||||
LIBC_NAMESPACE::internal::mbstate state;
|
||||
state.bytes_processed = 0;
|
||||
state.bytes_stored = 0;
|
||||
state.total_bytes = 0;
|
||||
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
|
||||
const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};
|
||||
|
Loading…
x
Reference in New Issue
Block a user