
This patch fixes the following error:
```
llvm/lib/Support/TextEncoding.cpp:274:11: error: cannot initialize a variable of type 'char *' with an rvalue of type 'const char *'
  274 |   char *Input = InputLength ? const_cast<char *>(Source.data()) : "";
      |         ^       ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
```
355 lines
12 KiB
C++
355 lines
12 KiB
C++
//===-- TextEncoding.cpp - Text encoding conversion class ---------*- C++ -*-=//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
///
|
|
/// \file
|
|
/// This file provides utility classes to convert between different character
|
|
/// encodings.
|
|
///
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "llvm/Support/TextEncoding.h"
|
|
#include "llvm/ADT/SmallString.h"
|
|
#include "llvm/ADT/SmallVector.h"
|
|
#include "llvm/ADT/StringExtras.h"
|
|
#include "llvm/Support/ConvertEBCDIC.h"
|
|
#include <system_error>
|
|
|
|
#if HAVE_ICU
|
|
#include <unicode/ucnv.h>
|
|
#elif HAVE_ICONV
|
|
#include <iconv.h>
|
|
#endif
|
|
|
|
using namespace llvm;
|
|
|
|
// Normalize the charset name with the charset alias matching algorithm proposed
|
|
// in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching.
|
|
static void normalizeCharSetName(StringRef CSName,
|
|
SmallVectorImpl<char> &Normalized) {
|
|
bool PrevDigit = false;
|
|
for (auto Ch : CSName) {
|
|
if (isAlnum(Ch)) {
|
|
Ch = toLower(Ch);
|
|
if (Ch != '0' || PrevDigit) {
|
|
PrevDigit = isDigit(Ch);
|
|
Normalized.push_back(Ch);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Maps the encoding name to enum constant if possible.
|
|
static std::optional<TextEncoding> getKnownEncoding(StringRef Name) {
|
|
SmallString<16> Normalized;
|
|
normalizeCharSetName(Name, Normalized);
|
|
if (Normalized.equals("utf8"))
|
|
return TextEncoding::UTF8;
|
|
if (Normalized.equals("ibm1047"))
|
|
return TextEncoding::IBM1047;
|
|
return std::nullopt;
|
|
}
|
|
|
|
/// Grows the output buffer after a conversion ran out of space.
///
/// The capacity is doubled, saturating at Result.max_size(). A capacity of
/// zero is treated as one before doubling: doubling zero yields zero, so a
/// caller that retries while Capacity < Result.max_size() would otherwise
/// never make progress.
///
/// Any content already in \p Result is discarded (the vector is resized to
/// zero and then to the new capacity without initializing the elements), and
/// \p Output / \p OutputLength are re-pointed at the start of the resized
/// buffer.
///
/// \param Capacity      in: current capacity; out: new capacity.
/// \param Output        out: pointer to the first byte of the new buffer.
/// \param OutputLength  out: number of bytes available at \p Output.
/// \param Result        the vector that backs the output buffer.
LLVM_ATTRIBUTE_UNUSED static void
HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength,
               SmallVectorImpl<char> &Result) {
  const size_t MaxSize = Result.max_size();
  // Guard against a zero capacity, which doubling can never grow.
  const size_t Current = Capacity ? Capacity : 1;
  Capacity = (Current < MaxSize / 2) ? 2 * Current : MaxSize;

  // Drop the partial result, then make the whole new capacity addressable.
  Result.resize(0);
  Result.resize_for_overwrite(Capacity);
  Output = static_cast<char *>(Result.data());
  OutputLength = Capacity;
}
|
|
|
|
namespace {
|
|
/// Direction of a table-driven conversion.
enum ConversionType {
  /// Source is UTF-8, target is IBM-1047.
  UTF8ToIBM1047,
  /// Source is IBM-1047, target is UTF-8.
  IBM1047ToUTF8,
};
|
|
|
|
// Support conversion between EBCDIC 1047 and UTF-8. This class uses
|
|
// built-in translation tables that allow for translation between the
|
|
// aforementioned encodings. The use of tables for conversion is only
|
|
// possible because EBCDIC 1047 is a single-byte, stateless encoding; other
|
|
// encodings are not supported.
|
|
class TextEncodingConverterTable final
|
|
: public details::TextEncodingConverterImplBase {
|
|
const ConversionType ConvType;
|
|
|
|
public:
|
|
TextEncodingConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
|
|
|
|
std::error_code convertString(StringRef Source,
|
|
SmallVectorImpl<char> &Result) override;
|
|
|
|
void reset() override {}
|
|
};
|
|
|
|
std::error_code
|
|
TextEncodingConverterTable::convertString(StringRef Source,
|
|
SmallVectorImpl<char> &Result) {
|
|
switch (ConvType) {
|
|
case IBM1047ToUTF8:
|
|
ConverterEBCDIC::convertToUTF8(Source, Result);
|
|
return std::error_code();
|
|
case UTF8ToIBM1047:
|
|
return ConverterEBCDIC::convertToEBCDIC(Source, Result);
|
|
}
|
|
llvm_unreachable("Invalid ConvType!");
|
|
return std::error_code();
|
|
}
|
|
|
|
#if HAVE_ICU
|
|
struct UConverterDeleter {
|
|
void operator()(UConverter *Converter) const {
|
|
if (Converter)
|
|
ucnv_close(Converter);
|
|
}
|
|
};
|
|
using UConverterUniquePtr = std::unique_ptr<UConverter, UConverterDeleter>;
|
|
|
|
class TextEncodingConverterICU final
|
|
: public details::TextEncodingConverterImplBase {
|
|
UConverterUniquePtr FromConvDesc;
|
|
UConverterUniquePtr ToConvDesc;
|
|
|
|
public:
|
|
TextEncodingConverterICU(UConverterUniquePtr FromConverter,
|
|
UConverterUniquePtr ToConverter)
|
|
: FromConvDesc(std::move(FromConverter)),
|
|
ToConvDesc(std::move(ToConverter)) {}
|
|
|
|
std::error_code convertString(StringRef Source,
|
|
SmallVectorImpl<char> &Result) override;
|
|
|
|
void reset() override;
|
|
};
|
|
|
|
// TODO: The current implementation discards the partial result and restarts the
|
|
// conversion from the beginning if there is a conversion error due to
|
|
// insufficient buffer size. In the future, it would better to save the partial
|
|
// result and resume the conversion for the remaining string.
|
|
// TODO: Improve translation of ICU errors to error_code
|
|
std::error_code
|
|
TextEncodingConverterICU::convertString(StringRef Source,
|
|
SmallVectorImpl<char> &Result) {
|
|
// Setup the input in case it has no backing data.
|
|
size_t InputLength = Source.size();
|
|
const char *In = InputLength ? const_cast<char *>(Source.data()) : "";
|
|
|
|
// Setup the output. We directly write into the SmallVector.
|
|
size_t Capacity = Result.capacity();
|
|
size_t OutputLength = Capacity;
|
|
Result.resize_for_overwrite(Capacity);
|
|
char *Output;
|
|
UErrorCode EC = U_ZERO_ERROR;
|
|
|
|
ucnv_setToUCallBack(&*FromConvDesc, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL,
|
|
&EC);
|
|
ucnv_setFromUCallBack(&*ToConvDesc, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL,
|
|
NULL, &EC);
|
|
assert(U_SUCCESS(EC));
|
|
|
|
do {
|
|
EC = U_ZERO_ERROR;
|
|
const char *Input = In;
|
|
|
|
Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
|
|
ucnv_convertEx(&*ToConvDesc, &*FromConvDesc, &Output, Result.end(), &Input,
|
|
In + InputLength, /*pivotStart=*/NULL,
|
|
/*pivotSource=*/NULL, /*pivotTarget=*/NULL,
|
|
/*pivotLimit=*/NULL, /*reset=*/true,
|
|
/*flush=*/true, &EC);
|
|
if (U_FAILURE(EC)) {
|
|
if (EC == U_BUFFER_OVERFLOW_ERROR) {
|
|
if (Capacity < Result.max_size()) {
|
|
HandleOverflow(Capacity, Output, OutputLength, Result);
|
|
continue;
|
|
} else
|
|
return std::error_code(E2BIG, std::generic_category());
|
|
}
|
|
// Some other error occured.
|
|
Result.resize(Output - Result.data());
|
|
return std::error_code(EILSEQ, std::generic_category());
|
|
}
|
|
break;
|
|
} while (true);
|
|
|
|
Result.resize(Output - Result.data());
|
|
return std::error_code();
|
|
}
|
|
|
|
void TextEncodingConverterICU::reset() {
|
|
ucnv_reset(&*FromConvDesc);
|
|
ucnv_reset(&*ToConvDesc);
|
|
}
|
|
|
|
#elif HAVE_ICONV
|
|
class TextEncodingConverterIconv final
|
|
: public details::TextEncodingConverterImplBase {
|
|
class UniqueIconvT {
|
|
iconv_t ConvDesc;
|
|
|
|
public:
|
|
operator iconv_t() const { return ConvDesc; }
|
|
UniqueIconvT(iconv_t CD) : ConvDesc(CD) {}
|
|
~UniqueIconvT() {
|
|
if (ConvDesc != (iconv_t)-1) {
|
|
iconv_close(ConvDesc);
|
|
ConvDesc = (iconv_t)-1;
|
|
}
|
|
}
|
|
UniqueIconvT(UniqueIconvT &&Other) : ConvDesc(Other.ConvDesc) {
|
|
Other.ConvDesc = (iconv_t)-1;
|
|
}
|
|
UniqueIconvT &operator=(UniqueIconvT &&Other) {
|
|
if (&Other != this) {
|
|
ConvDesc = Other.ConvDesc;
|
|
Other.ConvDesc = (iconv_t)-1;
|
|
}
|
|
return *this;
|
|
}
|
|
};
|
|
UniqueIconvT ConvDesc;
|
|
|
|
public:
|
|
TextEncodingConverterIconv(UniqueIconvT ConvDesc)
|
|
: ConvDesc(std::move(ConvDesc)) {}
|
|
|
|
std::error_code convertString(StringRef Source,
|
|
SmallVectorImpl<char> &Result) override;
|
|
|
|
void reset() override;
|
|
};
|
|
|
|
// TODO: The current implementation discards the partial result and restarts the
|
|
// conversion from the beginning if there is a conversion error due to
|
|
// insufficient buffer size. In the future, it would better to save the partial
|
|
// result and resume the conversion for the remaining string.
|
|
std::error_code
|
|
TextEncodingConverterIconv::convertString(StringRef Source,
|
|
SmallVectorImpl<char> &Result) {
|
|
// Setup the output. We directly write into the SmallVector.
|
|
size_t Capacity = Result.capacity();
|
|
char *Output = static_cast<char *>(Result.data());
|
|
size_t OutputLength = Capacity;
|
|
Result.resize_for_overwrite(Capacity);
|
|
|
|
size_t Ret;
|
|
// Handle errors returned from iconv().
|
|
auto HandleError = [&Capacity, &Output, &OutputLength, &Result,
|
|
this](size_t Ret) {
|
|
if (Ret == static_cast<size_t>(-1)) {
|
|
// An error occured. Check if we can gracefully handle it.
|
|
if (errno == E2BIG && Capacity < Result.max_size()) {
|
|
HandleOverflow(Capacity, Output, OutputLength, Result);
|
|
// Reset converter
|
|
reset();
|
|
return std::error_code();
|
|
} else {
|
|
// Some other error occured.
|
|
Result.resize(Output - Result.data());
|
|
return std::error_code(errno, std::generic_category());
|
|
}
|
|
} else {
|
|
// A positive return value indicates that some characters were converted
|
|
// in a nonreversible way, that is, replaced with a SUB symbol. Returning
|
|
// an error in this case makes sure that both conversion routines behave
|
|
// in the same way.
|
|
return std::make_error_code(std::errc::illegal_byte_sequence);
|
|
}
|
|
};
|
|
|
|
do {
|
|
// Setup the input. Use nullptr to reset iconv state if input length is
|
|
// zero.
|
|
size_t InputLength = Source.size();
|
|
char *Input = const_cast<char *>(InputLength ? Source.data() : "");
|
|
Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength);
|
|
if (Ret != 0) {
|
|
if (auto EC = HandleError(Ret))
|
|
return EC;
|
|
continue;
|
|
}
|
|
// Flush the converter
|
|
Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength);
|
|
if (Ret != 0) {
|
|
if (auto EC = HandleError(Ret))
|
|
return EC;
|
|
continue;
|
|
}
|
|
break;
|
|
} while (true);
|
|
|
|
// Re-adjust size to actual size.
|
|
Result.resize(Output - Result.data());
|
|
return std::error_code();
|
|
}
|
|
|
|
inline void TextEncodingConverterIconv::reset() {
|
|
iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
|
|
}
|
|
|
|
#endif // HAVE_ICONV
|
|
} // namespace
|
|
|
|
// Creates a table-driven converter for the given pair of known encodings.
// Only UTF-8 <-> IBM-1047 is supported; identical encodings and every other
// combination yield invalid_argument.
ErrorOr<TextEncodingConverter>
TextEncodingConverter::create(TextEncoding CPFrom, TextEncoding CPTo) {
  const bool ToEBCDIC =
      CPFrom == TextEncoding::UTF8 && CPTo == TextEncoding::IBM1047;
  const bool ToUTF8 =
      CPFrom == TextEncoding::IBM1047 && CPTo == TextEncoding::UTF8;

  // Text encodings should be distinct and form one of the supported pairs.
  if (CPFrom == CPTo || !(ToEBCDIC || ToUTF8))
    return std::make_error_code(std::errc::invalid_argument);

  const ConversionType Conversion = ToEBCDIC ? UTF8ToIBM1047 : IBM1047ToUTF8;
  return TextEncodingConverter(
      std::make_unique<TextEncodingConverterTable>(Conversion));
}
|
|
|
|
// Creates a converter from encoding names. If both names are known encodings
// and a table-driven converter exists for the pair, that converter is
// returned. Otherwise the names are handed to ICU or iconv, whichever the
// build provides; without either, or when the library rejects a name,
// invalid_argument is returned.
ErrorOr<TextEncodingConverter> TextEncodingConverter::create(StringRef From,
                                                             StringRef To) {
  const std::optional<TextEncoding> KnownFrom = getKnownEncoding(From);
  const std::optional<TextEncoding> KnownTo = getKnownEncoding(To);
  if (KnownFrom && KnownTo) {
    ErrorOr<TextEncodingConverter> TableConverter =
        create(*KnownFrom, *KnownTo);
    if (TableConverter)
      return TableConverter;
  }
#if HAVE_ICU
  UErrorCode EC = U_ZERO_ERROR;

  const std::string FromName = From.str();
  UConverterUniquePtr FromConvDesc(ucnv_open(FromName.c_str(), &EC));
  if (U_FAILURE(EC))
    return std::make_error_code(std::errc::invalid_argument);

  const std::string ToName = To.str();
  UConverterUniquePtr ToConvDesc(ucnv_open(ToName.c_str(), &EC));
  if (U_FAILURE(EC))
    return std::make_error_code(std::errc::invalid_argument);

  auto ICUConverter = std::make_unique<TextEncodingConverterICU>(
      std::move(FromConvDesc), std::move(ToConvDesc));
  return TextEncodingConverter(std::move(ICUConverter));
#elif HAVE_ICONV
  // iconv_open() takes the target encoding first.
  const std::string ToName = To.str();
  const std::string FromName = From.str();
  iconv_t ConvDesc = iconv_open(ToName.c_str(), FromName.c_str());
  if (ConvDesc == (iconv_t)-1)
    return std::make_error_code(std::errc::invalid_argument);
  return TextEncodingConverter(
      std::make_unique<TextEncodingConverterIconv>(ConvDesc));
#else
  return std::make_error_code(std::errc::invalid_argument);
#endif
}
|