Commit ff9c3321 authored by Kenton Varda

Support encoding to and from wchar_t arrays.

Different platforms have different sizes for wchar_t. For example:

* Linux: 32-bit (originally intended as UCS-4, rarely used in practice)
* Windows: 16-bit (originally intended as UCS-2, but now probably treated as UTF-16)
* BeOS: 8-bit (strictly intended to be UTF-8)

For KJ purposes, we'll assume wchar_t arrays use the UTF encoding appropriate to their size, whatever that may be on the target platform.

This is mainly being added because the Win32 API uses wchar_t heavily.
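
As an illustration (not part of this change), here is a hypothetical helper showing the kind of call this enables on a Windows build. MessageBoxW is the standard Win32 function; greet() and the message text are made up:

    #include <windows.h>
    #include <kj/encoding.h>

    void greet() {
      // Encode UTF-8 text as a NUL-terminated wide string for a Win32 call.
      auto wide = kj::encodeWideString(u8"Здравствуйте", true);  // nulTerminate = true
      if (wide.hadErrors) {
        // Invalid UTF-8 was replaced with U+FFFD; decide here whether that is acceptable.
      }
      MessageBoxW(nullptr, wide.begin(), L"Greeting", MB_OK);
    }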
parent 5483d8f7
@@ -30,6 +30,7 @@ CappedArray<char, sizeof(char ) * 2 + 1> hex(byte i) { return kj::hex((ui
CappedArray<char, sizeof(char ) * 2 + 1> hex(char i) { return kj::hex((uint8_t )i); }
CappedArray<char, sizeof(char16_t) * 2 + 1> hex(char16_t i) { return kj::hex((uint16_t)i); }
CappedArray<char, sizeof(char32_t) * 2 + 1> hex(char32_t i) { return kj::hex((uint32_t)i); }
CappedArray<char, sizeof(uint32_t) * 2 + 1> hex(wchar_t i) { return kj::hex((uint32_t)i); }
// Hexify chars correctly.
//
// TODO(cleanup): Should this go into string.h with the other definitions of hex()?
@@ -245,6 +246,20 @@ KJ_TEST("EncodingResult as a Maybe") {
  KJ_EXPECT(KJ_ASSERT_NONNULL(decodeUtf16(u"foo")) == "foo");
}
KJ_TEST("encode to wchar_t") {
expectRes(encodeWideString(u8"foo"), L"foo");
expectRes(encodeWideString(u8"Здравствуйте"), L"Здравствуйте");
expectRes(encodeWideString(u8"中国网络"), L"中国网络");
expectRes(encodeWideString(u8"😺☁☄🐵"), L"😺☁☄🐵");
}
KJ_TEST("decode from wchar_t") {
expectRes(decodeWideString(L"foo"), u8"foo");
expectRes(decodeWideString(L"Здравствуйте"), u8"Здравствуйте");
expectRes(decodeWideString(L"中国网络"), u8"中国网络");
expectRes(decodeWideString(L"😺☁☄🐵"), u8"😺☁☄🐵");
}

// =======================================================================================

KJ_TEST("hex encoding/decoding") {
...
@@ -247,6 +247,85 @@ EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf16) {
  return { String(result.releaseAsArray()), hadErrors };
}
namespace {
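// Helpers to "coerce" an array between element types of identical size (e.g. char16_t <->
// wchar_t on Windows). The rvalue Array overload transfers ownership of the underlying buffer
// rather than copying it.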
template <typename To, typename From>
Array<To> coerceTo(Array<From>&& array) {
  static_assert(sizeof(To) == sizeof(From), "incompatible coercion");
  Array<To> result;
  memcpy(&result, &array, sizeof(array));  // transfer ownership of the underlying buffer
  memset(&array, 0, sizeof(array));        // disarm the source so it won't free the buffer
  return result;
}
template <typename To, typename From>
ArrayPtr<To> coerceTo(ArrayPtr<From> array) {
  static_assert(sizeof(To) == sizeof(From), "incompatible coercion");
  return arrayPtr(reinterpret_cast<To*>(array.begin()), array.size());
}

template <typename To, typename From>
EncodingResult<Array<To>> coerceTo(EncodingResult<Array<From>>&& result) {
  return { coerceTo<To>(Array<From>(kj::mv(result))), result.hadErrors };
}
template <size_t s>
struct WideConverter;
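// Selects, at compile time, the UTF converter whose code unit size matches the platform's
// wchar_t: UTF-8 when wchar_t is 8-bit, UTF-16 when 16-bit, UTF-32 when 32-bit.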
template <>
struct WideConverter<sizeof(char)> {
  typedef char Type;

  static EncodingResult<Array<char>> encode(ArrayPtr<const char> text, bool nulTerminate) {
    auto result = heapArray<char>(text.size() + nulTerminate);
    memcpy(result.begin(), text.begin(), text.size());
    if (nulTerminate) result.back() = 0;
    return { kj::mv(result), false };
  }
  static EncodingResult<kj::String> decode(ArrayPtr<const char> text) {
    return { kj::heapString(text), false };
  }
};
template <>
struct WideConverter<sizeof(char16_t)> {
  typedef char16_t Type;

  static inline EncodingResult<Array<char16_t>> encode(
      ArrayPtr<const char> text, bool nulTerminate) {
    return encodeUtf16(text, nulTerminate);
  }
  static inline EncodingResult<kj::String> decode(ArrayPtr<const char16_t> text) {
    return decodeUtf16(text);
  }
};
template <>
struct WideConverter<sizeof(char32_t)> {
  typedef char32_t Type;

  static inline EncodingResult<Array<char32_t>> encode(
      ArrayPtr<const char> text, bool nulTerminate) {
    return encodeUtf32(text, nulTerminate);
  }
  static inline EncodingResult<kj::String> decode(ArrayPtr<const char32_t> text) {
    return decodeUtf32(text);
  }
};
} // namespace

EncodingResult<Array<wchar_t>> encodeWideString(ArrayPtr<const char> text, bool nulTerminate) {
  return coerceTo<wchar_t>(WideConverter<sizeof(wchar_t)>::encode(text, nulTerminate));
}

EncodingResult<String> decodeWideString(ArrayPtr<const wchar_t> wide) {
  using Converter = WideConverter<sizeof(wchar_t)>;
  return Converter::decode(coerceTo<const Converter::Type>(wide));
}

// =======================================================================================

namespace {
...
@@ -107,6 +107,18 @@ EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32);
// raised on subsequent legs unless all invalid sequences were replaced with U+FFFD (which, after
// all, is a valid code point).

EncodingResult<Array<wchar_t>> encodeWideString(
    ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<String> decodeWideString(ArrayPtr<const wchar_t> wide);
// Encode / decode strings of wchar_t, aka "wide strings". Unfortunately, different platforms have
// different definitions for wchar_t. For example, on Windows they are 16-bit and encode UTF-16,
// but on Linux they are 32-bit and encode UTF-32. Some platforms even define wchar_t as 8-bit,
// encoding UTF-8 (e.g. BeOS did this).
//
// KJ assumes that wide strings use the UTF encoding that corresponds to the size of wchar_t on
// the target platform. So, these functions are simple aliases for encodeUtf*/decodeUtf*, above
// (or simply make a copy if wchar_t is 8 bits).
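//
// A minimal round-trip sketch (illustrative only, not part of this commit's code):
//
//     auto wide = kj::encodeWideString(u8"中国网络");
//     auto text = kj::decodeWideString(wide);
//     KJ_ASSERT(!text.hadErrors && text == u8"中国网络");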

String encodeHex(ArrayPtr<const byte> bytes);
EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text);
// Encode/decode bytes as hex strings.
@@ -195,6 +207,11 @@ inline EncodingResult<Array<char32_t>> encodeUtf32(const char (&text)[s], bool n
  return encodeUtf32(arrayPtr(text, s - 1), nulTerminate);
}
template <size_t s>
inline EncodingResult<Array<wchar_t>> encodeWideString(
    const char (&text)[s], bool nulTerminate=false) {
  return encodeWideString(arrayPtr(text, s - 1), nulTerminate);
}
template <size_t s>
inline EncodingResult<String> decodeUtf16(const char16_t (&utf16)[s]) {
  return decodeUtf16(arrayPtr(utf16, s - 1));
}
@@ -203,6 +220,10 @@ inline EncodingResult<String> decodeUtf32(const char32_t (&utf32)[s]) {
  return decodeUtf32(arrayPtr(utf32, s - 1));
}
template <size_t s>
inline EncodingResult<String> decodeWideString(const wchar_t (&wide)[s]) {
  return decodeWideString(arrayPtr(wide, s - 1));
}
template <size_t s>
inline EncodingResult<Array<byte>> decodeHex(const char (&text)[s]) { inline EncodingResult<Array<byte>> decodeHex(const char (&text)[s]) {
return decodeHex(arrayPtr(text, s - 1)); return decodeHex(arrayPtr(text, s - 1));
} }
......