Commit ff9c3321 authored by Kenton Varda

Support encoding to and from wchar_t arrays.

Different platforms have different sizes for wchar_t. For example:

* Linux: 32-bit (originally intended as UCS-4, rarely used in practice)
* Windows: 16-bit (originally intended as UCS-2, but now probably treated as UTF-16)
* BeOS: 8-bit (strictly intended to be UTF-8)

For KJ purposes, we'll assume wchar_t arrays use the UTF encoding appropriate to their size, whatever that may be on the target platform.

This is mainly being added because the Win32 API uses wchar_t heavily.
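
As an illustration (not part of this change), here is a hypothetical helper showing the kind of call this enables on a Windows build. MessageBoxW is the standard Win32 function; greet() and the message text are made up:

    #include <windows.h>
    #include <kj/encoding.h>

    void greet() {
      // Encode UTF-8 text as a NUL-terminated wide string for a Win32 call.
      auto wide = kj::encodeWideString(u8"Здравствуйте", true);  // nulTerminate = true
      if (wide.hadErrors) {
        // Invalid UTF-8 was replaced with U+FFFD; decide here whether that is acceptable.
      }
      MessageBoxW(nullptr, wide.begin(), L"Greeting", MB_OK);
    }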
parent 5483d8f7
@@ -30,6 +30,7 @@ CappedArray<char, sizeof(char ) * 2 + 1> hex(byte i) { return kj::hex((ui
CappedArray<char, sizeof(char ) * 2 + 1> hex(char i) { return kj::hex((uint8_t )i); }
CappedArray<char, sizeof(char16_t) * 2 + 1> hex(char16_t i) { return kj::hex((uint16_t)i); }
CappedArray<char, sizeof(char32_t) * 2 + 1> hex(char32_t i) { return kj::hex((uint32_t)i); }
CappedArray<char, sizeof(uint32_t) * 2 + 1> hex(wchar_t i) { return kj::hex((uint32_t)i); }
// Hexify chars correctly.
//
// TODO(cleanup): Should this go into string.h with the other definitions of hex()?
@@ -245,6 +246,20 @@ KJ_TEST("EncodingResult as a Maybe") {
  KJ_EXPECT(KJ_ASSERT_NONNULL(decodeUtf16(u"foo")) == "foo");
}
KJ_TEST("encode to wchar_t") {
expectRes(encodeWideString(u8"foo"), L"foo");
expectRes(encodeWideString(u8"Здравствуйте"), L"Здравствуйте");
expectRes(encodeWideString(u8"中国网络"), L"中国网络");
expectRes(encodeWideString(u8"😺☁☄🐵"), L"😺☁☄🐵");
}
KJ_TEST("decode from wchar_t") {
expectRes(decodeWideString(L"foo"), u8"foo");
expectRes(decodeWideString(L"Здравствуйте"), u8"Здравствуйте");
expectRes(decodeWideString(L"中国网络"), u8"中国网络");
expectRes(decodeWideString(L"😺☁☄🐵"), u8"😺☁☄🐵");
}

// =======================================================================================

KJ_TEST("hex encoding/decoding") {
...
@@ -247,6 +247,85 @@ EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf16) {
  return { String(result.releaseAsArray()), hadErrors };
}
namespace {
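// Helpers to "coerce" an array between element types of identical size (e.g. char16_t <->
// wchar_t on Windows). The rvalue Array overload transfers ownership of the underlying buffer
// rather than copying it.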
template <typename To, typename From>
Array<To> coerceTo(Array<From>&& array) {
  static_assert(sizeof(To) == sizeof(From), "incompatible coercion");
  Array<To> result;
  memcpy(&result, &array, sizeof(array));  // transfer ownership of the underlying buffer
  memset(&array, 0, sizeof(array));        // disarm the source so it won't free the buffer
  return result;
}
template <typename To, typename From>
ArrayPtr<To> coerceTo(ArrayPtr<From> array) {
  static_assert(sizeof(To) == sizeof(From), "incompatible coercion");
  return arrayPtr(reinterpret_cast<To*>(array.begin()), array.size());
}

template <typename To, typename From>
EncodingResult<Array<To>> coerceTo(EncodingResult<Array<From>>&& result) {
  return { coerceTo<To>(Array<From>(kj::mv(result))), result.hadErrors };
}
template <size_t s>
struct WideConverter;
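// Selects, at compile time, the UTF converter whose code unit size matches the platform's
// wchar_t: UTF-8 when wchar_t is 8-bit, UTF-16 when 16-bit, UTF-32 when 32-bit.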
template <>
struct WideConverter<sizeof(char)> {
  typedef char Type;

  static EncodingResult<Array<char>> encode(ArrayPtr<const char> text, bool nulTerminate) {
    auto result = heapArray<char>(text.size() + nulTerminate);
    memcpy(result.begin(), text.begin(), text.size());
    if (nulTerminate) result.back() = 0;
    return { kj::mv(result), false };
  }
  static EncodingResult<kj::String> decode(ArrayPtr<const char> text) {
    return { kj::heapString(text), false };
  }
};
template <>
struct WideConverter<sizeof(char16_t)> {
  typedef char16_t Type;

  static inline EncodingResult<Array<char16_t>> encode(
      ArrayPtr<const char> text, bool nulTerminate) {
    return encodeUtf16(text, nulTerminate);
  }
  static inline EncodingResult<kj::String> decode(ArrayPtr<const char16_t> text) {
    return decodeUtf16(text);
  }
};
template <>
struct WideConverter<sizeof(char32_t)> {
  typedef char32_t Type;

  static inline EncodingResult<Array<char32_t>> encode(
      ArrayPtr<const char> text, bool nulTerminate) {
    return encodeUtf32(text, nulTerminate);
  }
  static inline EncodingResult<kj::String> decode(ArrayPtr<const char32_t> text) {
    return decodeUtf32(text);
  }
};
} // namespace

EncodingResult<Array<wchar_t>> encodeWideString(ArrayPtr<const char> text, bool nulTerminate) {
  return coerceTo<wchar_t>(WideConverter<sizeof(wchar_t)>::encode(text, nulTerminate));
}

EncodingResult<String> decodeWideString(ArrayPtr<const wchar_t> wide) {
  using Converter = WideConverter<sizeof(wchar_t)>;
  return Converter::decode(coerceTo<const Converter::Type>(wide));
}

// =======================================================================================

namespace {
...
@@ -107,6 +107,18 @@ EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32);
// raised on subsequent legs unless all invalid sequences were replaced with U+FFFD (which, after
// all, is a valid code point).

EncodingResult<Array<wchar_t>> encodeWideString(
    ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<String> decodeWideString(ArrayPtr<const wchar_t> wide);
// Encode / decode strings of wchar_t, aka "wide strings". Unfortunately, different platforms have
// different definitions for wchar_t. For example, on Windows they are 16-bit and encode UTF-16,
// but on Linux they are 32-bit and encode UTF-32. Some platforms even define wchar_t as 8-bit,
// encoding UTF-8 (e.g. BeOS did this).
//
// KJ assumes that wide strings use the UTF encoding that corresponds to the size of wchar_t on
// the target platform. So, these functions are simple aliases for encodeUtf*/decodeUtf*, above
// (or simply make a copy if wchar_t is 8 bits).
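//
// A minimal round-trip sketch (illustrative only, not part of this commit's code):
//
//     auto wide = kj::encodeWideString(u8"中国网络");
//     auto text = kj::decodeWideString(wide);
//     KJ_ASSERT(!text.hadErrors && text == u8"中国网络");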

String encodeHex(ArrayPtr<const byte> bytes);
EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text);
// Encode/decode bytes as hex strings.
@@ -195,6 +207,11 @@ inline EncodingResult<Array<char32_t>> encodeUtf32(const char (&text)[s], bool n
  return encodeUtf32(arrayPtr(text, s - 1), nulTerminate);
}
template <size_t s>
inline EncodingResult<Array<wchar_t>> encodeWideString(
    const char (&text)[s], bool nulTerminate=false) {
  return encodeWideString(arrayPtr(text, s - 1), nulTerminate);
}
template <size_t s>
inline EncodingResult<String> decodeUtf16(const char16_t (&utf16)[s]) {
  return decodeUtf16(arrayPtr(utf16, s - 1));
}
@@ -203,6 +220,10 @@ inline EncodingResult<String> decodeUtf32(const char32_t (&utf32)[s]) {
  return decodeUtf32(arrayPtr(utf32, s - 1));
}
template <size_t s>
inline EncodingResult<String> decodeWideString(const wchar_t (&wide)[s]) {
  return decodeWideString(arrayPtr(wide, s - 1));
}
template <size_t s>
inline EncodingResult<Array<byte>> decodeHex(const char (&text)[s]) { inline EncodingResult<Array<byte>> decodeHex(const char (&text)[s]) {
return decodeHex(arrayPtr(text, s - 1)); return decodeHex(arrayPtr(text, s - 1));
} }
......