Extend Unicode encoders to support 'WTF-8'.

This allows arbitrary char16 arrays to round-trip through UTF-8 without losing information, even if the char16 arrays are not valid UTF-16. This is necessary e.g. for filesystem manipulation on Windows, where filenames contain 16-bit characters but valid UTF-16 is not enforced. Invalid UTF-16 represented in UTF-8 is affectionately known as WTF-8: http://simonsapin.github.io/wtf-8/

Extend Unicode encoders to support 'WTF-8'.
This allows arbitrary char16 arrays to round-trip through UTF-8 without losing information, even if the char16 arrays are not valid UTF-16. This is necessary e.g. for filesystem manipulation on Windows, where filenames contain 16-bit characters but valid UTF-16 is not enforced. Invalid UTF-16 represented in UTF-8 is affectionately known as WTF-8: http://simonsapin.github.io/wtf-8/
5483d8f7 · Kenton Varda · d3278477 · 5483d8f7 · 5483d8f7 · 5483d8f7
Commit 5483d8f7 authored Dec 03, 2017 by Kenton Varda
Show whitespace changes
Inline Side-by-side

Showing with 109 additions and 21 deletions

encoding-test.c++ c++/src/kj/encoding-test.c++ +45 -6

encoding.c++ c++/src/kj/encoding.c++ +27 -9

encoding.h c++/src/kj/encoding.h +37 -6

No files found.
--- a/c++/src/kj/encoding-test.c++
+++ b/c++/src/kj/encoding-test.c++
@@ -64,6 +64,13 @@ void expectRes(EncodingResult<T> result,
  expectResImpl(kj::mv(result), arrayPtr<const byte>(expected, s), errors);
 }
+// Handy reference for surrogate pair edge cases:
+//
+// \ud800 -> \xed\xa0\x80
+// \udc00 -> \xed\xb0\x80
+// \udbff -> \xed\xaf\xbf
+// \udfff -> \xed\xbf\xbf
 KJ_TEST("encode UTF-8 to UTF-16") {
  expectRes(encodeUtf16(u8"foo"), u"foo");
  expectRes(encodeUtf16(u8"Здравствуйте"), u"Здравствуйте");
@@ -113,6 +120,26 @@ KJ_TEST("invalid UTF-8 to UTF-16") {
  expectRes(encodeUtf16("\xfc\xbf\x80\x80\x80\x80"), u"\ufffd", true);
  expectRes(encodeUtf16("\xfe\xbf\x80\x80\x80\x80\x80"), u"\ufffd", true);
  expectRes(encodeUtf16("\xff\xbf\x80\x80\x80\x80\x80\x80"), u"\ufffd", true);
+  // Surrogates encoded as separate UTF-8 code points are flagged as errors but allowed to decode
+  // to UTF-16 surrogate values.
+  expectRes(encodeUtf16(u8"\ud7ff\xed\xb0\x80\xed\xaf\xbf\ue000"),
+      u"\xd7ff\xdc00\xdbff\xe000", true);
+  expectRes(encodeUtf16(u8"\ud7ff\xed\xbf\xbf\xed\xa0\x80\ue000"),
+      u"\xd7ff\xdfff\xd800\xe000", true);
+  expectRes(encodeUtf16(u8"\ud7ff\xed\xb0\x80\xed\xbf\xbf\ue000"),
+      u"\xd7ff\xdc00\xdfff\xe000", true);
+  expectRes(encodeUtf16(u8"f\xed\xa0\x80"), u"f\xd800", true);
+  expectRes(encodeUtf16(u8"f\xed\xa0\x80x"), u"f\xd800x", true);
+  expectRes(encodeUtf16(u8"f\xed\xa0\x80\xed\xa0\x80x"), u"f\xd800\xd800x", true);
+  // However, if successive UTF-8 codepoints decode to a proper surrogate pair, the second
+  // surrogate is replaced with the Unicode replacement character to avoid creating valid UTF-16.
+  expectRes(encodeUtf16(u8"\ud7ff\xed\xa0\x80\xed\xbf\xbf\ue000"),
+      u"\xd7ff\xd800\xfffd\xe000", true);
+  expectRes(encodeUtf16(u8"\ud7ff\xed\xaf\xbf\xed\xb0\x80\ue000"),
+      u"\xd7ff\xdbff\xfffd\xe000", true);
 }
 KJ_TEST("encode UTF-8 to UTF-32") {
@@ -169,12 +196,15 @@ KJ_TEST("decode UTF-16 to UTF-8") {
 KJ_TEST("invalid UTF-16 to UTF-8") {
  // Surrogates in wrong order.
-  expectRes(decodeUtf16(u"\xd7ff\xdc00\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true);
+  expectRes(decodeUtf16(u"\xd7ff\xdc00\xdbff\xe000"),
+      u8"\ud7ff\xed\xb0\x80\xed\xaf\xbf\ue000", true);
+  expectRes(decodeUtf16(u"\xd7ff\xdfff\xd800\xe000"),
+      u8"\ud7ff\xed\xbf\xbf\xed\xa0\x80\ue000", true);
  // Missing second surrogate.
-  expectRes(decodeUtf16(u"f\xd800"), u8"f\ufffd", true);
+  expectRes(decodeUtf16(u"f\xd800"), u8"f\xed\xa0\x80", true);
-  expectRes(decodeUtf16(u"f\xd800x"), u8"f\ufffdx", true);
+  expectRes(decodeUtf16(u"f\xd800x"), u8"f\xed\xa0\x80x", true);
-  expectRes(decodeUtf16(u"f\xd800\xd800x"), u8"f\ufffd\ufffdx", true);
+  expectRes(decodeUtf16(u"f\xd800\xd800x"), u8"f\xed\xa0\x80\xed\xa0\x80x", true);
 }
 KJ_TEST("decode UTF-32 to UTF-8") {
@@ -186,10 +216,19 @@ KJ_TEST("decode UTF-32 to UTF-8") {
 KJ_TEST("invalid UTF-32 to UTF-8") {
  // Surrogates rejected.
-  expectRes(decodeUtf32(U"\xd7ff\xdc00\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true);
+  expectRes(decodeUtf32(U"\xd7ff\xdfff\xd800\xe000"),
+      u8"\ud7ff\xed\xbf\xbf\xed\xa0\x80\ue000", true);
  // Even if it would be a valid surrogate pair in UTF-16.
-  expectRes(decodeUtf32(U"\xd7ff\xd800\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true);
+  expectRes(decodeUtf32(U"\xd7ff\xd800\xdfff\xe000"),
+      u8"\ud7ff\xed\xa0\x80\xed\xbf\xbf\ue000", true);
+}
+KJ_TEST("round-trip invalid UTF-16") {
+  const char16_t INVALID[] = u"\xdfff foo \xd800\xdc00 bar \xdc00\xd800 baz \xdbff qux \xd800";
+  expectRes(encodeUtf16(decodeUtf16(INVALID)), INVALID, true);
+  expectRes(encodeUtf16(decodeUtf32(encodeUtf32(decodeUtf16(INVALID)))), INVALID, true);
 }
 KJ_TEST("EncodingResult as a Maybe") {

--- a/c++/src/kj/encoding.c++
+++ b/c++/src/kj/encoding.c++
@@ -79,8 +79,23 @@ EncodingResult<Array<T>> encodeUtf(ArrayPtr<const char> text, bool nulTerminate)
      // Disallow overlong sequence.
      GOTO_ERROR_IF(u < 0x0800);
-      // Disallow surrogate pair code points.
+      // Flag surrogate pair code points as errors, but allow them through.
-      GOTO_ERROR_IF((u & 0xf800) == 0xd800);
+      if (KJ_UNLIKELY((u & 0xf800) == 0xd800)) {
+        if (result.size() > 0 &&
+            (u & 0xfc00) == 0xdc00 &&
+            (result.back() & 0xfc00) == 0xd800) {
+          // Whoops, the *previous* character was also an invalid surrogate, and if we add this
+          // one too, they'll form a valid surrogate pair. If we allowed this, then it would mean
+          // invalid UTF-8 round-tripped to UTF-16 and back could actually change meaning entirely.
+          // OTOH, the reason we allow dangling surrogates is to allow invalid UTF-16 to round-trip
+          // to UTF-8 without loss, but if the original UTF-16 had a valid surrogate pair, it would
+          // have been encoded as a valid single UTF-8 codepoint, not as separate UTF-8 codepoints
+          // for each surrogate.
+          goto error;
+        }
+        hadErrors = true;
+      }
      result.add(u);
      continue;
@@ -153,9 +168,12 @@ EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) {
    } else if ((u & 0xf800) == 0xd800) {
      // surrogate pair
      char16_t u2;
-      GOTO_ERROR_IF(i == utf16.size()                       // missing second half
+      if (KJ_UNLIKELY(i == utf16.size()                         // missing second half
                   || (u & 0x0400) != 0                         // first half in wrong range
-                 || ((u2 = utf16[i]) & 0xfc00) != 0xdc00);  // second half in wrong range
+                   || ((u2 = utf16[i]) & 0xfc00) != 0xdc00)) {  // second half in wrong range
+        hadErrors = true;
+        goto threeByte;
+      }
      ++i;
      char32_t u32 = (((u & 0x03ff) << 10) | (u2 & 0x03ff)) + 0x10000;
@@ -167,6 +185,7 @@ EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) {
      });
      continue;
    } else {
+    threeByte:
      result.addAll<std::initializer_list<char>>({
        static_cast<char>(((u >> 12)       ) | 0xe0),
        static_cast<char>(((u >>  6) & 0x3f) | 0x80),
@@ -174,10 +193,6 @@ EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) {
      });
      continue;
    }
-  error:
-    result.addAll(StringPtr(u8"\ufffd"));
-    hadErrors = true;
  }
  result.add(0);
@@ -202,7 +217,10 @@ EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf16) {
      });
      continue;
    } else if (u < 0x10000) {
-      GOTO_ERROR_IF((u & 0xfffff800) == 0xd800);  // no surrogates allowed in utf-32
+      if (KJ_UNLIKELY((u & 0xfffff800) == 0xd800)) {
+        // no surrogates allowed in utf-32
+        hadErrors = true;
+      }
      result.addAll<std::initializer_list<char>>({
        static_cast<char>(((u >> 12)       ) | 0xe0),
        static_cast<char>(((u >>  6) & 0x3f) | 0x80),

--- a/c++/src/kj/encoding.h
+++ b/c++/src/kj/encoding.h
@@ -52,17 +52,24 @@ struct EncodingResult: public ResultType {
  const bool hadErrors;
 };
+template <typename T>
+inline auto KJ_STRINGIFY(const EncodingResult<T>& value)
+    -> decltype(toCharSequence(implicitCast<const T&>(value))) {
+  return toCharSequence(implicitCast<const T&>(value));
+}
 EncodingResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false);
 EncodingResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false);
 // Convert UTF-8 text (which KJ strings use) to UTF-16 or UTF-32.
 //
 // If `nulTerminate` is true, an extra NUL character will be added to the end of the output.
 //
-// The `try` versions return null if the input is invalid; the non-`try` versions return data
-// containing the Unicode replacement character (U+FFFD).
-//
 // The returned arrays are in platform-native endianness (otherwise they wouldn't really be
 // char16_t / char32_t).
+//
+// Note that the KJ Unicode encoding and decoding functions actually implement
+// [WTF-8 encoding](http://simonsapin.github.io/wtf-8/), which affects how invalid input is
+// handled. See comments on decodeUtf16() for more info.
 EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16);
 EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32);
@@ -71,10 +78,34 @@ EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32);
 // The input should NOT include a NUL terminator; any NUL characters in the input array will be
 // preserved in the output.
 //
-// The `try` versions return null if the input is invalid; the non-`try` versions return data
-// containing the Unicode replacement character (U+FFFD).
-//
 // The input must be in platform-native endianness. BOMs are NOT recognized by these functions.
+//
+// Note that the KJ Unicode encoding and decoding functions actually implement
+// [WTF-8 encoding](http://simonsapin.github.io/wtf-8/). This means that if you start with an array
+// of char16_t and you pass it through any number of conversions to other Unicode encodings,
+// eventually returning it to UTF-16, all the while ignoring `hadErrors`, you will end up with
+// exactly the same char16_t array you started with, *even if* the array is not valid UTF-16. This
+// is useful because many real-world systems that were designed for UCS-2 (plain 16-bit Unicode)
+// and later "upgraded" to UTF-16 do not enforce that their UTF-16 is well-formed. For example,
+// file names on Windows NT are encoded using 16-bit characters, without enforcing that the
+// character sequence is valid UTF-16. It is important that programs on Windows be able to handle
+// such filenames, even if they choose to convert the name to UTF-8 for internal processing.
+//
+// Specifically, KJ's Unicode handling allows unpaired surrogate code points to round-trip through
+// UTF-8 and UTF-32. Unpaired surrogates will be flagged as an error (setting `hadErrors` in the
+// result) but, unlike other erroneous sequences, will NOT be replaced with the Unicode replacement
+// character; instead, each is encoded as an (invalid) surrogate code point in the target encoding.
+//
+// KJ makes the following guarantees about invalid input:
+// - A round trip from UTF-16 to other encodings and back will produce exactly the original input,
+//   with every leg of the trip raising the `hadErrors` flag if the original input was not valid.
+// - A round trip from UTF-8 or UTF-32 to other encodings and back will either produce exactly
+//   the original input, or will have replaced some invalid sequences with the Unicode replacement
+//   character, U+FFFD. No code units will ever be removed unless they are replaced with U+FFFD,
+//   and no code units will ever be added except to encode U+FFFD. If the original input was not
+//   valid, the `hadErrors` flag will be raised on the first leg of the trip, and will also be
+//   raised on subsequent legs unless all invalid sequences were replaced with U+FFFD (which, after
+//   all, is a valid code point).
 String encodeHex(ArrayPtr<const byte> bytes);
 EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text);