Add KJ utility functions to encode/decode blobs in common formats.

In particular: UTF-{8,16,32}, Hex, URI encoding, and Base64

Add KJ utility functions to encode/decode blobs in common formats.
In particular: UTF-{8,16,32}, Hex, URI encoding, and Base64
f74555b4 · Kenton Varda · 97aae1bb · f74555b4 · f74555b4 · f74555b4
Commit f74555b4 authored May 22, 2017 by Kenton Varda
5 changed files
--- a/c++/Makefile.am
+++ b/c++/Makefile.am
@@ -128,6 +128,7 @@ includekj_HEADERS =                                            \
  src/kj/vector.h                                              \
  src/kj/string.h                                              \
  src/kj/string-tree.h                                         \
+  src/kj/encoding.h                                            \
  src/kj/exception.h                                           \
  src/kj/debug.h                                               \
  src/kj/arena.h                                               \
@@ -218,6 +219,7 @@ libkj_la_SOURCES=                                              \
  src/kj/array.c++                                             \
  src/kj/string.c++                                            \
  src/kj/string-tree.c++                                       \
+  src/kj/encoding.c++                                          \
  src/kj/exception.c++                                         \
  src/kj/debug.c++                                             \
  src/kj/arena.c++                                             \
@@ -451,6 +453,7 @@ capnp_test_SOURCES =                                           \
  src/kj/array-test.c++                                        \
  src/kj/string-test.c++                                       \
  src/kj/string-tree-test.c++                                  \
+  src/kj/encoding-test.c++                                     \
  src/kj/exception-test.c++                                    \
  src/kj/debug-test.c++                                        \
  src/kj/arena-test.c++                                        \

--- a/c++/src/kj/CMakeLists.txt
+++ b/c++/src/kj/CMakeLists.txt
@@ -19,6 +19,7 @@ set(kj_sources_heavy
  units.c++
  refcount.c++
  string-tree.c++
+  encoding.c++
  parse/char.c++
 )
 if(NOT CAPNP_LITE)
@@ -36,6 +37,7 @@ set(kj_headers
  vector.h
  string.h
  string-tree.h
+  encoding.h
  exception.h
  debug.h
  arena.h
@@ -170,6 +172,7 @@ if(BUILD_TESTING)
      async-io-test.c++
      refcount-test.c++
      string-tree-test.c++
+      encoding-test.c++
      arena-test.c++
      units-test.c++
      tuple-test.c++

--- a/c++/src/kj/encoding-test.c++
+++ b/c++/src/kj/encoding-test.c++
+// Copyright (c) 2017 Cloudflare, Inc. and contributors
+// Licensed under the MIT License:
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "encoding.h"
+#include <kj/test.h>
+#include <stdint.h>
+
+namespace kj {
+namespace {
+
+CappedArray<char, sizeof(char    ) * 2 + 1> hex(char     i) { return kj::hex((uint8_t )i); }
+CappedArray<char, sizeof(char16_t) * 2 + 1> hex(char16_t i) { return kj::hex((uint16_t)i); }
+CappedArray<char, sizeof(char32_t) * 2 + 1> hex(char32_t i) { return kj::hex((uint32_t)i); }
+// Hexify chars correctly.
+//
+// Each overload casts the character to the unsigned integer type of the same width (uint8_t,
+// uint16_t, uint32_t) and forwards to kj::hex(). expectUtf() uses them to print mismatching
+// code units.
+//
+// TODO(cleanup): Should this go into string.h with the other definitions of hex()?
+
+template <typename T>
+void expectUtf(UtfResult<T> result,
+               ArrayPtr<const Decay<decltype(result[0])>> expected,
+               bool errors = false) {
+  // Test helper: checks that `result.hadErrors` matches `errors` and that `result` has the same
+  // size and the same elements as `expected`.
+  if (errors) {
+    KJ_EXPECT(result.hadErrors);
+  } else {
+    KJ_EXPECT(!result.hadErrors);
+  }
+
+  KJ_EXPECT(result.size() == expected.size(), result.size(), expected.size());
+  // Only the common prefix is compared element-wise, so a size mismatch (reported above) cannot
+  // lead to indexing past the end of the shorter sequence.
+  for (auto i: kj::zeroTo(kj::min(result.size(), expected.size()))) {
+    KJ_EXPECT(result[i] == expected[i], i, hex(result[i]), hex(expected[i]));
+  }
+}
+
+template <typename T, size_t s>
+void expectUtf(UtfResult<T> result,
+               const Decay<decltype(result[0])> (&expected)[s],
+               bool errors = false) {
+  // Overload taking a string literal (array of s elements): forwards to the ArrayPtr overload
+  // with the last element -- the literal's NUL terminator -- excluded (`s - 1`).
+  expectUtf(kj::mv(result), arrayPtr(expected, s - 1), errors);
+}
+
+KJ_TEST("encode UTF-8 to UTF-16") {
+  // Well-formed input (ASCII, Cyrillic, CJK, and emoji, some of which lie outside the BMP) must
+  // transcode to the equivalent UTF-16 literal without reporting errors.
+  expectUtf(encodeUtf16(u8"foo"), u"foo");
+  expectUtf(encodeUtf16(u8"Здравствуйте"), u"Здравствуйте");
+  expectUtf(encodeUtf16(u8"中国网络"), u"中国网络");
+  expectUtf(encodeUtf16(u8"😺☁☄🐵"), u"😺☁☄🐵");
+}
+
+KJ_TEST("invalid UTF-8 to UTF-16") {
+  // Each malformed sequence is expected to yield a single U+FFFD and set hadErrors (third
+  // argument `true`); the interleaved `false` cases pin the boundaries of what is still valid.
+
+  // Disembodied continuation byte.
+  expectUtf(encodeUtf16("\x80"), u"\ufffd", true);
+  expectUtf(encodeUtf16("f\xbfo"), u"f\ufffdo", true);
+  expectUtf(encodeUtf16("f\xbf\x80\xb0o"), u"f\ufffdo", true);
+
+  // Missing continuation bytes.
+  expectUtf(encodeUtf16("\xc2x"), u"\ufffdx", true);
+  expectUtf(encodeUtf16("\xe0x"), u"\ufffdx", true);
+  expectUtf(encodeUtf16("\xe0\xa0x"), u"\ufffdx", true);
+  expectUtf(encodeUtf16("\xf0x"), u"\ufffdx", true);
+  expectUtf(encodeUtf16("\xf0\x90x"), u"\ufffdx", true);
+  expectUtf(encodeUtf16("\xf0\x90\x80x"), u"\ufffdx", true);
+
+  // Overlong sequences.
+  expectUtf(encodeUtf16("\xc0\x80"), u"\ufffd", true);
+  expectUtf(encodeUtf16("\xc1\xbf"), u"\ufffd", true);
+  expectUtf(encodeUtf16("\xc2\x80"), u"\u0080", false);
+  expectUtf(encodeUtf16("\xdf\xbf"), u"\u07ff", false);
+
+  expectUtf(encodeUtf16("\xe0\x80\x80"), u"\ufffd", true);
+  expectUtf(encodeUtf16("\xe0\x9f\xbf"), u"\ufffd", true);
+  expectUtf(encodeUtf16("\xe0\xa0\x80"), u"\u0800", false);
+  expectUtf(encodeUtf16("\xef\xbf\xbf"), u"\uffff", false);
+
+  expectUtf(encodeUtf16("\xf0\x80\x80\x80"), u"\ufffd", true);
+  expectUtf(encodeUtf16("\xf0\x8f\xbf\xbf"), u"\ufffd", true);
+  expectUtf(encodeUtf16("\xf0\x90\x80\x80"), u"\U00010000", false);
+  expectUtf(encodeUtf16("\xf4\x8f\xbf\xbf"), u"\U0010ffff", false);
+
+  // Out of Unicode range.
+  expectUtf(encodeUtf16("\xf5\x80\x80\x80"), u"\ufffd", true);
+  expectUtf(encodeUtf16("\xf8\xbf\x80\x80\x80"), u"\ufffd", true);
+  expectUtf(encodeUtf16("\xfc\xbf\x80\x80\x80\x80"), u"\ufffd", true);
+  expectUtf(encodeUtf16("\xfe\xbf\x80\x80\x80\x80\x80"), u"\ufffd", true);
+  expectUtf(encodeUtf16("\xff\xbf\x80\x80\x80\x80\x80\x80"), u"\ufffd", true);
+}
+
+KJ_TEST("encode UTF-8 to UTF-32") {
+  // Same well-formed inputs as the UTF-16 test, compared against UTF-32 literals.
+  expectUtf(encodeUtf32(u8"foo"), U"foo");
+  expectUtf(encodeUtf32(u8"Здравствуйте"), U"Здравствуйте");
+  expectUtf(encodeUtf32(u8"中国网络"), U"中国网络");
+  expectUtf(encodeUtf32(u8"😺☁☄🐵"), U"😺☁☄🐵");
+}
+
+KJ_TEST("invalid UTF-8 to UTF-32") {
+  // Mirrors the invalid UTF-8 -> UTF-16 cases: each malformed sequence must become one U+FFFD
+  // with hadErrors set, while the boundary cases marked `false` must decode cleanly.
+
+  // Disembodied continuation byte.
+  expectUtf(encodeUtf32("\x80"), U"\ufffd", true);
+  expectUtf(encodeUtf32("f\xbfo"), U"f\ufffdo", true);
+  expectUtf(encodeUtf32("f\xbf\x80\xb0o"), U"f\ufffdo", true);
+
+  // Missing continuation bytes.
+  expectUtf(encodeUtf32("\xc2x"), U"\ufffdx", true);
+  expectUtf(encodeUtf32("\xe0x"), U"\ufffdx", true);
+  expectUtf(encodeUtf32("\xe0\xa0x"), U"\ufffdx", true);
+  expectUtf(encodeUtf32("\xf0x"), U"\ufffdx", true);
+  expectUtf(encodeUtf32("\xf0\x90x"), U"\ufffdx", true);
+  expectUtf(encodeUtf32("\xf0\x90\x80x"), U"\ufffdx", true);
+
+  // Overlong sequences.
+  expectUtf(encodeUtf32("\xc0\x80"), U"\ufffd", true);
+  expectUtf(encodeUtf32("\xc1\xbf"), U"\ufffd", true);
+  expectUtf(encodeUtf32("\xc2\x80"), U"\u0080", false);
+  expectUtf(encodeUtf32("\xdf\xbf"), U"\u07ff", false);
+
+  expectUtf(encodeUtf32("\xe0\x80\x80"), U"\ufffd", true);
+  expectUtf(encodeUtf32("\xe0\x9f\xbf"), U"\ufffd", true);
+  expectUtf(encodeUtf32("\xe0\xa0\x80"), U"\u0800", false);
+  expectUtf(encodeUtf32("\xef\xbf\xbf"), U"\uffff", false);
+
+  expectUtf(encodeUtf32("\xf0\x80\x80\x80"), U"\ufffd", true);
+  expectUtf(encodeUtf32("\xf0\x8f\xbf\xbf"), U"\ufffd", true);
+  expectUtf(encodeUtf32("\xf0\x90\x80\x80"), U"\U00010000", false);
+  expectUtf(encodeUtf32("\xf4\x8f\xbf\xbf"), U"\U0010ffff", false);
+
+  // Out of Unicode range.
+  expectUtf(encodeUtf32("\xf5\x80\x80\x80"), U"\ufffd", true);
+  expectUtf(encodeUtf32("\xf8\xbf\x80\x80\x80"), U"\ufffd", true);
+  expectUtf(encodeUtf32("\xfc\xbf\x80\x80\x80\x80"), U"\ufffd", true);
+  expectUtf(encodeUtf32("\xfe\xbf\x80\x80\x80\x80\x80"), U"\ufffd", true);
+  expectUtf(encodeUtf32("\xff\xbf\x80\x80\x80\x80\x80\x80"), U"\ufffd", true);
+}
+
+KJ_TEST("decode UTF-16 to UTF-8") {
+  // Well-formed UTF-16 literals must convert to the equivalent UTF-8 literal without errors.
+  expectUtf(decodeUtf16(u"foo"), u8"foo");
+  expectUtf(decodeUtf16(u"Здравствуйте"), u8"Здравствуйте");
+  expectUtf(decodeUtf16(u"中国网络"), u8"中国网络");
+  expectUtf(decodeUtf16(u"😺☁☄🐵"), u8"😺☁☄🐵");
+}
+
+KJ_TEST("invalid UTF-16 to UTF-8") {
+  // Unpaired or misordered surrogates must each become U+FFFD and set hadErrors; the
+  // neighbouring non-surrogate code units (U+D7FF, U+E000, 'f', 'x') must pass through intact.
+
+  // Surrogates in wrong order.
+  expectUtf(decodeUtf16(u"\xd7ff\xdc00\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true);
+
+  // Missing second surrogate.
+  expectUtf(decodeUtf16(u"f\xd800"), u8"f\ufffd", true);
+  expectUtf(decodeUtf16(u"f\xd800x"), u8"f\ufffdx", true);
+  expectUtf(decodeUtf16(u"f\xd800\xd800x"), u8"f\ufffd\ufffdx", true);
+}
+
+KJ_TEST("decode UTF-32 to UTF-8") {
+  // Well-formed UTF-32 literals must convert to the equivalent UTF-8 literal without errors.
+  expectUtf(decodeUtf32(U"foo"), u8"foo");
+  expectUtf(decodeUtf32(U"Здравствуйте"), u8"Здравствуйте");
+  expectUtf(decodeUtf32(U"中国网络"), u8"中国网络");
+  expectUtf(decodeUtf32(U"😺☁☄🐵"), u8"😺☁☄🐵");
+}
+
+KJ_TEST("invalid UTF-32 to UTF-8") {
+  // Any code point in the surrogate range is an error in UTF-32 input; each one must be
+  // replaced individually by U+FFFD.
+
+  // Surrogates rejected.
+  expectUtf(decodeUtf32(U"\xd7ff\xdc00\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true);
+
+  // Even if it would be a valid surrogate pair in UTF-16.
+  expectUtf(decodeUtf32(U"\xd7ff\xd800\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true);
+}
+
+KJ_TEST("tryEncode / tryDecode") {
+  // The try* variants must return nullptr for input that contains an error and the converted
+  // value otherwise.
+  KJ_EXPECT(tryEncodeUtf16("\x80") == nullptr);
+  KJ_EXPECT(ArrayPtr<const char16_t>(KJ_ASSERT_NONNULL(tryEncodeUtf16("foo")))
+         == arrayPtr(u"foo", 3));
+
+  KJ_EXPECT(tryEncodeUtf32("\x80") == nullptr);
+  KJ_EXPECT(ArrayPtr<const char32_t>(KJ_ASSERT_NONNULL(tryEncodeUtf32("foo")))
+         == arrayPtr(U"foo", 3));
+
+  KJ_EXPECT(tryDecodeUtf16(u"\xd800") == nullptr);
+  KJ_EXPECT(KJ_ASSERT_NONNULL(tryDecodeUtf16(u"foo")) == "foo");
+  KJ_EXPECT(tryDecodeUtf32(U"\xd800") == nullptr);
+  KJ_EXPECT(KJ_ASSERT_NONNULL(tryDecodeUtf32(U"foo")) == "foo");
+}
+
+// =======================================================================================
+
+KJ_TEST("hex encoding/decoding") {
+  // Round trip: four bytes <-> eight lowercase hex characters.
+  byte bytes[] = {0x12, 0x34, 0xab, 0xf2};
+
+  KJ_EXPECT(encodeHex(bytes) == "1234abf2");
+  KJ_EXPECT(decodeHex("1234abf2").asPtr() == bytes);
+}
+
+KJ_TEST("URI encoding/decoding") {
+  // Encoding: letters pass through; space, high bytes and NUL become lowercase %xx escapes.
+  KJ_EXPECT(encodeUriComponent("foo") == "foo");
+  KJ_EXPECT(encodeUriComponent("foo bar") == "foo%20bar");
+  KJ_EXPECT(encodeUriComponent("\xab\xba") == "%ab%ba");
+  KJ_EXPECT(encodeUriComponent(StringPtr("foo\0bar", 7)) == "foo%00bar");
+
+  // Decoding accepts both lowercase and uppercase hex digits in escapes.
+  KJ_EXPECT(decodeUriComponent("foo%20bar") == "foo bar");
+  KJ_EXPECT(decodeUriComponent("%ab%BA") == "\xab\xba");
+
+  // Binary round trip through the byte-oriented overloads.
+  byte bytes[] = {12, 34, 56};
+  KJ_EXPECT(decodeBinaryUriComponent(encodeUriComponent(bytes)).asPtr() == bytes);
+}
+
+KJ_TEST("base64 encoding/decoding") {
+  // Input whose length is a multiple of three: no padding.
+  {
+    auto encoded = encodeBase64(StringPtr("foo").asBytes(), false);
+    KJ_EXPECT(encoded == "Zm9v", encoded, encoded.size());
+    KJ_EXPECT(heapString(decodeBase64(encoded.asArray()).asChars()) == "foo");
+  }
+
+  // Five input bytes: one '=' of padding.
+  {
+    auto encoded = encodeBase64(StringPtr("corge").asBytes(), false);
+    KJ_EXPECT(encoded == "Y29yZ2U=", encoded);
+    KJ_EXPECT(heapString(decodeBase64(encoded.asArray()).asChars()) == "corge");
+  }
+
+  // The decoder is expected to tolerate missing padding and to skip characters outside the
+  // base64 alphabet (newlines, spaces, '@').
+  KJ_EXPECT(heapString(decodeBase64("Y29yZ2U").asChars()) == "corge");
+  KJ_EXPECT(heapString(decodeBase64("Y\n29y Z@2U=\n").asChars()) == "corge");
+
+  // With breakLines == true a partial line is terminated by a newline.
+  {
+    auto encoded = encodeBase64(StringPtr("corge").asBytes(), true);
+    KJ_EXPECT(encoded == "Y29yZ2U=\n", encoded);
+  }
+
+  // 54 input bytes encode to exactly 72 base64 characters, i.e. one full output line.
+  StringPtr fullLine = "012345678901234567890123456789012345678901234567890123";
+  {
+    auto encoded = encodeBase64(fullLine.asBytes(), false);
+    KJ_EXPECT(
+        encoded == "MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIz",
+        encoded);
+  }
+  {
+    auto encoded = encodeBase64(fullLine.asBytes(), true);
+    KJ_EXPECT(
+        encoded == "MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIz\n",
+        encoded);
+  }
+
+  // Three more bytes spill onto a second line when line breaking is enabled.
+  String multiLine = str(fullLine, "456");
+  {
+    auto encoded = encodeBase64(multiLine.asBytes(), false);
+    KJ_EXPECT(
+        encoded == "MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2",
+        encoded);
+  }
+  {
+    auto encoded = encodeBase64(multiLine.asBytes(), true);
+    KJ_EXPECT(
+        encoded == "MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIz\n"
+                   "NDU2\n",
+        encoded);
+  }
+}
+
+}  // namespace
+}  // namespace kj
--- a/c++/src/kj/encoding.c++
+++ b/c++/src/kj/encoding.c++
+// Copyright (c) 2017 Cloudflare, Inc.; Sandstorm Development Group, Inc.; and contributors
+// Licensed under the MIT License:
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "encoding.h"
+#include "vector.h"
+#include "debug.h"
+
+namespace kj {
+
+namespace {
+
+// Jumps to the `error` label of the enclosing function when `cond` holds. The condition is wrapped
+// in KJ_UNLIKELY. Only usable inside a function that defines an `error:` label.
+#define GOTO_ERROR_IF(cond) if (KJ_UNLIKELY(cond)) goto error
+
+inline void addChar32(Vector<char16_t>& vec, char32_t u) {
+  // Appends `u` to `vec` as a UTF-16 surrogate pair: the code point is rebased to start at
+  // U+10000, its upper bits form the high surrogate and its low 10 bits the low surrogate.
+  const char32_t rebased = u - 0x10000;
+  const auto highSurrogate = 0xd800 | (rebased >> 10);
+  const auto lowSurrogate = 0xdc00 | (rebased & 0x03ff);
+  vec.add(highSurrogate);
+  vec.add(lowSurrogate);
+}
+
+inline void addChar32(Vector<char32_t>& vec, char32_t u) {
+  // UTF-32 stores the code point directly; no surrogate encoding is needed.
+  vec.add(u);
+}
+
+template <typename T>
+UtfResult<Array<T>> encodeUtf(ArrayPtr<const char> text, bool nulTerminate) {
+  // Shared implementation of encodeUtf16() / encodeUtf32(): decodes the UTF-8 bytes in `text` and
+  // appends the resulting code points to a Vector<T>. Every malformed sequence is replaced by a
+  // single U+FFFD and causes `hadErrors` to be set in the returned result. If `nulTerminate` is
+  // true, a trailing 0 element is appended.
+  Vector<T> result(text.size() + nulTerminate);
+  bool hadErrors = false;
+
+  size_t i = 0;
+  while (i < text.size()) {
+    byte c = text[i++];
+    if (c < 0x80) {
+      // 0xxxxxxx -- ASCII
+      result.add(c);
+      continue;
+    } else if (KJ_UNLIKELY(c < 0xc0)) {
+      // 10xxxxxx -- malformed continuation byte
+      goto error;
+    } else if (c < 0xe0) {
+      // 110xxxxx -- 2-byte
+      byte c2;
+      // A continuation byte (10xxxxxx) must follow; it is consumed only if it is valid.
+      GOTO_ERROR_IF(i == text.size() || ((c2 = text[i]) & 0xc0) != 0x80); ++i;
+      char16_t u = (static_cast<char16_t>(c  & 0x1f) <<  6)
+                 | (static_cast<char16_t>(c2 & 0x3f)      );
+
+      // Disallow overlong sequence.
+      GOTO_ERROR_IF(u < 0x80);
+
+      result.add(u);
+      continue;
+    } else if (c < 0xf0) {
+      // 1110xxxx -- 3-byte
+      byte c2, c3;
+      GOTO_ERROR_IF(i == text.size() || ((c2 = text[i]) & 0xc0) != 0x80); ++i;
+      GOTO_ERROR_IF(i == text.size() || ((c3 = text[i]) & 0xc0) != 0x80); ++i;
+      char16_t u = (static_cast<char16_t>(c  & 0x0f) << 12)
+                 | (static_cast<char16_t>(c2 & 0x3f) <<  6)
+                 | (static_cast<char16_t>(c3 & 0x3f)      );
+
+      // Disallow overlong sequence.
+      GOTO_ERROR_IF(u < 0x0800);
+
+      // Disallow surrogate pair code points.
+      GOTO_ERROR_IF((u & 0xf800) == 0xd800);
+
+      result.add(u);
+      continue;
+    } else if (c < 0xf8) {
+      // 11110xxx -- 4-byte
+      byte c2, c3, c4;
+      GOTO_ERROR_IF(i == text.size() || ((c2 = text[i]) & 0xc0) != 0x80); ++i;
+      GOTO_ERROR_IF(i == text.size() || ((c3 = text[i]) & 0xc0) != 0x80); ++i;
+      GOTO_ERROR_IF(i == text.size() || ((c4 = text[i]) & 0xc0) != 0x80); ++i;
+      char32_t u = (static_cast<char32_t>(c  & 0x07) << 18)
+                 | (static_cast<char32_t>(c2 & 0x3f) << 12)
+                 | (static_cast<char32_t>(c3 & 0x3f) <<  6)
+                 | (static_cast<char32_t>(c4 & 0x3f)      );
+
+      // Disallow overlong sequence.
+      GOTO_ERROR_IF(u < 0x10000);
+
+      // Unicode ends at U+10FFFF
+      GOTO_ERROR_IF(u >= 0x110000);
+
+      // The overload chosen by T decides between a surrogate pair and a single code unit.
+      addChar32(result, u);
+      continue;
+    } else {
+      // 5-byte and 6-byte sequences are not legal as they'd result in codepoints outside the
+      // range of Unicode.
+      goto error;
+    }
+
+    // Reached only via goto: every successful branch above ends in `continue`.
+  error:
+    result.add(0xfffd);
+    hadErrors = true;
+    // Ignore all continuation bytes.
+    while (i < text.size() && (text[i] & 0xc0) == 0x80) {
+      ++i;
+    }
+  }
+
+  if (nulTerminate) result.add(0);
+
+  return { result.releaseAsArray(), hadErrors };
+}
+
+}  // namespace
+
+UtfResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate) {
+  // Converts UTF-8 `text` to UTF-16 by instantiating the shared encodeUtf() template.
+  return encodeUtf<char16_t>(text, nulTerminate);
+}
+
+UtfResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate) {
+  // Converts UTF-8 `text` to UTF-32 by instantiating the shared encodeUtf() template.
+  return encodeUtf<char32_t>(text, nulTerminate);
+}
+
+Maybe<Array<char16_t>> tryEncodeUtf16(ArrayPtr<const char> text, bool nulTerminate) {
+  // Strict variant of encodeUtf16(): returns nullptr when the input contained any error instead
+  // of returning output with replacement characters.
+  auto encoded = encodeUtf16(text, nulTerminate);
+  if (!encoded.hadErrors) {
+    return kj::mv(encoded);
+  }
+  return nullptr;
+}
+
+Maybe<Array<char32_t>> tryEncodeUtf32(ArrayPtr<const char> text, bool nulTerminate) {
+  // Strict variant of encodeUtf32(): returns nullptr when the input contained any error instead
+  // of returning output with replacement characters.
+  auto encoded = encodeUtf32(text, nulTerminate);
+  if (!encoded.hadErrors) {
+    return kj::mv(encoded);
+  }
+  return nullptr;
+}
+
+UtfResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) {
+  // Converts UTF-16 code units to a NUL-terminated UTF-8 String. An unpaired or misordered
+  // surrogate is replaced by U+FFFD and sets `hadErrors` in the returned result.
+  Vector<char> result(utf16.size() + 1);
+  bool hadErrors = false;
+
+  size_t i = 0;
+  while (i < utf16.size()) {
+    char16_t u = utf16[i++];
+
+    if (u < 0x80) {
+      // ASCII: one byte.
+      result.add(u);
+      continue;
+    } else if (u < 0x0800) {
+      // Two-byte UTF-8 sequence.
+      result.addAll<std::initializer_list<char>>({
+        static_cast<char>(((u >>  6)       ) | 0xc0),
+        static_cast<char>(((u      ) & 0x3f) | 0x80)
+      });
+      continue;
+    } else if ((u & 0xf800) == 0xd800) {
+      // surrogate pair
+      char16_t u2;
+      // On failure `i` is not advanced, so the offending second unit is processed again on the
+      // next iteration as the start of a new character.
+      GOTO_ERROR_IF(i == utf16.size()                       // missing second half
+                 || (u & 0x0400) != 0                       // first half in wrong range
+                 || ((u2 = utf16[i]) & 0xfc00) != 0xdc00);  // second half in wrong range
+      ++i;
+
+      // Combine the two 10-bit halves and re-add the 0x10000 offset; emit four UTF-8 bytes.
+      char32_t u32 = (((u & 0x03ff) << 10) | (u2 & 0x03ff)) + 0x10000;
+      result.addAll<std::initializer_list<char>>({
+        static_cast<char>(((u32 >> 18)       ) | 0xf0),
+        static_cast<char>(((u32 >> 12) & 0x3f) | 0x80),
+        static_cast<char>(((u32 >>  6) & 0x3f) | 0x80),
+        static_cast<char>(((u32      ) & 0x3f) | 0x80)
+      });
+      continue;
+    } else {
+      // Remaining BMP code points: three-byte UTF-8 sequence.
+      result.addAll<std::initializer_list<char>>({
+        static_cast<char>(((u >> 12)       ) | 0xe0),
+        static_cast<char>(((u >>  6) & 0x3f) | 0x80),
+        static_cast<char>(((u      ) & 0x3f) | 0x80)
+      });
+      continue;
+    }
+
+    // Reached only via goto: every successful branch above ends in `continue`.
+  error:
+    result.addAll(StringPtr(u8"\ufffd"));
+    hadErrors = true;
+  }
+
+  result.add(0);
+  return { String(result.releaseAsArray()), hadErrors };
+}
+
+UtfResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32) {
+  // Converts UTF-32 code points to a NUL-terminated UTF-8 String. Code points in the surrogate
+  // range or above U+10FFFF are replaced by U+FFFD and set `hadErrors` in the returned result.
+  Vector<char> result(utf32.size() + 1);
+  bool hadErrors = false;
+
+  size_t i = 0;
+  while (i < utf32.size()) {
+    char32_t u = utf32[i++];
+
+    if (u < 0x80) {
+      // ASCII: one byte.
+      result.add(u);
+      continue;
+    } else if (u < 0x0800) {
+      // Two-byte UTF-8 sequence.
+      result.addAll<std::initializer_list<char>>({
+        static_cast<char>(((u >>  6)       ) | 0xc0),
+        static_cast<char>(((u      ) & 0x3f) | 0x80)
+      });
+      continue;
+    } else if (u < 0x10000) {
+      // Rest of the BMP: three-byte UTF-8 sequence.
+      GOTO_ERROR_IF((u & 0xfffff800) == 0xd800);  // no surrogates allowed in utf-32
+      result.addAll<std::initializer_list<char>>({
+        static_cast<char>(((u >> 12)       ) | 0xe0),
+        static_cast<char>(((u >>  6) & 0x3f) | 0x80),
+        static_cast<char>(((u      ) & 0x3f) | 0x80)
+      });
+      continue;
+    } else {
+      // Supplementary planes: four-byte UTF-8 sequence.
+      GOTO_ERROR_IF(u >= 0x110000);  // outside Unicode range
+      result.addAll<std::initializer_list<char>>({
+        static_cast<char>(((u >> 18)       ) | 0xf0),
+        static_cast<char>(((u >> 12) & 0x3f) | 0x80),
+        static_cast<char>(((u >>  6) & 0x3f) | 0x80),
+        static_cast<char>(((u      ) & 0x3f) | 0x80)
+      });
+      continue;
+    }
+
+    // Reached only via goto: every successful branch above ends in `continue`.
+  error:
+    result.addAll(StringPtr(u8"\ufffd"));
+    hadErrors = true;
+  }
+
+  result.add(0);
+  return { String(result.releaseAsArray()), hadErrors };
+}
+
+Maybe<String> tryDecodeUtf16(ArrayPtr<const char16_t> utf16) {
+  // Strict variant of decodeUtf16(): returns nullptr when the input contained any error.
+  auto decoded = decodeUtf16(utf16);
+  if (!decoded.hadErrors) {
+    return kj::mv(decoded);
+  }
+  return nullptr;
+}
+Maybe<String> tryDecodeUtf32(ArrayPtr<const char32_t> utf32) {
+  // Strict variant of decodeUtf32(): returns nullptr when the input contained any error.
+  auto decoded = decodeUtf32(utf32);
+  if (!decoded.hadErrors) {
+    return kj::mv(decoded);
+  }
+  return nullptr;
+}
+
+// =======================================================================================
+
+namespace {
+  // Lowercase hex digit for each nibble value 0..15; used by encodeHex() and
+  // encodeUriComponent().
+  const char HEX_DIGITS[] = "0123456789abcdef";
+}
+
+String encodeHex(ArrayPtr<const byte> input) {
+  // Returns the lowercase hexadecimal representation of `input`, two characters per byte.
+  //
+  // The output is written directly into one pre-sized string instead of being joined from a
+  // separate heap-allocated two-character array per input byte.
+  auto result = heapString(input.size() * 2);
+  char* out = result.begin();
+  for (byte b: input) {
+    *out++ = HEX_DIGITS[b >> 4];    // high nibble
+    *out++ = HEX_DIGITS[b & 0x0f];  // low nibble
+  }
+  return result;
+}
+
+static uint fromDigit(char c) {
+  // Returns the value (0..15) of the hexadecimal digit `c`, accepting both upper and lower case.
+  // Any character that is not a hex digit yields 0, so the result always fits in one nibble;
+  // letters beyond 'f'/'F' must not be accepted or they would produce values above 15 and corrupt
+  // the neighbouring nibble when callers combine two digits into a byte.
+  if ('0' <= c && c <= '9') {
+    return c - '0';
+  } else if ('a' <= c && c <= 'f') {
+    return c - ('a' - 10);
+  } else if ('A' <= c && c <= 'F') {
+    return c - ('A' - 10);
+  } else {
+    return 0;
+  }
+}
+
+Array<byte> decodeHex(ArrayPtr<const char> text) {
+  // Decodes pairs of hex digits into bytes. With an odd number of characters the final,
+  // unpaired character is ignored.
+  const size_t byteCount = text.size() / 2;
+  auto decoded = heapArray<byte>(byteCount);
+
+  for (size_t n = 0; n < byteCount; ++n) {
+    const uint high = fromDigit(text[2 * n]);
+    const uint low = fromDigit(text[2 * n + 1]);
+    decoded[n] = (high << 4) | low;
+  }
+
+  return decoded;
+}
+
+String encodeUriComponent(ArrayPtr<const byte> bytes) {
+  // Percent-encodes `bytes`. ASCII letters, digits and the marks - _ . ! ~ * ' ( ) are copied
+  // unchanged; every other byte becomes '%' followed by two lowercase hex digits.
+  Vector<char> escaped(bytes.size() + 1);
+  for (byte b: bytes) {
+    bool passThrough =
+        ('A' <= b && b <= 'Z') || ('a' <= b && b <= 'z') || ('0' <= b && b <= '9');
+    if (!passThrough) {
+      switch (b) {
+        case '-': case '_': case '.': case '!': case '~':
+        case '*': case '\'': case '(': case ')':
+          passThrough = true;
+          break;
+        default:
+          break;
+      }
+    }
+
+    if (passThrough) {
+      escaped.add(b);
+      continue;
+    }
+    escaped.add('%');
+    escaped.add(HEX_DIGITS[b/16]);
+    escaped.add(HEX_DIGITS[b%16]);
+  }
+  // String requires a NUL terminator at the end of its backing array.
+  escaped.add('\0');
+  return String(escaped.releaseAsArray());
+}
+
+Array<byte> decodeBinaryUriComponent(ArrayPtr<const char> text, bool nulTerminate) {
+  // Reverses percent-encoding: "%xy" becomes the byte with hex value xy, all other characters
+  // are copied as-is. A '%' with fewer than two characters after it ends decoding, dropping the
+  // incomplete escape. If `nulTerminate` is true a trailing 0 byte is appended.
+  Vector<byte> decoded(text.size() + nulTerminate);
+
+  const size_t length = text.size();
+  size_t pos = 0;
+  while (pos < length) {
+    const char c = text[pos++];
+    if (c != '%') {
+      decoded.add(c);
+      continue;
+    }
+
+    // Need both hex digits of the escape; otherwise stop.
+    if (length - pos < 2) break;
+    byte value = fromDigit(text[pos]) << 4;
+    value |= fromDigit(text[pos + 1]);
+    pos += 2;
+    decoded.add(value);
+  }
+
+  if (nulTerminate) decoded.add(0);
+  return decoded.releaseAsArray();
+}
+
+// =======================================================================================
+// This code is derived from libb64 which has been placed in the public domain.
+// For details, see http://sourceforge.net/projects/libb64
+
+// -------------------------------------------------------------------
+// Encoder
+
+namespace {
+
+// Position within the current 3-byte input group: step_A expects the first byte, step_B the
+// second, step_C the third.
+typedef enum {
+  step_A, step_B, step_C
+} base64_encodestep;
+
+// Encoder state carried between base64_encode_block() and base64_encode_blockend().
+typedef struct {
+  base64_encodestep step;  // where in the 3-byte group encoding stopped
+  char result;             // leftover bits of the last input byte, not yet emitted
+  int stepcount;           // 4-character groups emitted on the current output line
+} base64_encodestate;
+
+// Number of base64 characters per output line when line breaking is enabled.
+const int CHARS_PER_LINE = 72;
+
+void base64_init_encodestate(base64_encodestate* state_in) {
+  // Resets the encoder: start of a 3-byte group, no pending bits, empty output line.
+  base64_encodestate& state = *state_in;
+  state.stepcount = 0;
+  state.result = 0;
+  state.step = step_A;
+}
+
+char base64_encode_value(char value_in) {
+  // Maps a 6-bit value (0..63) to its character in the base64 alphabet. Values above 63 map to
+  // the padding character '='.
+  static const char* alphabet =
+      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+  return value_in > 63 ? '=' : alphabet[(int)value_in];
+}
+
+int base64_encode_block(const char* plaintext_in, int length_in,
+                        char* code_out, base64_encodestate* state_in, bool breakLines) {
+  // Encodes `length_in` bytes from `plaintext_in` into base64 characters written to `code_out`
+  // and returns the number of characters written. Bits of an incomplete trailing 3-byte group are
+  // saved in `state_in` so that a later call, or base64_encode_blockend(), can finish the group.
+  // When `breakLines` is true a '\n' is emitted after every CHARS_PER_LINE characters.
+  const char* plainchar = plaintext_in;
+  const char* const plaintextend = plaintext_in + length_in;
+  char* codechar = code_out;
+  char result;
+  char fragment;
+
+  result = state_in->result;
+
+  // The switch jumps into the middle of the loop so that encoding resumes at the step recorded
+  // in `state_in`; each case deliberately falls through into the next one.
+  switch (state_in->step) {
+    while (1) {
+  case step_A:
+      if (plainchar == plaintextend) {
+        state_in->result = result;
+        state_in->step = step_A;
+        return codechar - code_out;
+      }
+      fragment = *plainchar++;
+      // First byte: emit its top 6 bits, keep the low 2 bits for the next character.
+      result = (fragment & 0x0fc) >> 2;
+      *codechar++ = base64_encode_value(result);
+      result = (fragment & 0x003) << 4;
+  case step_B:
+      if (plainchar == plaintextend) {
+        state_in->result = result;
+        state_in->step = step_B;
+        return codechar - code_out;
+      }
+      fragment = *plainchar++;
+      // Second byte: its top 4 bits complete the pending character, low 4 bits are kept.
+      result |= (fragment & 0x0f0) >> 4;
+      *codechar++ = base64_encode_value(result);
+      result = (fragment & 0x00f) << 2;
+  case step_C:
+      if (plainchar == plaintextend) {
+        state_in->result = result;
+        state_in->step = step_C;
+        return codechar - code_out;
+      }
+      fragment = *plainchar++;
+      // Third byte: its top 2 bits complete the pending character, low 6 bits form the last one.
+      result |= (fragment & 0x0c0) >> 6;
+      *codechar++ = base64_encode_value(result);
+      result  = (fragment & 0x03f) >> 0;
+      *codechar++ = base64_encode_value(result);
+
+      // One more 4-character group on this line; break the line once it is full.
+      ++(state_in->stepcount);
+      if (breakLines && state_in->stepcount == CHARS_PER_LINE/4) {
+        *codechar++ = '\n';
+        state_in->stepcount = 0;
+      }
+    }
+  }
+  /* control should not reach here */
+  return codechar - code_out;
+}
+
+int base64_encode_blockend(char* code_out, base64_encodestate* state_in, bool breakLines) {
+  // Finishes the encoding: flushes the bits of an incomplete 3-byte group followed by '='
+  // padding (two after a single leftover byte, one after two leftover bytes) and, when
+  // `breakLines` is set and the current line is non-empty, a final '\n'. Returns the number of
+  // characters written to `code_out`.
+  char* out = code_out;
+
+  const bool oneByteLeft = state_in->step == step_B;
+  const bool twoBytesLeft = state_in->step == step_C;
+  if (oneByteLeft || twoBytesLeft) {
+    *out++ = base64_encode_value(state_in->result);
+    *out++ = '=';
+    if (oneByteLeft) {
+      *out++ = '=';
+    }
+    ++state_in->stepcount;
+  }
+
+  if (breakLines && state_in->stepcount > 0) {
+    *out++ = '\n';
+  }
+
+  return out - code_out;
+}
+
+}  // namespace
+
+String encodeBase64(ArrayPtr<const byte> input, bool breakLines) {
+  // Encodes `input` as base64. With `breakLines` a '\n' follows every CHARS_PER_LINE characters
+  // and also terminates a final partial line.
+
+  // Every started group of 3 input bytes produces 4 output characters.
+  auto numChars = (input.size() + 2) / 3 * 4;
+  if (breakLines) {
+    // One newline per full line, plus one for a trailing partial line.
+    uint lineCount = numChars / CHARS_PER_LINE + (numChars % CHARS_PER_LINE > 0 ? 1 : 0);
+    numChars = numChars + lineCount;
+  }
+  auto output = heapString(numChars);
+
+  base64_encodestate state;
+  base64_init_encodestate(&state);
+
+  // Encode the whole input in one call, then flush the trailing partial group and padding.
+  char* cursor = output.begin();
+  int written = base64_encode_block(
+      (const char *)input.begin(), input.size(), cursor, &state, breakLines);
+  cursor += written;
+  written = base64_encode_blockend(cursor, &state, breakLines);
+  cursor += written;
+
+  // The size computed up front must match what the encoder actually produced.
+  size_t total = cursor - output.begin();
+  KJ_ASSERT(total == output.size(), total, output.size());
+
+  return output;
+}
+
+// -------------------------------------------------------------------
+// Decoder
+
+namespace {
+
+// Position within the current 4-character base64 group: step_a expects the first character,
+// step_d the fourth.
+typedef enum {
+  step_a, step_b, step_c, step_d
+} base64_decodestep;
+
+// Decoder state carried between calls to base64_decode_block().
+typedef struct {
+  base64_decodestep step;  // where in the 4-character group decoding stopped
+  char plainchar;          // partially assembled output byte
+} base64_decodestate;
+
+int base64_decode_value(char value_in) {
+  // Maps one base64 character to its 6-bit value (0..63). Returns -2 for the padding character
+  // '=' and -1 for any other character that is not part of the base64 alphabet.
+  //
+  // The table covers the characters '+' (43) through 'z' (122). It is declared `signed char` so
+  // that the negative sentinels keep their sign on platforms where plain `char` is unsigned.
+  static const signed char decoding[] = {
+    62,-1,-1,-1,63,52,53,54,55,56,57,58,59,60,61,-1,-1,-1,-2,-1,-1,-1,
+    0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,-1,
+    26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51};
+  static const int decoding_size = sizeof(decoding);
+
+  // Compute the index as an int from the unsigned character value so that neither bytes with the
+  // high bit set nor the subtraction can wrap around.
+  const int index = static_cast<unsigned char>(value_in) - 43;
+  // An index equal to the table size (the character '{') is already out of bounds.
+  if (index < 0 || index >= decoding_size) return -1;
+  return decoding[index];
+}
+
+void base64_init_decodestate(base64_decodestate* state_in) {
+  // Resets the decoder: start of a 4-character group, no partially assembled byte.
+  base64_decodestate& state = *state_in;
+  state.plainchar = 0;
+  state.step = step_a;
+}
+
+int base64_decode_block(const char* code_in, const int length_in,
+                        char* plaintext_out, base64_decodestate* state_in) {
+  // Decodes `length_in` base64 characters from `code_in` into `plaintext_out` and returns the
+  // number of complete bytes written. Characters for which base64_decode_value() returns a
+  // negative value (padding and anything outside the alphabet) are skipped. The bits of an
+  // incomplete trailing group are saved in `state_in` so that a later call can continue.
+  //
+  // The partially assembled byte lives in the local `partial` and only complete bytes are stored,
+  // so the function never reads or writes beyond the bytes it reports as decoded. In particular
+  // `plaintext_out` is not touched at all when no complete byte is produced.
+  const char* codechar = code_in;
+  const char* const codeend = code_in + length_in;
+  char* plainchar = plaintext_out;
+  // Must be explicitly signed: the skip loops rely on `fragment < 0`, which could never be true
+  // on platforms where plain `char` is unsigned.
+  signed char fragment;
+  char partial = state_in->plainchar;
+
+  // The switch jumps into the middle of the loop so that decoding resumes at the step recorded
+  // in `state_in`; each case deliberately falls through into the next one.
+  switch (state_in->step)
+  {
+    while (1)
+    {
+  case step_a:
+      do {
+        if (codechar == codeend) {
+          state_in->step = step_a;
+          state_in->plainchar = partial;
+          return plainchar - plaintext_out;
+        }
+        fragment = static_cast<signed char>(base64_decode_value(*codechar++));
+      } while (fragment < 0);
+      // 1st character: top 6 bits of byte 0.
+      partial = (fragment & 0x03f) << 2;
+  case step_b:
+      do {
+        if (codechar == codeend) {
+          state_in->step = step_b;
+          state_in->plainchar = partial;
+          return plainchar - plaintext_out;
+        }
+        fragment = static_cast<signed char>(base64_decode_value(*codechar++));
+      } while (fragment < 0);
+      // 2nd character: low 2 bits of byte 0, top 4 bits of byte 1.
+      *plainchar++ = partial | ((fragment & 0x030) >> 4);
+      partial = (fragment & 0x00f) << 4;
+  case step_c:
+      do {
+        if (codechar == codeend) {
+          state_in->step = step_c;
+          state_in->plainchar = partial;
+          return plainchar - plaintext_out;
+        }
+        fragment = static_cast<signed char>(base64_decode_value(*codechar++));
+      } while (fragment < 0);
+      // 3rd character: low 4 bits of byte 1, top 2 bits of byte 2.
+      *plainchar++ = partial | ((fragment & 0x03c) >> 2);
+      partial = (fragment & 0x003) << 6;
+  case step_d:
+      do {
+        if (codechar == codeend) {
+          state_in->step = step_d;
+          state_in->plainchar = partial;
+          return plainchar - plaintext_out;
+        }
+        fragment = static_cast<signed char>(base64_decode_value(*codechar++));
+      } while (fragment < 0);
+      // 4th character: low 6 bits of byte 2.
+      *plainchar++ = partial | (fragment & 0x03f);
+    }
+  }
+  /* control should not reach here */
+  return plainchar - plaintext_out;
+}
+
+}  // namespace
+
+Array<byte> decodeBase64(ArrayPtr<const char> input) {
+  // Decodes base64 `input` into bytes. The buffer is first sized for the case that every input
+  // character carries 6 bits of data, then trimmed to the number of bytes the decoder reports.
+  base64_decodestate decoder;
+  base64_init_decodestate(&decoder);
+
+  auto buffer = heapArray<byte>((input.size() * 6 + 7) / 8);
+
+  size_t decodedSize = base64_decode_block(input.begin(), input.size(),
+      reinterpret_cast<char*>(buffer.begin()), &decoder);
+
+  if (decodedSize >= buffer.size()) {
+    return buffer;
+  }
+
+  // Fewer bytes than estimated (padding or skipped characters): return an exactly-sized copy.
+  auto trimmed = heapArray<byte>(decodedSize);
+  memcpy(trimmed.begin(), buffer.begin(), decodedSize);
+  return trimmed;
+}
+
+} // namespace kj
--- a/c++/src/kj/encoding.h
+++ b/c++/src/kj/encoding.h
+// Copyright (c) 2017 Cloudflare, Inc. and contributors
+// Licensed under the MIT License:
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef KJ_ENCODING_H_
+#define KJ_ENCODING_H_
+// Functions for encoding/decoding bytes and text in common formats, including:
+// - UTF-{8,16,32}
+// - Hex
+// - URI encoding
+// - Base64
+
+#if defined(__GNUC__) && !KJ_HEADER_WARNINGS
+#pragma GCC system_header
+#endif
+
+#include "string.h"
+
+namespace kj {
+
+template <typename ResultType>
+struct UtfResult: ResultType {
+  // A ResultType (a String or wide-char array) that additionally remembers whether the conversion
+  // which produced it ran into invalid input. Because it derives from ResultType it can be used
+  // just like one; callers that care can inspect `hadErrors`.
+
+  UtfResult(ResultType&& value, bool sawErrors)
+      : ResultType(kj::mv(value)), hadErrors(sawErrors) {}
+  // Takes over the contents of `value` and records the error flag.
+
+  const bool hadErrors;
+  // True if invalid sequences were detected in the input and were replaced with the Unicode
+  // replacement character (U+FFFD) in the output. Many applications will choose to ignore this
+  // flag and carry on with the damaged data.
+};
+
+UtfResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false);
+UtfResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false);
+Maybe<Array<char16_t>> tryEncodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false);
+Maybe<Array<char32_t>> tryEncodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false);
+// Convert UTF-8 text (which KJ strings use) to UTF-16 or UTF-32.
+//
+// If `nulTerminate` is true, an extra NUL character will be added to the end of the output.
+//
+// The `try` versions return null if the input is invalid; the non-`try` versions replace invalid
+// sequences with the Unicode replacement character (U+FFFD) and set `hadErrors` on the result.
+//
+// The returned arrays are in platform-native endianness (otherwise they wouldn't really be
+// char16_t / char32_t).
+
+UtfResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16);
+UtfResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32);
+Maybe<String> tryDecodeUtf16(ArrayPtr<const char16_t> utf16);
+Maybe<String> tryDecodeUtf32(ArrayPtr<const char32_t> utf32);
+// Convert UTF-16 or UTF-32 to UTF-8 (which KJ strings use).
+//
+// The input should NOT include a NUL terminator; any NUL characters in the input array will be
+// preserved in the output.
+//
+// The `try` versions return null if the input is invalid; the non-`try` versions replace invalid
+// sequences with the Unicode replacement character (U+FFFD) and set `hadErrors` on the result.
+//
+// The input must be in platform-native endianness. BOMs are NOT recognized by these functions.
+
+String encodeHex(ArrayPtr<const byte> bytes);
+Array<byte> decodeHex(ArrayPtr<const char> text);
+// Encode bytes as a hex string / decode a hex string back into bytes.
+
+String encodeUriComponent(ArrayPtr<const byte> bytes);
+String encodeUriComponent(ArrayPtr<const char> bytes);  // inline; forwards to the byte overload
+Array<byte> decodeBinaryUriComponent(ArrayPtr<const char> text, bool nulTerminate = false);
+String decodeUriComponent(ArrayPtr<const char> text);  // inline; wraps decodeBinaryUriComponent()
+// Encode/decode URI components using % escapes. See JavaScript's encodeURIComponent().
+
+String encodeBase64(ArrayPtr<const byte> bytes, bool breakLines = false);
+// Encode the given bytes as base64 text. If `breakLines` is true, line breaks will be inserted
+// into the output every 72 characters (e.g. for encoding e-mail bodies). Defaults to false.
+
+Array<byte> decodeBase64(ArrayPtr<const char> text);
+// Decode base64 text. Non-base64 characters are ignored. The result holds only the decoded bytes.
+
+// =======================================================================================
+// inline implementation details
+
+inline String encodeUriComponent(ArrayPtr<const char> text) {
+  // Text is escaped the same way as raw bytes, so view it as bytes and reuse the byte overload.
+  auto raw = text.asBytes();
+  return encodeUriComponent(raw);
+}
+inline String decodeUriComponent(ArrayPtr<const char> text) {
+  // Decode with a NUL terminator appended, then hand the buffer over to a String.
+  auto decoded = decodeBinaryUriComponent(text, true);
+  return String(decoded.releaseAsChars());
+}
+
+// If you pass a string literal to a function taking ArrayPtr<const char>, it'll include the NUL
+// terminator, which is surprising. Let's add overloads that avoid that. In practice this probably
+// only even matters for encoding-test.c++.
+
+template <size_t len>
+inline UtfResult<Array<char16_t>> encodeUtf16(const char (&text)[len],
+                                              bool nulTerminate = false) {
+  // Char-array overload: forward everything except the final element (a literal's NUL).
+  auto withoutNul = arrayPtr(text, len - 1);
+  return encodeUtf16(withoutNul, nulTerminate);
+}
+template <size_t len>
+inline UtfResult<Array<char32_t>> encodeUtf32(const char (&text)[len],
+                                              bool nulTerminate = false) {
+  // Char-array overload: forward everything except the final element (a literal's NUL).
+  auto withoutNul = arrayPtr(text, len - 1);
+  return encodeUtf32(withoutNul, nulTerminate);
+}
+template <size_t len>
+inline Maybe<Array<char16_t>> tryEncodeUtf16(const char (&text)[len],
+                                             bool nulTerminate = false) {
+  // Char-array overload: forward everything except the final element (a literal's NUL).
+  auto withoutNul = arrayPtr(text, len - 1);
+  return tryEncodeUtf16(withoutNul, nulTerminate);
+}
+template <size_t len>
+inline Maybe<Array<char32_t>> tryEncodeUtf32(const char (&text)[len],
+                                             bool nulTerminate = false) {
+  // Char-array overload: forward everything except the final element (a literal's NUL).
+  auto withoutNul = arrayPtr(text, len - 1);
+  return tryEncodeUtf32(withoutNul, nulTerminate);
+}
+template <size_t len>
+inline UtfResult<String> decodeUtf16(const char16_t (&utf16)[len]) {
+  // char16_t-array overload: forward everything except the final element (a literal's NUL).
+  auto withoutNul = arrayPtr(utf16, len - 1);
+  return decodeUtf16(withoutNul);
+}
+template <size_t len>
+inline UtfResult<String> decodeUtf32(const char32_t (&utf32)[len]) {
+  // char32_t-array overload: forward everything except the final element (a literal's NUL).
+  auto withoutNul = arrayPtr(utf32, len - 1);
+  return decodeUtf32(withoutNul);
+}
+template <size_t len>
+inline Maybe<String> tryDecodeUtf16(const char16_t (&utf16)[len]) {
+  // char16_t-array overload: forward everything except the final element (a literal's NUL).
+  auto withoutNul = arrayPtr(utf16, len - 1);
+  return tryDecodeUtf16(withoutNul);
+}
+template <size_t len>
+inline Maybe<String> tryDecodeUtf32(const char32_t (&utf32)[len]) {
+  // char32_t-array overload: forward everything except the final element (a literal's NUL).
+  auto withoutNul = arrayPtr(utf32, len - 1);
+  return tryDecodeUtf32(withoutNul);
+}
+template <size_t len>
+inline Array<byte> decodeHex(const char (&text)[len]) {
+  // Char-array overload: forward everything except the final element (a literal's NUL).
+  auto withoutNul = arrayPtr(text, len - 1);
+  return decodeHex(withoutNul);
+}
+template <size_t len>
+inline String encodeUriComponent(const char (&text)[len]) {
+  // Char-array overload: forward everything except the final element (a literal's NUL).
+  auto withoutNul = arrayPtr(text, len - 1);
+  return encodeUriComponent(withoutNul);
+}
+template <size_t s>
+inline Array<byte> decodeBinaryUriComponent(const char (&text)[s], bool nulTerminate = false) {
+  // Char-array overload: decodes everything except the final element (a literal's NUL).
+  //
+  // `nulTerminate` mirrors the ArrayPtr overload's parameter. Without it, a call such as
+  // decodeBinaryUriComponent("abc", true) could not use this overload and would instead convert
+  // the literal to an ArrayPtr that includes the NUL terminator, decoding it as part of the data.
+  return decodeBinaryUriComponent(arrayPtr(text, s - 1), nulTerminate);
+}
+template <size_t len>
+inline String decodeUriComponent(const char (&text)[len]) {
+  // Char-array overload: drop the final element (a literal's NUL) and let the ArrayPtr overload
+  // do the NUL-terminated decode and String conversion.
+  return decodeUriComponent(arrayPtr(text, len - 1));
+}
+template <size_t s>
+inline Array<byte> decodeBase64(const char (&text)[s]) {
+  // Char-array overload: decodes everything except the final element (a literal's NUL).
+  // Marked `inline` to match the other char-array overloads in this header.
+  return decodeBase64(arrayPtr(text, s - 1));
+}
+
+} // namespace kj
+
+#endif // KJ_ENCODING_H_