Merge pull request #483 from sandstorm-io/kj-encoding

Add KJ utility functions to encode/decode blobs in common formats.

Merge pull request #483 from sandstorm-io/kj-encoding
Add KJ utility functions to encode/decode blobs in common formats.
c45bd150 · Kenton Varda · GitHub · 35ffb4cd · 745049be · c45bd150
Commit c45bd150 authored May 23, 2017 by Kenton Varda Committed by GitHub May 23, 2017
9 changed files
--- a/c++/Makefile.am
+++ b/c++/Makefile.am
@@ -128,6 +128,7 @@ includekj_HEADERS =                                            \
  src/kj/vector.h                                              \
  src/kj/string.h                                              \
  src/kj/string-tree.h                                         \
+  src/kj/encoding.h                                            \
  src/kj/exception.h                                           \
  src/kj/debug.h                                               \
  src/kj/arena.h                                               \
@@ -218,6 +219,7 @@ libkj_la_SOURCES=                                              \
  src/kj/array.c++                                             \
  src/kj/string.c++                                            \
  src/kj/string-tree.c++                                       \
+  src/kj/encoding.c++                                          \
  src/kj/exception.c++                                         \
  src/kj/debug.c++                                             \
  src/kj/arena.c++                                             \
@@ -451,6 +453,7 @@ capnp_test_SOURCES =                                           \
  src/kj/array-test.c++                                        \
  src/kj/string-test.c++                                       \
  src/kj/string-tree-test.c++                                  \
+  src/kj/encoding-test.c++                                     \
  src/kj/exception-test.c++                                    \
  src/kj/debug-test.c++                                        \
  src/kj/arena-test.c++                                        \

--- a/c++/src/capnp/compat/json.h
+++ b/c++/src/capnp/compat/json.h
@@ -193,6 +193,20 @@ public:
  void addFieldHandler(StructSchema::Field field, Handler<T>& handler);
  // Matches only the specific field. T can be a dynamic type. T must match the field's type.

+  // ---------------------------------------------------------------------------
+  // Hack to support string literal parameters
+
+  // decode() overload for char-array (string literal) input. `size` is the full array length, so
+  // the array is re-wrapped as an ArrayPtr of `size - 1` elements -- leaving off the last element,
+  // which for a string literal is the NUL terminator -- and forwarded, together with `params`,
+  // to another decode() overload.
+  template <size_t size, typename... Params>
+  auto decode(const char (&input)[size], Params&&... params) const
+      -> decltype(decode(kj::arrayPtr(input, size), kj::fwd<Params>(params)...)) {
+    // The decltype above is written with `size` rather than `size - 1`; only the type of that
+    // expression is used there, the length actually forwarded is the one below.
+    return decode(kj::arrayPtr(input, size - 1), kj::fwd<Params>(params)...);
+  }
+  // decodeRaw() overload for char-array (string literal) input. As with the decode() overload,
+  // the last array element (a literal's NUL terminator) is left off before forwarding the text
+  // and `params` to another decodeRaw() overload.
+  template <size_t size, typename... Params>
+  auto decodeRaw(const char (&input)[size], Params&&... params) const
+      -> decltype(decodeRaw(kj::arrayPtr(input, size), kj::fwd<Params>(params)...)) {
+    // `size` in the decltype vs. `size - 1` here: only the expression's type matters up there.
+    return decodeRaw(kj::arrayPtr(input, size - 1), kj::fwd<Params>(params)...);
+  }
+
 private:
  class HandlerBase;
  struct Impl;

--- a/c++/src/capnp/compiler/node-translator.c++
+++ b/c++/src/capnp/compiler/node-translator.c++
@@ -24,6 +24,7 @@
 #include <capnp/serialize.h>
 #include <kj/debug.h>
 #include <kj/arena.h>
+#include <kj/encoding.h>
 #include <set>
 #include <map>
 #include <stdlib.h>
@@ -2408,36 +2409,7 @@ uint64_t NodeTranslator::compileParamList(
 static const char HEXDIGITS[] = "0123456789abcdef";

 static kj::StringTree stringLiteral(kj::StringPtr chars) {
-  // TODO(cleanup): This code keeps coming up. Put somewhere common?
-
-  kj::Vector<char> escaped(chars.size());
-
-  for (char c: chars) {
-    switch (c) {
-      case '\a': escaped.addAll(kj::StringPtr("\\a")); break;
-      case '\b': escaped.addAll(kj::StringPtr("\\b")); break;
-      case '\f': escaped.addAll(kj::StringPtr("\\f")); break;
-      case '\n': escaped.addAll(kj::StringPtr("\\n")); break;
-      case '\r': escaped.addAll(kj::StringPtr("\\r")); break;
-      case '\t': escaped.addAll(kj::StringPtr("\\t")); break;
-      case '\v': escaped.addAll(kj::StringPtr("\\v")); break;
-      case '\'': escaped.addAll(kj::StringPtr("\\\'")); break;
-      case '\"': escaped.addAll(kj::StringPtr("\\\"")); break;
-      case '\\': escaped.addAll(kj::StringPtr("\\\\")); break;
-      default:
-        if (c < 0x20) {
-          escaped.add('\\');
-          escaped.add('x');
-          uint8_t c2 = c;
-          escaped.add(HEXDIGITS[c2 / 16]);
-          escaped.add(HEXDIGITS[c2 % 16]);
-        } else {
-          escaped.add(c);
-        }
-        break;
-    }
-  }
-  return kj::strTree('"', escaped, '"');
+  return kj::strTree('"', kj::encodeCEscape(chars), '"');
 }

 static kj::StringTree binaryLiteral(Data::Reader data) {

--- a/c++/src/capnp/stringify.c++
+++ b/c++/src/capnp/stringify.c++
@@ -22,13 +22,12 @@
 #include "dynamic.h"
 #include <kj/debug.h>
 #include <kj/vector.h>
+#include <kj/encoding.h>

 namespace capnp {

 namespace {

-static const char HEXDIGITS[] = "0123456789abcdef";
-
 enum PrintMode {
  BARE,
  // The value is planned to be printed on its own line, unless it is very short and contains
@@ -150,34 +149,7 @@ static kj::StringTree print(const DynamicValue::Reader& value,
        chars = value.as<Text>();
      }

-      kj::Vector<char> escaped(chars.size());
-
-      for (char c: chars) {
-        switch (c) {
-          case '\a': escaped.addAll(kj::StringPtr("\\a")); break;
-          case '\b': escaped.addAll(kj::StringPtr("\\b")); break;
-          case '\f': escaped.addAll(kj::StringPtr("\\f")); break;
-          case '\n': escaped.addAll(kj::StringPtr("\\n")); break;
-          case '\r': escaped.addAll(kj::StringPtr("\\r")); break;
-          case '\t': escaped.addAll(kj::StringPtr("\\t")); break;
-          case '\v': escaped.addAll(kj::StringPtr("\\v")); break;
-          case '\'': escaped.addAll(kj::StringPtr("\\\'")); break;
-          case '\"': escaped.addAll(kj::StringPtr("\\\"")); break;
-          case '\\': escaped.addAll(kj::StringPtr("\\\\")); break;
-          default:
-            if (c < 0x20) {
-              escaped.add('\\');
-              escaped.add('x');
-              uint8_t c2 = c;
-              escaped.add(HEXDIGITS[c2 / 16]);
-              escaped.add(HEXDIGITS[c2 % 16]);
-            } else {
-              escaped.add(c);
-            }
-            break;
-        }
-      }
-      return kj::strTree('"', escaped, '"');
+      return kj::strTree('"', kj::encodeCEscape(chars), '"');
    }
    case DynamicValue::LIST: {
      auto listValue = value.as<DynamicList>();

--- a/c++/src/kj/CMakeLists.txt
+++ b/c++/src/kj/CMakeLists.txt
@@ -19,6 +19,7 @@ set(kj_sources_heavy
  units.c++
  refcount.c++
  string-tree.c++
+  encoding.c++
  parse/char.c++
 )
 if(NOT CAPNP_LITE)
@@ -36,6 +37,7 @@ set(kj_headers
  vector.h
  string.h
  string-tree.h
+  encoding.h
  exception.h
  debug.h
  arena.h
@@ -170,6 +172,7 @@ if(BUILD_TESTING)
      async-io-test.c++
      refcount-test.c++
      string-tree-test.c++
+      encoding-test.c++
      arena-test.c++
      units-test.c++
      tuple-test.c++

--- a/c++/src/kj/common.h
+++ b/c++/src/kj/common.h
@@ -455,6 +455,10 @@ T refIfLvalue(T&&);
 //     KJ_DECLTYPE_REF(i) i3(i);                  // i3 has type int&.
 //     KJ_DECLTYPE_REF(kj::mv(i)) i4(kj::mv(i));  // i4 has type int.

+// isSameType<T, U>() is a constexpr predicate that is true when T and U are the same type and
+// false otherwise. IsSameType_ is its helper trait: the primary template yields false and the
+// partial specialization for two identical arguments yields true.
+template <typename T, typename U> struct IsSameType_ { static constexpr bool value = false; };
+template <typename T> struct IsSameType_<T, T> { static constexpr bool value = true; };
+template <typename T, typename U> constexpr bool isSameType() { return IsSameType_<T, U>::value; }
+
 template <typename T>
 struct CanConvert_ {
  static int sfinae(T);
@@ -911,7 +915,6 @@ public:
    return value;
  }

-private:  // internal interface used by friends only
  inline NullableValue() noexcept: isSet(false) {}
  inline NullableValue(T&& t) noexcept(noexcept(T(instance<T&&>())))
      : isSet(true) {
@@ -1244,8 +1247,31 @@ public:
      : ptr(init.begin()), size_(init.size()) {}

  template <size_t size>
-  inline constexpr ArrayPtr(T (&native)[size]): ptr(native), size_(size) {}
-  // Construct an ArrayPtr from a native C-style array.
+  inline constexpr ArrayPtr(T (&native)[size]): ptr(native), size_(size) {
+    // Construct an ArrayPtr from a native C-style array.
+    //
+    // We disable this constructor for const char arrays because otherwise you would be able to
+    // implicitly convert a string literal to ArrayPtr<const char>, which sounds really great,
+    // except that the NUL terminator would be included, which probably isn't what you intended.
+    //
+    // TODO(someday): Maybe we should support string literals but explicitly chop off the NUL
+    //   terminator. This could do the wrong thing if someone tries to construct an
+    //   ArrayPtr<const char> from a non-NUL-terminated char array, but evidence suggests that all
+    //   real use cases are in fact intending to remove the NUL terminator. It's convenient to be
+    //   able to specify ArrayPtr<const char> as a parameter type and be able to accept strings
+    //   as input in addition to arrays. Currently, you'll need overloading to support string
+    //   literals in this case, but if you overload StringPtr, then you'll find that several
+    //   conversions (e.g. from String and from a literal char array) become ambiguous! You end up
+    //   having to overload for literal char arrays specifically which is cumbersome.
+
+    static_assert(!isSameType<T, const char>(),
+        "Can't implicitly convert literal char array to ArrayPtr because we don't know if "
+        "you meant to include the NUL terminator. We may change this in the future to "
+        "automatically drop the NUL terminator. For now, try explicitly converting to StringPtr, "
+        "which can in turn implicitly convert to ArrayPtr<const char>.");
+    static_assert(!isSameType<T, const char16_t>(), "see above");
+    static_assert(!isSameType<T, const char32_t>(), "see above");
+  }

  inline operator ArrayPtr<const T>() const {
    return ArrayPtr<const T>(ptr, size_);

--- a/c++/src/kj/encoding-test.c++
+++ b/c++/src/kj/encoding-test.c++
--- a/c++/src/kj/encoding.c++
+++ b/c++/src/kj/encoding.c++
--- a/c++/src/kj/encoding.h
+++ b/c++/src/kj/encoding.h
+// Copyright (c) 2017 Cloudflare, Inc. and contributors
+// Licensed under the MIT License:
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef KJ_ENCODING_H_
+#define KJ_ENCODING_H_
+// Functions for encoding/decoding bytes and text in common formats, including:
+// - UTF-{8,16,32}
+// - Hex
+// - URI encoding
+// - C-style escapes
+// - Base64
+
+#if defined(__GNUC__) && !KJ_HEADER_WARNINGS
+#pragma GCC system_header
+#endif
+
+#include "string.h"
+
+namespace kj {
+
+template <typename ResultType>
+struct EncodingResult: public ResultType {
+  // Equivalent to ResultType (a String or wide-char array) for all intents and purposes, except
+  // that the bool `hadErrors` can be inspected to see if any errors were encountered in the input.
+  // Each encoding/decoding function that returns this type will "work around" errors in some way,
+  // so an application doesn't strictly have to check for errors. E.g. the Unicode functions
+  // replace errors with U+FFFD in the output.
+  //
+  // Through magic, KJ_IF_MAYBE() and KJ_{REQUIRE,ASSERT}_NONNULL() work on EncodingResult<T>
+  // exactly as if it were a Maybe<T> that is null in case of errors.
+
+  inline EncodingResult(ResultType&& result, bool hadErrors)
+      : ResultType(kj::mv(result)), hadErrors(hadErrors) {}
+  // Moves `result` into the ResultType base and records the error flag alongside it.
+
+  const bool hadErrors;
+  // True if any errors were encountered in the input. Set once at construction.
+};
+
+EncodingResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false);
+EncodingResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false);
+// Convert UTF-8 text (which KJ strings use) to UTF-16 or UTF-32.
+//
+// If `nulTerminate` is true, an extra NUL character will be added to the end of the output.
+//
+// If the input is invalid, the result's `hadErrors` flag is set and the invalid sequences are
+// replaced with the Unicode replacement character (U+FFFD).
+//
+// The returned arrays are in platform-native endianness (otherwise they wouldn't really be
+// char16_t / char32_t).
+
+EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16);
+EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32);
+// Convert UTF-16 or UTF-32 to UTF-8 (which KJ strings use).
+//
+// The input should NOT include a NUL terminator; any NUL characters in the input array will be
+// preserved in the output.
+//
+// If the input is invalid, the result's `hadErrors` flag is set and the invalid sequences are
+// replaced with the Unicode replacement character (U+FFFD).
+//
+// The input must be in platform-native endianness. BOMs are NOT recognized by these functions.
+
+String encodeHex(ArrayPtr<const byte> bytes);
+EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text);
+// Encode/decode bytes as hex strings.
+
+String encodeUriComponent(ArrayPtr<const byte> bytes);
+String encodeUriComponent(ArrayPtr<const char> bytes);
+EncodingResult<Array<byte>> decodeBinaryUriComponent(
+    ArrayPtr<const char> text, bool nulTerminate = false);
+EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text);
+// Encode/decode URI components using % escapes. See Javascript's encodeURIComponent().
+
+String encodeCEscape(ArrayPtr<const byte> bytes);
+String encodeCEscape(ArrayPtr<const char> bytes);
+EncodingResult<Array<byte>> decodeBinaryCEscape(
+    ArrayPtr<const char> text, bool nulTerminate = false);
+EncodingResult<String> decodeCEscape(ArrayPtr<const char> text);
+// Encode/decode bytes or text using C-style backslash escapes. (Presumably the escape set is the
+// one accepted in C string literals -- confirm against encoding.c++.)
+
+String encodeBase64(ArrayPtr<const byte> bytes, bool breakLines = false);
+// Encode the given bytes as base64 text. If `breakLines` is true, line breaks will be inserted
+// into the output every 72 characters (e.g. for encoding e-mail bodies).
+
+Array<byte> decodeBase64(ArrayPtr<const char> text);
+// Decode base64 text. Non-base64 characters are ignored and padding characters are not required;
+// as such, this function never fails.
+
+// =======================================================================================
+// inline implementation details
+
+namespace _ {  // private
+
+template <typename T>
+NullableValue<T> readMaybe(EncodingResult<T>&& value) {
+  // Rvalue flavor: yields the moved-out result when the input was clean, or null when errors
+  // were recorded. (Presumably the hook that lets KJ_IF_MAYBE() and friends accept an
+  // EncodingResult -- confirm against the macro definitions.)
+  if (!value.hadErrors) {
+    return kj::mv(value);
+  }
+  return nullptr;
+}
+
+template <typename T>
+T* readMaybe(EncodingResult<T>& value) {
+  // Mutable-lvalue flavor: a pointer to the result (seen as its T base) when the input was
+  // clean, or null when errors were recorded.
+  return value.hadErrors ? nullptr : static_cast<T*>(&value);
+}
+
+template <typename T>
+const T* readMaybe(const EncodingResult<T>& value) {
+  // Const-lvalue flavor: a pointer to the result (seen as its T base) when the input was clean,
+  // or null when errors were recorded.
+  return value.hadErrors ? nullptr : static_cast<const T*>(&value);
+}
+
+}  // namespace _ (private)
+
+inline String encodeUriComponent(ArrayPtr<const char> text) {
+  // Text flavor: reinterpret the chars via asBytes() and defer to the encodeUriComponent()
+  // overload that accepts the resulting byte view.
+  auto rawBytes = text.asBytes();
+  return encodeUriComponent(rawBytes);
+}
+inline EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text) {
+  // Decode to bytes with nulTerminate = true, then hand that buffer over to a String
+  // (presumably String requires the trailing NUL -- confirm), carrying the error flag across.
+  auto decoded = decodeBinaryUriComponent(text, true);
+  String asText(decoded.releaseAsChars());
+  return EncodingResult<String>(kj::mv(asText), decoded.hadErrors);
+}
+
+inline String encodeCEscape(ArrayPtr<const char> text) {
+  // Text flavor: reinterpret the chars via asBytes() and defer to the encodeCEscape() overload
+  // that accepts the resulting byte view.
+  auto rawBytes = text.asBytes();
+  return encodeCEscape(rawBytes);
+}
+inline EncodingResult<String> decodeCEscape(ArrayPtr<const char> text) {
+  // Decode to bytes with nulTerminate = true, then hand that buffer over to a String
+  // (presumably String requires the trailing NUL -- confirm), carrying the error flag across.
+  auto decoded = decodeBinaryCEscape(text, true);
+  String asText(decoded.releaseAsChars());
+  return EncodingResult<String>(kj::mv(asText), decoded.hadErrors);
+}
+
+// If you pass a string literal to a function taking ArrayPtr<const char>, it'll include the NUL
+// terminator, which is surprising. Let's add overloads that avoid that. In practice this probably
+// only really matters for encoding-test.c++.
+
+template <size_t s>
+inline EncodingResult<Array<char16_t>> encodeUtf16(const char (&text)[s], bool nulTerminate=false) {
+  // String-literal flavor: `s` counts the literal's NUL terminator, so only the first s - 1
+  // chars are passed on to the ArrayPtr-based encodeUtf16().
+  auto withoutNul = arrayPtr(text, s - 1);
+  return encodeUtf16(withoutNul, nulTerminate);
+}
+template <size_t s>
+inline EncodingResult<Array<char32_t>> encodeUtf32(const char (&text)[s], bool nulTerminate=false) {
+  // String-literal flavor: `s` counts the literal's NUL terminator, so only the first s - 1
+  // chars are passed on to the ArrayPtr-based encodeUtf32().
+  auto withoutNul = arrayPtr(text, s - 1);
+  return encodeUtf32(withoutNul, nulTerminate);
+}
+template <size_t s>
+inline EncodingResult<String> decodeUtf16(const char16_t (&utf16)[s]) {
+  // Literal flavor: leave off the last array element (a u"" literal's NUL terminator) before
+  // passing the code units on to the ArrayPtr-based decodeUtf16().
+  auto withoutNul = arrayPtr(utf16, s - 1);
+  return decodeUtf16(withoutNul);
+}
+template <size_t s>
+inline EncodingResult<String> decodeUtf32(const char32_t (&utf32)[s]) {
+  // Literal flavor: leave off the last array element (a U"" literal's NUL terminator) before
+  // passing the code points on to the ArrayPtr-based decodeUtf32().
+  auto withoutNul = arrayPtr(utf32, s - 1);
+  return decodeUtf32(withoutNul);
+}
+template <size_t s>
+inline EncodingResult<Array<byte>> decodeHex(const char (&text)[s]) {
+  // String-literal flavor: drop the literal's NUL terminator, then defer to the ArrayPtr-based
+  // decodeHex().
+  auto withoutNul = arrayPtr(text, s - 1);
+  return decodeHex(withoutNul);
+}
+template <size_t s>
+inline String encodeUriComponent(const char (&text)[s]) {
+  // String-literal flavor: drop the literal's NUL terminator, then defer to the
+  // ArrayPtr<const char> overload of encodeUriComponent().
+  auto withoutNul = arrayPtr(text, s - 1);
+  return encodeUriComponent(withoutNul);
+}
+template <size_t s>
+inline EncodingResult<Array<byte>> decodeBinaryUriComponent(const char (&text)[s]) {
+  // String-literal flavor: drop the literal's NUL terminator, then defer to the ArrayPtr-based
+  // decodeBinaryUriComponent().
+  //
+  // The return type is EncodingResult<Array<byte>>, matching the function being forwarded to.
+  // Returning a plain Array<byte> here would slice off `hadErrors`, so callers passing a literal
+  // could not detect malformed input. EncodingResult<Array<byte>> still converts to Array<byte>,
+  // so existing callers are unaffected.
+  return decodeBinaryUriComponent(arrayPtr(text, s - 1));
+}
+template <size_t s>
+inline EncodingResult<String> decodeUriComponent(const char (&text)[s]) {
+  // String-literal flavor: drop the literal's NUL terminator, then defer to the
+  // ArrayPtr<const char> overload of decodeUriComponent().
+  auto withoutNul = arrayPtr(text, s - 1);
+  return decodeUriComponent(withoutNul);
+}
+template <size_t s>
+inline String encodeCEscape(const char (&text)[s]) {
+  // String-literal flavor: drop the literal's NUL terminator, then defer to the
+  // ArrayPtr<const char> overload of encodeCEscape().
+  auto withoutNul = arrayPtr(text, s - 1);
+  return encodeCEscape(withoutNul);
+}
+template <size_t s>
+inline EncodingResult<Array<byte>> decodeBinaryCEscape(const char (&text)[s]) {
+  // String-literal flavor: drop the literal's NUL terminator, then defer to the ArrayPtr-based
+  // decodeBinaryCEscape() (with its default nulTerminate).
+  auto withoutNul = arrayPtr(text, s - 1);
+  return decodeBinaryCEscape(withoutNul);
+}
+template <size_t s>
+inline EncodingResult<String> decodeCEscape(const char (&text)[s]) {
+  // String-literal flavor: drop the literal's NUL terminator, then defer to the
+  // ArrayPtr<const char> overload of decodeCEscape().
+  auto withoutNul = arrayPtr(text, s - 1);
+  return decodeCEscape(withoutNul);
+}
+template <size_t s>
+inline Array<byte> decodeBase64(const char (&text)[s]) {
+  // String-literal flavor: drop the literal's NUL terminator, then defer to the ArrayPtr-based
+  // decodeBase64(). Marked `inline` like every other literal overload in this header.
+  return decodeBase64(arrayPtr(text, s - 1));
+}
+
+} // namespace kj
+
+#endif // KJ_ENCODING_H_