Commit c45bd150 authored by Kenton Varda's avatar Kenton Varda Committed by GitHub

Merge pull request #483 from sandstorm-io/kj-encoding

Add KJ utility functions to encode/decode blobs in common formats.
parents 35ffb4cd 745049be
...@@ -128,6 +128,7 @@ includekj_HEADERS = \ ...@@ -128,6 +128,7 @@ includekj_HEADERS = \
src/kj/vector.h \ src/kj/vector.h \
src/kj/string.h \ src/kj/string.h \
src/kj/string-tree.h \ src/kj/string-tree.h \
src/kj/encoding.h \
src/kj/exception.h \ src/kj/exception.h \
src/kj/debug.h \ src/kj/debug.h \
src/kj/arena.h \ src/kj/arena.h \
...@@ -218,6 +219,7 @@ libkj_la_SOURCES= \ ...@@ -218,6 +219,7 @@ libkj_la_SOURCES= \
src/kj/array.c++ \ src/kj/array.c++ \
src/kj/string.c++ \ src/kj/string.c++ \
src/kj/string-tree.c++ \ src/kj/string-tree.c++ \
src/kj/encoding.c++ \
src/kj/exception.c++ \ src/kj/exception.c++ \
src/kj/debug.c++ \ src/kj/debug.c++ \
src/kj/arena.c++ \ src/kj/arena.c++ \
...@@ -451,6 +453,7 @@ capnp_test_SOURCES = \ ...@@ -451,6 +453,7 @@ capnp_test_SOURCES = \
src/kj/array-test.c++ \ src/kj/array-test.c++ \
src/kj/string-test.c++ \ src/kj/string-test.c++ \
src/kj/string-tree-test.c++ \ src/kj/string-tree-test.c++ \
src/kj/encoding-test.c++ \
src/kj/exception-test.c++ \ src/kj/exception-test.c++ \
src/kj/debug-test.c++ \ src/kj/debug-test.c++ \
src/kj/arena-test.c++ \ src/kj/arena-test.c++ \
......
...@@ -193,6 +193,20 @@ public: ...@@ -193,6 +193,20 @@ public:
void addFieldHandler(StructSchema::Field field, Handler<T>& handler); void addFieldHandler(StructSchema::Field field, Handler<T>& handler);
// Matches only the specific field. T can be a dynamic type. T must match the field's type. // Matches only the specific field. T can be a dynamic type. T must match the field's type.
// ---------------------------------------------------------------------------
// Hack to support string literal parameters
// Overload taking a char array (e.g. a string literal) directly. Only the first `size - 1`
// chars are forwarded, i.e. the final array element is dropped -- presumably the literal's NUL
// terminator. Any extra parameters are forwarded untouched to the ArrayPtr-based decode().
// The trailing return type only names the result type of that forwarded overload.
template <size_t size, typename... Params>
auto decode(const char (&input)[size], Params&&... params) const
-> decltype(decode(kj::arrayPtr(input, size), kj::fwd<Params>(params)...)) {
return decode(kj::arrayPtr(input, size - 1), kj::fwd<Params>(params)...);
}
// Overload taking a char array (e.g. a string literal) directly. Only the first `size - 1`
// chars are forwarded, i.e. the final array element is dropped -- presumably the literal's NUL
// terminator. Any extra parameters are forwarded untouched to the ArrayPtr-based decodeRaw().
// The trailing return type only names the result type of that forwarded overload.
template <size_t size, typename... Params>
auto decodeRaw(const char (&input)[size], Params&&... params) const
-> decltype(decodeRaw(kj::arrayPtr(input, size), kj::fwd<Params>(params)...)) {
return decodeRaw(kj::arrayPtr(input, size - 1), kj::fwd<Params>(params)...);
}
private: private:
class HandlerBase; class HandlerBase;
struct Impl; struct Impl;
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include <capnp/serialize.h> #include <capnp/serialize.h>
#include <kj/debug.h> #include <kj/debug.h>
#include <kj/arena.h> #include <kj/arena.h>
#include <kj/encoding.h>
#include <set> #include <set>
#include <map> #include <map>
#include <stdlib.h> #include <stdlib.h>
...@@ -2408,36 +2409,7 @@ uint64_t NodeTranslator::compileParamList( ...@@ -2408,36 +2409,7 @@ uint64_t NodeTranslator::compileParamList(
static const char HEXDIGITS[] = "0123456789abcdef"; static const char HEXDIGITS[] = "0123456789abcdef";
static kj::StringTree stringLiteral(kj::StringPtr chars) { static kj::StringTree stringLiteral(kj::StringPtr chars) {
// TODO(cleanup): This code keeps coming up. Put somewhere common? return kj::strTree('"', kj::encodeCEscape(chars), '"');
kj::Vector<char> escaped(chars.size());
for (char c: chars) {
switch (c) {
case '\a': escaped.addAll(kj::StringPtr("\\a")); break;
case '\b': escaped.addAll(kj::StringPtr("\\b")); break;
case '\f': escaped.addAll(kj::StringPtr("\\f")); break;
case '\n': escaped.addAll(kj::StringPtr("\\n")); break;
case '\r': escaped.addAll(kj::StringPtr("\\r")); break;
case '\t': escaped.addAll(kj::StringPtr("\\t")); break;
case '\v': escaped.addAll(kj::StringPtr("\\v")); break;
case '\'': escaped.addAll(kj::StringPtr("\\\'")); break;
case '\"': escaped.addAll(kj::StringPtr("\\\"")); break;
case '\\': escaped.addAll(kj::StringPtr("\\\\")); break;
default:
if (c < 0x20) {
escaped.add('\\');
escaped.add('x');
uint8_t c2 = c;
escaped.add(HEXDIGITS[c2 / 16]);
escaped.add(HEXDIGITS[c2 % 16]);
} else {
escaped.add(c);
}
break;
}
}
return kj::strTree('"', escaped, '"');
} }
static kj::StringTree binaryLiteral(Data::Reader data) { static kj::StringTree binaryLiteral(Data::Reader data) {
......
...@@ -22,13 +22,12 @@ ...@@ -22,13 +22,12 @@
#include "dynamic.h" #include "dynamic.h"
#include <kj/debug.h> #include <kj/debug.h>
#include <kj/vector.h> #include <kj/vector.h>
#include <kj/encoding.h>
namespace capnp { namespace capnp {
namespace { namespace {
static const char HEXDIGITS[] = "0123456789abcdef";
enum PrintMode { enum PrintMode {
BARE, BARE,
// The value is planned to be printed on its own line, unless it is very short and contains // The value is planned to be printed on its own line, unless it is very short and contains
...@@ -150,34 +149,7 @@ static kj::StringTree print(const DynamicValue::Reader& value, ...@@ -150,34 +149,7 @@ static kj::StringTree print(const DynamicValue::Reader& value,
chars = value.as<Text>(); chars = value.as<Text>();
} }
kj::Vector<char> escaped(chars.size()); return kj::strTree('"', kj::encodeCEscape(chars), '"');
for (char c: chars) {
switch (c) {
case '\a': escaped.addAll(kj::StringPtr("\\a")); break;
case '\b': escaped.addAll(kj::StringPtr("\\b")); break;
case '\f': escaped.addAll(kj::StringPtr("\\f")); break;
case '\n': escaped.addAll(kj::StringPtr("\\n")); break;
case '\r': escaped.addAll(kj::StringPtr("\\r")); break;
case '\t': escaped.addAll(kj::StringPtr("\\t")); break;
case '\v': escaped.addAll(kj::StringPtr("\\v")); break;
case '\'': escaped.addAll(kj::StringPtr("\\\'")); break;
case '\"': escaped.addAll(kj::StringPtr("\\\"")); break;
case '\\': escaped.addAll(kj::StringPtr("\\\\")); break;
default:
if (c < 0x20) {
escaped.add('\\');
escaped.add('x');
uint8_t c2 = c;
escaped.add(HEXDIGITS[c2 / 16]);
escaped.add(HEXDIGITS[c2 % 16]);
} else {
escaped.add(c);
}
break;
}
}
return kj::strTree('"', escaped, '"');
} }
case DynamicValue::LIST: { case DynamicValue::LIST: {
auto listValue = value.as<DynamicList>(); auto listValue = value.as<DynamicList>();
......
...@@ -19,6 +19,7 @@ set(kj_sources_heavy ...@@ -19,6 +19,7 @@ set(kj_sources_heavy
units.c++ units.c++
refcount.c++ refcount.c++
string-tree.c++ string-tree.c++
encoding.c++
parse/char.c++ parse/char.c++
) )
if(NOT CAPNP_LITE) if(NOT CAPNP_LITE)
...@@ -36,6 +37,7 @@ set(kj_headers ...@@ -36,6 +37,7 @@ set(kj_headers
vector.h vector.h
string.h string.h
string-tree.h string-tree.h
encoding.h
exception.h exception.h
debug.h debug.h
arena.h arena.h
...@@ -170,6 +172,7 @@ if(BUILD_TESTING) ...@@ -170,6 +172,7 @@ if(BUILD_TESTING)
async-io-test.c++ async-io-test.c++
refcount-test.c++ refcount-test.c++
string-tree-test.c++ string-tree-test.c++
encoding-test.c++
arena-test.c++ arena-test.c++
units-test.c++ units-test.c++
tuple-test.c++ tuple-test.c++
......
...@@ -455,6 +455,10 @@ T refIfLvalue(T&&); ...@@ -455,6 +455,10 @@ T refIfLvalue(T&&);
// KJ_DECLTYPE_REF(i) i3(i); // i3 has type int&. // KJ_DECLTYPE_REF(i) i3(i); // i3 has type int&.
// KJ_DECLTYPE_REF(kj::mv(i)) i4(kj::mv(i)); // i4 has type int. // KJ_DECLTYPE_REF(kj::mv(i)) i4(kj::mv(i)); // i4 has type int.
// Compile-time type equality. The primary template answers "false"; the partial specialization
// for two identical type arguments answers "true". isSameType<T, U>() exposes the answer as a
// constexpr function so it can be used in static_assert and other constant expressions.
template <typename T, typename U>
struct IsSameType_ {
  static constexpr bool value = false;
};

template <typename T>
struct IsSameType_<T, T> {
  static constexpr bool value = true;
};

template <typename T, typename U>
constexpr bool isSameType() {
  return IsSameType_<T, U>::value;
}
template <typename T> template <typename T>
struct CanConvert_ { struct CanConvert_ {
static int sfinae(T); static int sfinae(T);
...@@ -911,7 +915,6 @@ public: ...@@ -911,7 +915,6 @@ public:
return value; return value;
} }
private: // internal interface used by friends only
inline NullableValue() noexcept: isSet(false) {} inline NullableValue() noexcept: isSet(false) {}
inline NullableValue(T&& t) noexcept(noexcept(T(instance<T&&>()))) inline NullableValue(T&& t) noexcept(noexcept(T(instance<T&&>())))
: isSet(true) { : isSet(true) {
...@@ -1244,8 +1247,31 @@ public: ...@@ -1244,8 +1247,31 @@ public:
: ptr(init.begin()), size_(init.size()) {} : ptr(init.begin()), size_(init.size()) {}
template <size_t size> template <size_t size>
inline constexpr ArrayPtr(T (&native)[size]): ptr(native), size_(size) {} inline constexpr ArrayPtr(T (&native)[size]): ptr(native), size_(size) {
// Construct an ArrayPtr from a native C-style array. // Construct an ArrayPtr from a native C-style array.
//
// We disable this constructor for const char arrays because otherwise you would be able to
// implicitly convert a character literal to ArrayPtr<const char>, which sounds really great,
// except that the NUL terminator would be included, which probably isn't what you intended.
//
// TODO(someday): Maybe we should support character literals but explicitly chop off the NUL
// terminator. This could do the wrong thing if someone tries to construct an
// ArrayPtr<const char> from a non-NUL-terminated char array, but evidence suggests that all
// real use cases are in fact intending to remove the NUL terminator. It's convenient to be
// able to specify ArrayPtr<const char> as a parameter type and be able to accept strings
// as input in addition to arrays. Currently, you'll need overloading to support string
// literals in this case, but if you overload StringPtr, then you'll find that several
// conversions (e.g. from String and from a literal char array) become ambiguous! You end up
// having to overload for literal char arrays specifically which is cumbersome.
static_assert(!isSameType<T, const char>(),
"Can't implicitly convert literal char array to ArrayPtr because we don't know if "
"you meant to include the NUL terminator. We may change this in the future to "
"automatically drop the NUL terminator. For now, try explicitly converting to StringPtr, "
"which can in turn implicitly convert to ArrayPtr<const char>.");
static_assert(!isSameType<T, const char16_t>(), "see above");
static_assert(!isSameType<T, const char32_t>(), "see above");
}
inline operator ArrayPtr<const T>() const { inline operator ArrayPtr<const T>() const {
return ArrayPtr<const T>(ptr, size_); return ArrayPtr<const T>(ptr, size_);
......
// Copyright (c) 2017 Cloudflare, Inc. and contributors
// Licensed under the MIT License:
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include "encoding.h"
#include <kj/test.h>
#include <stdint.h>
namespace kj {
namespace {
// Test-local hex() overloads for byte and the character types. Each converts its argument to
// the unsigned integer type of the same width and defers to kj::hex(), so that the code units
// printed by failing expectations below appear as numbers.
//
// TODO(cleanup): Should this go into string.h with the other definitions of hex()?
CappedArray<char, sizeof(char) * 2 + 1> hex(byte i) {
  return kj::hex(static_cast<uint8_t>(i));
}
CappedArray<char, sizeof(char) * 2 + 1> hex(char i) {
  return kj::hex(static_cast<uint8_t>(i));
}
CappedArray<char, sizeof(char16_t) * 2 + 1> hex(char16_t i) {
  return kj::hex(static_cast<uint16_t>(i));
}
CappedArray<char, sizeof(char32_t) * 2 + 1> hex(char32_t i) {
  return kj::hex(static_cast<uint32_t>(i));
}
template <typename T, typename U>
void expectResImpl(EncodingResult<T> result,
                   ArrayPtr<const U> expected,
                   bool errors = false) {
  // Shared checker for the tests below: verifies the error flag, then the length, then every
  // element that both sequences have in common (so a length mismatch still reports the first
  // differing elements instead of reading out of bounds).
  if (!errors) {
    KJ_EXPECT(!result.hadErrors);
  } else {
    KJ_EXPECT(result.hadErrors);
  }

  KJ_EXPECT(result.size() == expected.size(), result.size(), expected.size());

  const size_t common = kj::min(result.size(), expected.size());
  for (size_t i = 0; i < common; ++i) {
    KJ_EXPECT(result[i] == expected[i], i, hex(result[i]), hex(expected[i]));
  }
}
template <typename T, typename U, size_t s>
void expectRes(EncodingResult<T> result,
               const U (&expected)[s],
               bool errors = false) {
  // Overload for literal (const) arrays: the last element is excluded from the comparison,
  // which for a string literal is its NUL terminator.
  ArrayPtr<const U> withoutTerminator = arrayPtr(expected, s - 1);
  expectResImpl(kj::mv(result), withoutTerminator, errors);
}
template <typename T, size_t s>
void expectRes(EncodingResult<T> result,
               byte (&expected)[s],
               bool errors = false) {
  // Overload for mutable byte arrays: unlike the literal overload, all `s` elements are
  // compared -- nothing is trimmed from the end.
  ArrayPtr<const byte> wholeArray = arrayPtr<const byte>(expected, s);
  expectResImpl(kj::mv(result), wholeArray, errors);
}
// Well-formed UTF-8 input (ASCII, Cyrillic, CJK, and emoji/symbols) must encode to the
// equivalent UTF-16 literal with no errors reported.
KJ_TEST("encode UTF-8 to UTF-16") {
expectRes(encodeUtf16(u8"foo"), u"foo");
expectRes(encodeUtf16(u8"Здравствуйте"), u"Здравствуйте");
expectRes(encodeUtf16(u8"中国网络"), u"中国网络");
expectRes(encodeUtf16(u8"😺☁☄🐵"), u"😺☁☄🐵");
}
// Malformed UTF-8 must come back with hadErrors set and one U+FFFD per bad sequence (a run of
// stray continuation bytes collapses into a single replacement). The cases marked `false` are
// the nearest valid sequences on each side of a boundary and must decode without error.
KJ_TEST("invalid UTF-8 to UTF-16") {
// Disembodied continuation byte.
expectRes(encodeUtf16("\x80"), u"\ufffd", true);
expectRes(encodeUtf16("f\xbfo"), u"f\ufffdo", true);
expectRes(encodeUtf16("f\xbf\x80\xb0o"), u"f\ufffdo", true);
// Missing continuation bytes.
expectRes(encodeUtf16("\xc2x"), u"\ufffdx", true);
expectRes(encodeUtf16("\xe0x"), u"\ufffdx", true);
expectRes(encodeUtf16("\xe0\xa0x"), u"\ufffdx", true);
expectRes(encodeUtf16("\xf0x"), u"\ufffdx", true);
expectRes(encodeUtf16("\xf0\x90x"), u"\ufffdx", true);
expectRes(encodeUtf16("\xf0\x90\x80x"), u"\ufffdx", true);
// Overlong sequences.
expectRes(encodeUtf16("\xc0\x80"), u"\ufffd", true);
expectRes(encodeUtf16("\xc1\xbf"), u"\ufffd", true);
expectRes(encodeUtf16("\xc2\x80"), u"\u0080", false);
expectRes(encodeUtf16("\xdf\xbf"), u"\u07ff", false);
expectRes(encodeUtf16("\xe0\x80\x80"), u"\ufffd", true);
expectRes(encodeUtf16("\xe0\x9f\xbf"), u"\ufffd", true);
expectRes(encodeUtf16("\xe0\xa0\x80"), u"\u0800", false);
expectRes(encodeUtf16("\xef\xbf\xbe"), u"\ufffe", false);
// Due to a classic off-by-one error, GCC 4.x rather hilariously encodes '\uffff' as the
// "surrogate pair" 0xd7ff, 0xdfff: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=41698
// The size check below skips this expectation when the compiler's literal is affected.
if (kj::size(u"\uffff") == 2) {
expectRes(encodeUtf16("\xef\xbf\xbf"), u"\uffff", false);
}
expectRes(encodeUtf16("\xf0\x80\x80\x80"), u"\ufffd", true);
expectRes(encodeUtf16("\xf0\x8f\xbf\xbf"), u"\ufffd", true);
expectRes(encodeUtf16("\xf0\x90\x80\x80"), u"\U00010000", false);
expectRes(encodeUtf16("\xf4\x8f\xbf\xbf"), u"\U0010ffff", false);
// Out of Unicode range.
expectRes(encodeUtf16("\xf5\x80\x80\x80"), u"\ufffd", true);
expectRes(encodeUtf16("\xf8\xbf\x80\x80\x80"), u"\ufffd", true);
expectRes(encodeUtf16("\xfc\xbf\x80\x80\x80\x80"), u"\ufffd", true);
expectRes(encodeUtf16("\xfe\xbf\x80\x80\x80\x80\x80"), u"\ufffd", true);
expectRes(encodeUtf16("\xff\xbf\x80\x80\x80\x80\x80\x80"), u"\ufffd", true);
}
// Well-formed UTF-8 input (ASCII, Cyrillic, CJK, and emoji/symbols) must encode to the
// equivalent UTF-32 literal with no errors reported.
KJ_TEST("encode UTF-8 to UTF-32") {
expectRes(encodeUtf32(u8"foo"), U"foo");
expectRes(encodeUtf32(u8"Здравствуйте"), U"Здравствуйте");
expectRes(encodeUtf32(u8"中国网络"), U"中国网络");
expectRes(encodeUtf32(u8"😺☁☄🐵"), U"😺☁☄🐵");
}
// Same malformed-input matrix as the UTF-16 test, against encodeUtf32(): each bad sequence
// must yield one U+FFFD with hadErrors set, while the boundary cases marked `false` must decode
// cleanly.
KJ_TEST("invalid UTF-8 to UTF-32") {
// Disembodied continuation byte.
expectRes(encodeUtf32("\x80"), U"\ufffd", true);
expectRes(encodeUtf32("f\xbfo"), U"f\ufffdo", true);
expectRes(encodeUtf32("f\xbf\x80\xb0o"), U"f\ufffdo", true);
// Missing continuation bytes.
expectRes(encodeUtf32("\xc2x"), U"\ufffdx", true);
expectRes(encodeUtf32("\xe0x"), U"\ufffdx", true);
expectRes(encodeUtf32("\xe0\xa0x"), U"\ufffdx", true);
expectRes(encodeUtf32("\xf0x"), U"\ufffdx", true);
expectRes(encodeUtf32("\xf0\x90x"), U"\ufffdx", true);
expectRes(encodeUtf32("\xf0\x90\x80x"), U"\ufffdx", true);
// Overlong sequences.
expectRes(encodeUtf32("\xc0\x80"), U"\ufffd", true);
expectRes(encodeUtf32("\xc1\xbf"), U"\ufffd", true);
expectRes(encodeUtf32("\xc2\x80"), U"\u0080", false);
expectRes(encodeUtf32("\xdf\xbf"), U"\u07ff", false);
expectRes(encodeUtf32("\xe0\x80\x80"), U"\ufffd", true);
expectRes(encodeUtf32("\xe0\x9f\xbf"), U"\ufffd", true);
expectRes(encodeUtf32("\xe0\xa0\x80"), U"\u0800", false);
expectRes(encodeUtf32("\xef\xbf\xbf"), U"\uffff", false);
expectRes(encodeUtf32("\xf0\x80\x80\x80"), U"\ufffd", true);
expectRes(encodeUtf32("\xf0\x8f\xbf\xbf"), U"\ufffd", true);
expectRes(encodeUtf32("\xf0\x90\x80\x80"), U"\U00010000", false);
expectRes(encodeUtf32("\xf4\x8f\xbf\xbf"), U"\U0010ffff", false);
// Out of Unicode range.
expectRes(encodeUtf32("\xf5\x80\x80\x80"), U"\ufffd", true);
expectRes(encodeUtf32("\xf8\xbf\x80\x80\x80"), U"\ufffd", true);
expectRes(encodeUtf32("\xfc\xbf\x80\x80\x80\x80"), U"\ufffd", true);
expectRes(encodeUtf32("\xfe\xbf\x80\x80\x80\x80\x80"), U"\ufffd", true);
expectRes(encodeUtf32("\xff\xbf\x80\x80\x80\x80\x80\x80"), U"\ufffd", true);
}
// Well-formed UTF-16 input must decode to the equivalent UTF-8 literal with no errors reported.
KJ_TEST("decode UTF-16 to UTF-8") {
expectRes(decodeUtf16(u"foo"), u8"foo");
expectRes(decodeUtf16(u"Здравствуйте"), u8"Здравствуйте");
expectRes(decodeUtf16(u"中国网络"), u8"中国网络");
expectRes(decodeUtf16(u"😺☁☄🐵"), u8"😺☁☄🐵");
}
// Unpaired or misordered surrogates must each be replaced by U+FFFD with hadErrors set, while
// the neighbouring non-surrogate code units (0xd7ff, 0xe000, ASCII) pass through unchanged.
KJ_TEST("invalid UTF-16 to UTF-8") {
// Surrogates in wrong order.
expectRes(decodeUtf16(u"\xd7ff\xdc00\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true);
// Missing second surrogate.
expectRes(decodeUtf16(u"f\xd800"), u8"f\ufffd", true);
expectRes(decodeUtf16(u"f\xd800x"), u8"f\ufffdx", true);
expectRes(decodeUtf16(u"f\xd800\xd800x"), u8"f\ufffd\ufffdx", true);
}
// Well-formed UTF-32 input must decode to the equivalent UTF-8 literal with no errors reported.
KJ_TEST("decode UTF-32 to UTF-8") {
expectRes(decodeUtf32(U"foo"), u8"foo");
expectRes(decodeUtf32(U"Здравствуйте"), u8"Здравствуйте");
expectRes(decodeUtf32(U"中国网络"), u8"中国网络");
expectRes(decodeUtf32(U"😺☁☄🐵"), u8"😺☁☄🐵");
}
// Surrogate code points are never valid UTF-32 values: each one must be replaced by U+FFFD
// individually, with hadErrors set, regardless of what follows it.
KJ_TEST("invalid UTF-32 to UTF-8") {
// Surrogates rejected.
expectRes(decodeUtf32(U"\xd7ff\xdc00\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true);
// Even if it would be a valid surrogate pair in UTF-16.
expectRes(decodeUtf32(U"\xd7ff\xd800\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true);
}
// An EncodingResult can be consumed wherever a Maybe is accepted: the KJ_IF_MAYBE body is
// expected to run only when no errors occurred, and KJ_ASSERT_NONNULL is expected to yield the
// decoded value for valid input.
KJ_TEST("EncodingResult as a Maybe") {
KJ_IF_MAYBE(result, encodeUtf16("\x80")) {
KJ_FAIL_EXPECT("expected failure");
}
KJ_IF_MAYBE(result, encodeUtf16("foo")) {
// good
} else {
KJ_FAIL_EXPECT("expected success");
}
KJ_EXPECT(KJ_ASSERT_NONNULL(decodeUtf16(u"foo")) == "foo");
}
// =======================================================================================
// Round-trips a byte array through encodeHex()/decodeHex(), then checks the error cases:
// a trailing odd digit is dropped, and an invalid digit contributes zero bits to its byte
// (hence `bytes[2]` is patched to the expected partial value before each check).
KJ_TEST("hex encoding/decoding") {
byte bytes[] = {0x12, 0x34, 0xab, 0xf2};
KJ_EXPECT(encodeHex(bytes) == "1234abf2");
expectRes(decodeHex("1234abf2"), bytes);
expectRes(decodeHex("1234abf21"), bytes, true);
bytes[2] = 0xa0;
expectRes(decodeHex("1234axf2"), bytes, true);
bytes[2] = 0x0b;
expectRes(decodeHex("1234xbf2"), bytes, true);
}
// Checks percent-encoding of spaces, high bytes and embedded NULs, decoding of both lowercase
// and uppercase hex digits, the expected output for truncated or invalid % escapes (all of
// which must set hadErrors), and finally a binary round trip.
KJ_TEST("URI encoding/decoding") {
KJ_EXPECT(encodeUriComponent("foo") == "foo");
KJ_EXPECT(encodeUriComponent("foo bar") == "foo%20bar");
KJ_EXPECT(encodeUriComponent("\xab\xba") == "%ab%ba");
KJ_EXPECT(encodeUriComponent(StringPtr("foo\0bar", 7)) == "foo%00bar");
expectRes(decodeUriComponent("foo%20bar"), "foo bar");
expectRes(decodeUriComponent("%ab%BA"), "\xab\xba");
expectRes(decodeUriComponent("foo%1xxx"), "foo\1xxx", true);
expectRes(decodeUriComponent("foo%1"), "foo\1", true);
expectRes(decodeUriComponent("foo%xxx"), "fooxxx", true);
expectRes(decodeUriComponent("foo%"), "foo", true);
byte bytes[] = {12, 34, 56};
KJ_EXPECT(decodeBinaryUriComponent(encodeUriComponent(bytes)).asPtr() == bytes);
}
// Checks that the encoder emits named escapes where they exist and three-digit octal escapes
// for other control characters, that the decoder accepts named, hex, octal, \u and \U escapes,
// and that malformed escapes produce the expected best-effort output with hadErrors set.
KJ_TEST("C escape encoding/decoding") {
KJ_EXPECT(encodeCEscape("fooo\a\b\f\n\r\t\v\'\"\\bar") ==
"fooo\\a\\b\\f\\n\\r\\t\\v\\\'\\\"\\\\bar");
KJ_EXPECT(encodeCEscape("foo\x01\x7fxxx") ==
"foo\\001\\177xxx");
expectRes(decodeCEscape("fooo\\a\\b\\f\\n\\r\\t\\v\\\'\\\"\\\\bar"),
"fooo\a\b\f\n\r\t\v\'\"\\bar");
expectRes(decodeCEscape("foo\\x01\\x7fxxx"), "foo\x01\x7fxxx");
expectRes(decodeCEscape("foo\\001\\177234"), "foo\001\177234");
expectRes(decodeCEscape("foo\\x1"), "foo\x1");
expectRes(decodeCEscape("foo\\1"), "foo\1");
expectRes(decodeCEscape("foo\\u1234bar"), u8"foo\u1234bar");
expectRes(decodeCEscape("foo\\U00045678bar"), u8"foo\U00045678bar");
// Error cases.
expectRes(decodeCEscape("foo\\"), "foo", true);
expectRes(decodeCEscape("foo\\x123x"), u8"foo\x23x", true);
expectRes(decodeCEscape("foo\\u12"), u8"foo\u0012", true);
expectRes(decodeCEscape("foo\\u12xxx"), u8"foo\u0012xxx", true);
expectRes(decodeCEscape("foo\\U12"), u8"foo\u0012", true);
expectRes(decodeCEscape("foo\\U12xxxxxxxx"), u8"foo\u0012xxxxxxxx", true);
}
// Exercises encodeBase64()/decodeBase64(): padded and unpadded round trips, decoding that
// tolerates missing padding and interspersed non-alphabet characters, and the effect of the
// second encodeBase64() argument, which in these expectations adds a trailing newline and
// splits longer output across lines.
KJ_TEST("base64 encoding/decoding") {
{
auto encoded = encodeBase64(StringPtr("foo").asBytes(), false);
KJ_EXPECT(encoded == "Zm9v", encoded, encoded.size());
KJ_EXPECT(heapString(decodeBase64(encoded.asArray()).asChars()) == "foo");
}
{
auto encoded = encodeBase64(StringPtr("corge").asBytes(), false);
KJ_EXPECT(encoded == "Y29yZ2U=", encoded);
KJ_EXPECT(heapString(decodeBase64(encoded.asArray()).asChars()) == "corge");
}
// Decoding must cope with absent padding and with whitespace/garbage between digits.
KJ_EXPECT(heapString(decodeBase64("Y29yZ2U").asChars()) == "corge");
KJ_EXPECT(heapString(decodeBase64("Y\n29y Z@2U=\n").asChars()) == "corge");
{
auto encoded = encodeBase64(StringPtr("corge").asBytes(), true);
KJ_EXPECT(encoded == "Y29yZ2U=\n", encoded);
}
// 54 input bytes: per the expectations below this encodes to exactly one full output line.
StringPtr fullLine = "012345678901234567890123456789012345678901234567890123";
{
auto encoded = encodeBase64(fullLine.asBytes(), false);
KJ_EXPECT(
encoded == "MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIz",
encoded);
}
{
auto encoded = encodeBase64(fullLine.asBytes(), true);
KJ_EXPECT(
encoded == "MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIz\n",
encoded);
}
// Three more bytes than a full line: expected to spill onto a second line when breaking is on.
String multiLine = str(fullLine, "456");
{
auto encoded = encodeBase64(multiLine.asBytes(), false);
KJ_EXPECT(
encoded == "MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2",
encoded);
}
{
auto encoded = encodeBase64(multiLine.asBytes(), true);
KJ_EXPECT(
encoded == "MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIz\n"
"NDU2\n",
encoded);
}
}
} // namespace
} // namespace kj
// Copyright (c) 2017 Cloudflare, Inc.; Sandstorm Development Group, Inc.; and contributors
// Licensed under the MIT License:
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include "encoding.h"
#include "vector.h"
#include "debug.h"
namespace kj {
namespace {
// Jumps to the `error:` label of the enclosing function when `cond` holds; the condition is
// hinted as unlikely. The do/while(false) wrapper makes the expansion a single statement, so an
// `else` written after the macro can never silently attach to the macro's own `if`.
#define GOTO_ERROR_IF(cond) do { if (KJ_UNLIKELY(cond)) goto error; } while (false)
inline void addChar32(Vector<char16_t>& vec, char32_t u) {
  // UTF-16 output: emit the value as a surrogate pair. After subtracting 0x10000, the upper
  // ten bits go into the first unit (0xd800 base) and the lower ten into the second (0xdc00).
  const char32_t offset = u - 0x10000;
  vec.add(0xd800 | (offset >> 10));
  vec.add(0xdc00 | (offset & 0x03ff));
}

inline void addChar32(Vector<char32_t>& vec, char32_t u) {
  // UTF-32 output: the value is stored as-is in a single unit.
  vec.add(u);
}
// Shared UTF-8 decoder behind encodeUtf16() and encodeUtf32(); T is the output code unit type.
//
// Walks `text` one lead byte at a time. Every malformed sequence -- stray continuation byte,
// missing continuation byte, overlong form, encoded surrogate, value above U+10FFFF, or a
// 5/6-byte lead -- jumps to the shared `error:` label, which appends a single U+FFFD, sets
// the error flag, and skips any continuation bytes that follow. When `nulTerminate` is true a
// trailing zero unit is appended to the output.
template <typename T>
EncodingResult<Array<T>> encodeUtf(ArrayPtr<const char> text, bool nulTerminate) {
Vector<T> result(text.size() + nulTerminate);
bool hadErrors = false;
size_t i = 0;
while (i < text.size()) {
byte c = text[i++];
if (c < 0x80) {
// 0xxxxxxx -- ASCII
result.add(c);
continue;
} else if (KJ_UNLIKELY(c < 0xc0)) {
// 10xxxxxx -- malformed continuation byte
goto error;
} else if (c < 0xe0) {
// 110xxxxx -- 2-byte
byte c2;
// Each continuation byte is consumed (++i) only after it has been validated, so a bad byte
// is left in place to be re-examined as the start of the next sequence.
GOTO_ERROR_IF(i == text.size() || ((c2 = text[i]) & 0xc0) != 0x80); ++i;
char16_t u = (static_cast<char16_t>(c & 0x1f) << 6)
| (static_cast<char16_t>(c2 & 0x3f) );
// Disallow overlong sequence.
GOTO_ERROR_IF(u < 0x80);
result.add(u);
continue;
} else if (c < 0xf0) {
// 1110xxxx -- 3-byte
byte c2, c3;
GOTO_ERROR_IF(i == text.size() || ((c2 = text[i]) & 0xc0) != 0x80); ++i;
GOTO_ERROR_IF(i == text.size() || ((c3 = text[i]) & 0xc0) != 0x80); ++i;
char16_t u = (static_cast<char16_t>(c & 0x0f) << 12)
| (static_cast<char16_t>(c2 & 0x3f) << 6)
| (static_cast<char16_t>(c3 & 0x3f) );
// Disallow overlong sequence.
GOTO_ERROR_IF(u < 0x0800);
// Disallow surrogate pair code points.
GOTO_ERROR_IF((u & 0xf800) == 0xd800);
result.add(u);
continue;
} else if (c < 0xf8) {
// 11110xxx -- 4-byte
byte c2, c3, c4;
GOTO_ERROR_IF(i == text.size() || ((c2 = text[i]) & 0xc0) != 0x80); ++i;
GOTO_ERROR_IF(i == text.size() || ((c3 = text[i]) & 0xc0) != 0x80); ++i;
GOTO_ERROR_IF(i == text.size() || ((c4 = text[i]) & 0xc0) != 0x80); ++i;
char32_t u = (static_cast<char32_t>(c & 0x07) << 18)
| (static_cast<char32_t>(c2 & 0x3f) << 12)
| (static_cast<char32_t>(c3 & 0x3f) << 6)
| (static_cast<char32_t>(c4 & 0x3f) );
// Disallow overlong sequence.
GOTO_ERROR_IF(u < 0x10000);
// Unicode ends at U+10FFFF
GOTO_ERROR_IF(u >= 0x110000);
// addChar32() is overloaded on the vector type: a surrogate pair for char16_t output, a
// single unit for char32_t output.
addChar32(result, u);
continue;
} else {
// 5-byte and 6-byte sequences are not legal as they'd result in codepoints outside the
// range of Unicode.
goto error;
}
// Only reached via goto: every successful branch above ends in `continue`.
error:
result.add(0xfffd);
hadErrors = true;
// Ignore all continuation bytes.
while (i < text.size() && (text[i] & 0xc0) == 0x80) {
++i;
}
}
if (nulTerminate) result.add(0);
return { result.releaseAsArray(), hadErrors };
}
} // namespace
EncodingResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate) {
// Public entry point: runs the shared encodeUtf() template with 16-bit output code units.
return encodeUtf<char16_t>(text, nulTerminate);
}
EncodingResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate) {
// Public entry point: runs the shared encodeUtf() template with 32-bit output code units.
return encodeUtf<char32_t>(text, nulTerminate);
}
EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) {
  // Converts UTF-16 code units into a NUL-terminated UTF-8 String. A surrogate that is not
  // part of a correctly ordered high/low pair is replaced by U+FFFD and recorded in the
  // result's error flag; the unit after a bad high surrogate is not consumed, so it gets
  // decoded on its own in the next iteration.
  Vector<char> utf8(utf16.size() + 1);
  bool sawError = false;
  const size_t count = utf16.size();

  for (size_t pos = 0; pos < count;) {
    char16_t unit = utf16[pos++];

    if (unit < 0x80) {
      // One byte: plain ASCII.
      utf8.add(unit);
    } else if (unit < 0x0800) {
      // Two bytes.
      utf8.addAll<std::initializer_list<char>>({
        static_cast<char>(((unit >> 6)       ) | 0xc0),
        static_cast<char>(((unit     ) & 0x3f) | 0x80)
      });
    } else if ((unit & 0xf800) != 0xd800) {
      // Three bytes: any remaining non-surrogate unit.
      utf8.addAll<std::initializer_list<char>>({
        static_cast<char>(((unit >> 12)       ) | 0xe0),
        static_cast<char>(((unit >>  6) & 0x3f) | 0x80),
        static_cast<char>(((unit      ) & 0x3f) | 0x80)
      });
    } else if (pos < count                             // a second half exists
            && (unit & 0x0400) == 0                    // first half is a high surrogate
            && (utf16[pos] & 0xfc00) == 0xdc00) {      // second half is a low surrogate
      // Four bytes: combine the surrogate pair into one code point.
      char16_t low = utf16[pos++];
      char32_t codePoint = (((unit & 0x03ff) << 10) | (low & 0x03ff)) + 0x10000;
      utf8.addAll<std::initializer_list<char>>({
        static_cast<char>(((codePoint >> 18)       ) | 0xf0),
        static_cast<char>(((codePoint >> 12) & 0x3f) | 0x80),
        static_cast<char>(((codePoint >>  6) & 0x3f) | 0x80),
        static_cast<char>(((codePoint      ) & 0x3f) | 0x80)
      });
    } else {
      // Lone or misordered surrogate.
      utf8.addAll(StringPtr(u8"\ufffd"));
      sawError = true;
    }
  }

  utf8.add(0);
  return { String(utf8.releaseAsArray()), sawError };
}
EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32) {
  // Converts UTF-32 code points into a NUL-terminated UTF-8 String. Surrogate values and
  // values at or above 0x110000 are each replaced by U+FFFD and recorded in the result's error
  // flag.
  //
  // (The parameter was previously named `utf16`, a leftover from decodeUtf16(); it holds
  // UTF-32 code points.)
  Vector<char> result(utf32.size() + 1);
  bool hadErrors = false;

  size_t i = 0;
  while (i < utf32.size()) {
    char32_t u = utf32[i++];

    if (u < 0x80) {
      // One byte: plain ASCII.
      result.add(u);
      continue;
    } else if (u < 0x0800) {
      // Two bytes.
      result.addAll<std::initializer_list<char>>({
        static_cast<char>(((u >> 6)       ) | 0xc0),
        static_cast<char>(((u     ) & 0x3f) | 0x80)
      });
      continue;
    } else if (u < 0x10000) {
      // Three bytes.
      GOTO_ERROR_IF((u & 0xfffff800) == 0xd800);  // no surrogates allowed in utf-32
      result.addAll<std::initializer_list<char>>({
        static_cast<char>(((u >> 12)       ) | 0xe0),
        static_cast<char>(((u >>  6) & 0x3f) | 0x80),
        static_cast<char>(((u      ) & 0x3f) | 0x80)
      });
      continue;
    } else {
      // Four bytes.
      GOTO_ERROR_IF(u >= 0x110000);  // outside Unicode range
      result.addAll<std::initializer_list<char>>({
        static_cast<char>(((u >> 18)       ) | 0xf0),
        static_cast<char>(((u >> 12) & 0x3f) | 0x80),
        static_cast<char>(((u >>  6) & 0x3f) | 0x80),
        static_cast<char>(((u      ) & 0x3f) | 0x80)
      });
      continue;
    }

    // Only reached via goto: every successful branch above ends in `continue`.
  error:
    result.addAll(StringPtr(u8"\ufffd"));
    hadErrors = true;
  }

  result.add(0);
  return { String(result.releaseAsArray()), hadErrors };
}
// =======================================================================================
namespace {
// Digit lookup table indexed by value. The encoders in this file use it for hex output
// (indices 0-15) and, in encodeCEscape(), for octal output (indices 0-7 only).
const char HEX_DIGITS[] = "0123456789abcdef";
static Maybe<uint> tryFromHexDigit(char c) {
  // Maps one hex digit, in either letter case, to its value 0-15; any other character
  // yields nullptr.
  if (c >= '0' && c <= '9') return c - '0';
  if (c >= 'a' && c <= 'f') return c - 'a' + 10;
  if (c >= 'A' && c <= 'F') return c - 'A' + 10;
  return nullptr;
}
static Maybe<uint> tryFromOctDigit(char c) {
  // Maps one octal digit to its value 0-7; any other character yields nullptr.
  if (c < '0' || c > '7') return nullptr;
  return c - '0';
}
} // namespace
String encodeHex(ArrayPtr<const byte> input) {
  // Encodes each byte as two lowercase hex digits, high nibble first.
  //
  // The output is built in one buffer sized up front (two digits per byte plus the NUL
  // terminator), the same way encodeUriComponent() builds its result, rather than allocating
  // a separate two-char heap array for every input byte and joining them afterwards.
  Vector<char> result(input.size() * 2 + 1);
  for (byte b: input) {
    result.add(HEX_DIGITS[b / 16]);
    result.add(HEX_DIGITS[b % 16]);
  }
  result.add('\0');
  return String(result.releaseAsArray());
}
EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text) {
  // Decodes pairs of hex digits into bytes. An odd-length input is an error and its final
  // unpaired digit is ignored. An invalid digit is also an error and contributes zero bits to
  // its byte, so the other nibble of that byte is still decoded.
  bool sawError = (text.size() % 2) != 0;
  auto decoded = heapArray<byte>(text.size() / 2);

  for (size_t pos = 0; pos < decoded.size(); ++pos) {
    byte value = 0;

    KJ_IF_MAYBE(high, tryFromHexDigit(text[pos * 2])) {
      value = *high << 4;
    } else {
      sawError = true;
    }

    KJ_IF_MAYBE(low, tryFromHexDigit(text[pos * 2 + 1])) {
      value |= *low;
    } else {
      sawError = true;
    }

    decoded[pos] = value;
  }

  return { kj::mv(decoded), sawError };
}
String encodeUriComponent(ArrayPtr<const byte> bytes) {
  // Percent-encodes every byte except ASCII letters, digits and the punctuation characters
  // - _ . ! ~ * ' ( ) which are copied through unchanged. Escapes use lowercase hex digits.
  auto passesThrough = [](byte b) {
    if ('A' <= b && b <= 'Z') return true;
    if ('a' <= b && b <= 'z') return true;
    if ('0' <= b && b <= '9') return true;
    switch (b) {
      case '-': case '_': case '.': case '!': case '~':
      case '*': case '\'': case '(': case ')':
        return true;
      default:
        return false;
    }
  };

  Vector<char> encoded(bytes.size() + 1);
  for (byte b: bytes) {
    if (!passesThrough(b)) {
      encoded.add('%');
      encoded.add(HEX_DIGITS[b/16]);
      encoded.add(HEX_DIGITS[b%16]);
      continue;
    }
    encoded.add(b);
  }
  encoded.add('\0');
  return String(encoded.releaseAsArray());
}
EncodingResult<Array<byte>> decodeBinaryUriComponent(
    ArrayPtr<const char> text, bool nulTerminate) {
  // Reverses percent-encoding. A '%' must be followed by two hex digits; when it is not, the
  // error flag is set and decoding continues best-effort: a '%' with one valid digit yields
  // that digit's value as a byte, a '%' with no valid digit yields nothing, and the offending
  // character is then processed as ordinary input. When `nulTerminate` is true a zero byte is
  // appended to the output.
  Vector<byte> decoded(text.size() + nulTerminate);
  bool sawError = false;

  const size_t length = text.size();
  size_t pos = 0;
  while (pos < length) {
    char c = text[pos++];
    if (c != '%') {
      decoded.add(c);
      continue;
    }

    if (pos == length) {
      // Input ends right after the '%'.
      sawError = true;
      continue;
    }

    KJ_IF_MAYBE(high, tryFromHexDigit(text[pos])) {
      byte b = *high;
      ++pos;
      if (pos == length) {
        // Input ends after a single digit.
        sawError = true;
      } else KJ_IF_MAYBE(low, tryFromHexDigit(text[pos])) {
        b = (b << 4) | *low;
        ++pos;
      } else {
        sawError = true;
      }
      decoded.add(b);
    } else {
      sawError = true;
    }
  }

  if (nulTerminate) decoded.add(0);
  return { decoded.releaseAsArray(), sawError };
}
// =======================================================================================
String encodeCEscape(ArrayPtr<const byte> bytes) {
  // Escapes bytes for use inside a C string literal. Bytes with a named escape (including both
  // quote characters and the backslash) use it; other bytes below 0x20, and 0x7f, become
  // three-digit octal escapes; every remaining byte is copied through unchanged.
  Vector<char> escaped(bytes.size());

  // Appends a backslash followed by the given escape character.
  auto addNamedEscape = [&escaped](char letter) {
    escaped.add('\\');
    escaped.add(letter);
  };

  for (byte b: bytes) {
    switch (b) {
      case '\a': addNamedEscape('a'); break;
      case '\b': addNamedEscape('b'); break;
      case '\f': addNamedEscape('f'); break;
      case '\n': addNamedEscape('n'); break;
      case '\r': addNamedEscape('r'); break;
      case '\t': addNamedEscape('t'); break;
      case '\v': addNamedEscape('v'); break;
      case '\'': addNamedEscape('\''); break;
      case '\"': addNamedEscape('\"'); break;
      case '\\': addNamedEscape('\\'); break;
      default: {
        const bool needsOctal = b < 0x20 || b == 0x7f;
        if (!needsOctal) {
          escaped.add(b);
          break;
        }
        // Use octal escape, not hex, because hex escapes technically have no length limit and
        // so can create ambiguity with subsequent characters.
        escaped.add('\\');
        escaped.add(HEX_DIGITS[b >> 6]);
        escaped.add(HEX_DIGITS[(b >> 3) & 7]);
        escaped.add(HEX_DIGITS[b & 7]);
        break;
      }
    }
  }

  escaped.add(0);
  return String(escaped.releaseAsArray());
}
EncodingResult<Array<byte>> decodeBinaryCEscape(ArrayPtr<const char> text, bool nulTerminate) {
  // Decodes C-style backslash escapes into raw bytes.
  //
  // Supported: the named escapes (\a \b \f \n \r \t \v \' \" \\), octal escapes of one to three
  // digits, hex escapes (\x followed by any number of hex digits), and \uXXXX / \UXXXXXXXX,
  // which are emitted as UTF-8. A backslash followed by any other character yields that
  // character. Malformed input never aborts decoding: the error flag is set and a best-effort
  // byte sequence is still produced. When `nulTerminate` is true a zero byte is appended.
  Vector<byte> result(text.size() + nulTerminate);
  bool hadErrors = false;

  size_t i = 0;
  while (i < text.size()) {
    char c = text[i++];
    if (c == '\\') {
      if (i == text.size()) {
        // Trailing backslash with nothing to escape.
        hadErrors = true;
        continue;
      }
      char c2 = text[i++];
      switch (c2) {
        case 'a' : result.add('\a'); break;
        case 'b' : result.add('\b'); break;
        case 'f' : result.add('\f'); break;
        case 'n' : result.add('\n'); break;
        case 'r' : result.add('\r'); break;
        case 't' : result.add('\t'); break;
        case 'v' : result.add('\v'); break;
        case '\'': result.add('\''); break;
        case '\"': result.add('\"'); break;
        case '\\': result.add('\\'); break;

        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7': {
          // Octal: the digit already consumed plus at most two more.
          uint value = c2 - '0';
          for (uint j = 0; j < 2 && i < text.size(); j++) {
            KJ_IF_MAYBE(d, tryFromOctDigit(text[i])) {
              ++i;
              value = (value << 3) | *d;
            } else {
              break;
            }
          }
          // Three octal digits can encode up to 0777, which does not fit in a byte.
          if (value >= 0x100) hadErrors = true;
          result.add(value);
          break;
        }

        case 'x': {
          // Hex: consumes every hex digit that follows, with no length limit. Only the low
          // eight bits are kept, and the error is flagged as soon as the value exceeds a byte.
          // Checking only after the loop would miss long digit runs, because the accumulator
          // can wrap around and end up below 0x100 again (e.g. "\x100000000").
          uint value = 0;
          while (i < text.size()) {
            KJ_IF_MAYBE(d, tryFromHexDigit(text[i])) {
              ++i;
              value = (value << 4) | *d;
              if (value >= 0x100) {
                hadErrors = true;
                value &= 0xff;
              }
            } else {
              break;
            }
          }
          result.add(value);
          break;
        }

        case 'u': {
          // Exactly four hex digits expected. If fewer are present, the error is flagged and
          // whatever value was accumulated so far is still emitted.
          char16_t value = 0;
          for (uint j = 0; j < 4; j++) {
            if (i == text.size()) {
              hadErrors = true;
              break;
            } else KJ_IF_MAYBE(d, tryFromHexDigit(text[i])) {
              ++i;
              value = (value << 4) | *d;
            } else {
              hadErrors = true;
              break;
            }
          }
          // Re-encode the single UTF-16 unit as UTF-8; a lone surrogate is reported as an error.
          auto utf = decodeUtf16(arrayPtr(&value, 1));
          if (utf.hadErrors) hadErrors = true;
          result.addAll(utf.asBytes());
          break;
        }

        case 'U': {
          // Exactly eight hex digits expected; short input is handled as for \u.
          char32_t value = 0;
          for (uint j = 0; j < 8; j++) {
            if (i == text.size()) {
              hadErrors = true;
              break;
            } else KJ_IF_MAYBE(d, tryFromHexDigit(text[i])) {
              ++i;
              value = (value << 4) | *d;
            } else {
              hadErrors = true;
              break;
            }
          }
          // Re-encode the code point as UTF-8; invalid code points are reported as an error.
          auto utf = decodeUtf32(arrayPtr(&value, 1));
          if (utf.hadErrors) hadErrors = true;
          result.addAll(utf.asBytes());
          break;
        }

        default:
          // Unrecognized escape: pass the escaped character through without flagging an error.
          result.add(c2);
      }
    } else {
      result.add(c);
    }
  }

  if (nulTerminate) result.add(0);
  return { result.releaseAsArray(), hadErrors };
}
// =======================================================================================
// This code is derived from libb64 which has been placed in the public domain.
// For details, see http://sourceforge.net/projects/libb64
// -------------------------------------------------------------------
// Encoder
namespace {

// Position within the current 3-byte input group at which the encoder stopped.
typedef enum {
  step_A, step_B, step_C
} base64_encodestep;

// Streaming encoder state, carried between calls to base64_encode_block().
typedef struct {
  base64_encodestep step;  // where to resume within the 3-byte group
  char result;             // 6-bit value partially assembled from the bytes consumed so far
  int stepcount;           // complete 4-character groups written on the current output line
} base64_encodestate;

// Maximum number of base64 characters per line when line breaking is enabled.
const int CHARS_PER_LINE = 72;

// Resets `state_in` to the start of a fresh encoding.
void base64_init_encodestate(base64_encodestate* state_in) {
  state_in->step = step_A;
  state_in->result = 0;
  state_in->stepcount = 0;
}

// Maps a 6-bit value (0..63) to its base64 character. Any other value yields '='.
char base64_encode_value(char value_in) {
  static const char* encoding = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  // Compare as unsigned so that a negative `char` can never index before the table.
  const unsigned char index = static_cast<unsigned char>(value_in);
  if (index > 63) return '=';
  return encoding[index];
}

// Encodes `length_in` bytes from `plaintext_in`, writing base64 characters to `code_out` and
// returning how many characters were written. No bounds checking is done on `code_out`.
//
// The function is resumable: leftover bits and the position within the 3-byte group are saved
// in `state_in`, so input may be supplied over several calls. When `breakLines` is true a '\n'
// is written after every CHARS_PER_LINE characters.
//
// The `switch` jumps into the middle of the `while` loop to resume at the saved step; each
// case deliberately falls through to the next one.
int base64_encode_block(const char* plaintext_in, int length_in,
                        char* code_out, base64_encodestate* state_in, bool breakLines) {
  const char* plainchar = plaintext_in;
  const char* const plaintextend = plaintext_in + length_in;
  char* codechar = code_out;
  char result;
  char fragment;

  result = state_in->result;

  switch (state_in->step) {
    while (1) {
      case step_A:
        if (plainchar == plaintextend) {
          state_in->result = result;
          state_in->step = step_A;
          return static_cast<int>(codechar - code_out);
        }
        fragment = *plainchar++;
        result = (fragment & 0x0fc) >> 2;
        *codechar++ = base64_encode_value(result);
        result = (fragment & 0x003) << 4;
        // fall through
      case step_B:
        if (plainchar == plaintextend) {
          state_in->result = result;
          state_in->step = step_B;
          return static_cast<int>(codechar - code_out);
        }
        fragment = *plainchar++;
        result |= (fragment & 0x0f0) >> 4;
        *codechar++ = base64_encode_value(result);
        result = (fragment & 0x00f) << 2;
        // fall through
      case step_C:
        if (plainchar == plaintextend) {
          state_in->result = result;
          state_in->step = step_C;
          return static_cast<int>(codechar - code_out);
        }
        fragment = *plainchar++;
        result |= (fragment & 0x0c0) >> 6;
        *codechar++ = base64_encode_value(result);
        result  = (fragment & 0x03f) >> 0;
        *codechar++ = base64_encode_value(result);

        // One full 4-character group has been written.
        ++(state_in->stepcount);
        if (breakLines && state_in->stepcount == CHARS_PER_LINE/4) {
          *codechar++ = '\n';
          state_in->stepcount = 0;
        }
    }
  }
  /* control should not reach here */
  return static_cast<int>(codechar - code_out);
}

// Finishes an encoding: flushes any leftover bits as a final character followed by '='
// padding and, when `breakLines` is true and the current line is non-empty, a terminating
// '\n'. Returns the number of characters written to `code_out`.
int base64_encode_blockend(char* code_out, base64_encodestate* state_in, bool breakLines) {
  char* codechar = code_out;

  switch (state_in->step) {
    case step_B:
      // One input byte pending: one more character plus two padding characters.
      *codechar++ = base64_encode_value(state_in->result);
      *codechar++ = '=';
      *codechar++ = '=';
      ++state_in->stepcount;
      break;
    case step_C:
      // Two input bytes pending: one more character plus one padding character.
      *codechar++ = base64_encode_value(state_in->result);
      *codechar++ = '=';
      ++state_in->stepcount;
      break;
    case step_A:
      break;
  }
  if (breakLines && state_in->stepcount > 0) {
    *codechar++ = '\n';
  }

  return static_cast<int>(codechar - code_out);
}

}  // namespace
// Encodes `input` as base64 text. When `breakLines` is true, a newline is emitted after every
// CHARS_PER_LINE characters and after a trailing partial line.
String encodeBase64(ArrayPtr<const byte> input, bool breakLines) {
  /* set up a destination buffer large enough to hold the encoded data */
  // equivalent to ceil(input.size() / 3) * 4
  size_t numChars = (input.size() + 2) / 3 * 4;
  if (breakLines) {
    // Add space for newline characters.
    size_t lineCount = numChars / CHARS_PER_LINE;
    if (numChars % CHARS_PER_LINE > 0) {
      // Partial line.
      ++lineCount;
    }
    numChars = numChars + lineCount;
  }
  auto output = heapString(numChars);

  /* keep track of our encoded position */
  char* out = output.begin();
  size_t total = 0;

  /* we need an encoder state */
  base64_encodestate state;
  base64_init_encodestate(&state);

  // base64_encode_block() takes its length, and returns its count, as `int`. Feed it the input
  // in bounded chunks so that neither value can overflow for very large inputs; the encoder
  // state carries leftover bits and the line position from one chunk to the next.
  const size_t MAX_CHUNK = 1 << 30;
  const char* in = reinterpret_cast<const char*>(input.begin());
  size_t remaining = input.size();
  while (remaining > 0) {
    size_t chunk = remaining < MAX_CHUNK ? remaining : MAX_CHUNK;
    int written = base64_encode_block(in, static_cast<int>(chunk), out, &state, breakLines);
    in += chunk;
    remaining -= chunk;
    out += written;
    total += written;
  }

  /* since we have encoded the entire input string, we know that
     there is no more input data; finalise the encoding */
  int tail = base64_encode_blockend(out, &state, breakLines);
  out += tail;
  total += tail;

  // The size computed up front must match what the encoder actually produced.
  KJ_ASSERT(total == output.size(), total, output.size());

  return output;
}
// -------------------------------------------------------------------
// Decoder
namespace {

// Position within the current 4-character input group at which the decoder stopped.
typedef enum {
  step_a, step_b, step_c, step_d
} base64_decodestep;

// Streaming decoder state, carried between calls to base64_decode_block().
typedef struct {
  base64_decodestep step;  // where to resume within the 4-character group
  char plainchar;          // bits of the output byte that is currently being assembled
} base64_decodestate;

// Maps a base64 character to its 6-bit value. Returns -2 for the padding character '=' and -1
// for any other character that is not part of the base64 alphabet.
int base64_decode_value(char value_in) {
  // Covers the character codes 43 ('+') through 122 ('z'). Explicitly `signed char` so the
  // negative markers survive on platforms where plain `char` is unsigned.
  static const signed char decoding[] = {
    62,-1,-1,-1,63,52,53,54,55,56,57,58,59,60,61,-1,-1,-1,-2,-1,-1,-1,
    0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,-1,
    26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51};
  static const int decoding_size = sizeof(decoding);

  // Compute the index in `int` so it cannot wrap, and reject index == decoding_size as well:
  // the last valid index is decoding_size - 1.
  const int index = static_cast<unsigned char>(value_in) - 43;
  if (index < 0 || index >= decoding_size) return -1;
  return decoding[index];
}

// Resets `state_in` to the start of a fresh decoding.
void base64_init_decodestate(base64_decodestate* state_in) {
  state_in->step = step_a;
  state_in->plainchar = 0;
}

// Decodes `length_in` base64 characters from `code_in`, writing the decoded bytes to
// `plaintext_out` and returning how many bytes were written. Characters outside the base64
// alphabet, including '=' padding, are skipped.
//
// The function is resumable: the partially assembled byte and the position within the
// 4-character group are saved in `state_in`. The partial byte is held in a local rather than
// in the output buffer, so only positions that receive a complete byte are ever touched.
//
// The `switch` jumps into the middle of the `while` loop to resume at the saved step; each
// case deliberately falls through to the next one.
int base64_decode_block(const char* code_in, const int length_in,
                        char* plaintext_out, base64_decodestate* state_in) {
  const char* codechar = code_in;
  const char* const codeend = code_in + length_in;
  char* plainchar = plaintext_out;
  unsigned char partial = static_cast<unsigned char>(state_in->plainchar);
  int fragment;

  switch (state_in->step) {
    while (1) {
      case step_a:
        do {
          if (codechar == codeend) {
            state_in->step = step_a;
            state_in->plainchar = static_cast<char>(partial);
            return static_cast<int>(plainchar - plaintext_out);
          }
          fragment = base64_decode_value(*codechar++);
        } while (fragment < 0);
        partial = static_cast<unsigned char>((fragment & 0x03f) << 2);
        // fall through
      case step_b:
        do {
          if (codechar == codeend) {
            state_in->step = step_b;
            state_in->plainchar = static_cast<char>(partial);
            return static_cast<int>(plainchar - plaintext_out);
          }
          fragment = base64_decode_value(*codechar++);
        } while (fragment < 0);
        *plainchar++ = static_cast<char>(partial | ((fragment & 0x030) >> 4));
        partial = static_cast<unsigned char>((fragment & 0x00f) << 4);
        // fall through
      case step_c:
        do {
          if (codechar == codeend) {
            state_in->step = step_c;
            state_in->plainchar = static_cast<char>(partial);
            return static_cast<int>(plainchar - plaintext_out);
          }
          fragment = base64_decode_value(*codechar++);
        } while (fragment < 0);
        *plainchar++ = static_cast<char>(partial | ((fragment & 0x03c) >> 2));
        partial = static_cast<unsigned char>((fragment & 0x003) << 6);
        // fall through
      case step_d:
        do {
          if (codechar == codeend) {
            state_in->step = step_d;
            state_in->plainchar = static_cast<char>(partial);
            return static_cast<int>(plainchar - plaintext_out);
          }
          fragment = base64_decode_value(*codechar++);
        } while (fragment < 0);
        *plainchar++ = static_cast<char>(partial | (fragment & 0x03f));
        // Group complete; nothing is pending until step_a reads the next character.
        partial = 0;
    }
  }
  /* control should not reach here */
  return static_cast<int>(plainchar - plaintext_out);
}

}  // namespace
// Decodes base64 text into bytes. Characters that base64_decode_block() does not recognize
// are skipped, so the result may be shorter than the initial size estimate.
Array<byte> decodeBase64(ArrayPtr<const char> input) {
  base64_decodestate state;
  base64_init_decodestate(&state);

  // Upper bound on the decoded size: six bits per input character, rounded up to whole bytes.
  auto output = heapArray<byte>((input.size() * 6 + 7) / 8);

  // base64_decode_block() takes its length, and returns its count, as `int`. Feed it the input
  // in bounded chunks so that neither value can overflow for very large inputs; the decoder
  // state carries the partial byte from one chunk to the next. For empty input the decoder is
  // not invoked at all, so the zero-length buffer is never handed to it.
  const size_t MAX_CHUNK = 1 << 30;
  const char* in = input.begin();
  char* out = reinterpret_cast<char*>(output.begin());
  size_t remaining = input.size();
  size_t n = 0;
  while (remaining > 0) {
    size_t chunk = remaining < MAX_CHUNK ? remaining : MAX_CHUNK;
    n += base64_decode_block(in, static_cast<int>(chunk), out + n, &state);
    in += chunk;
    remaining -= chunk;
  }

  if (n < output.size()) {
    // Fewer bytes were produced than estimated; return an array of exactly the right size.
    auto copy = heapArray<byte>(n);
    memcpy(copy.begin(), output.begin(), n);
    output = kj::mv(copy);
  }

  return output;
}
} // namespace kj
// Copyright (c) 2017 Cloudflare, Inc. and contributors
// Licensed under the MIT License:
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef KJ_ENCODING_H_
#define KJ_ENCODING_H_
// Functions for encoding/decoding bytes and text in common formats, including:
// - UTF-{8,16,32}
// - Hex
// - URI encoding
// - Base64
#if defined(__GNUC__) && !KJ_HEADER_WARNINGS
#pragma GCC system_header
#endif
#include "string.h"
namespace kj {
template <typename ResultType>
struct EncodingResult: public ResultType {
  // A ResultType (a String or wide-char array) that additionally records whether the
  // conversion producing it ran into errors in its input. It can be used anywhere the plain
  // ResultType could.
  //
  // Each encoding/decoding function that returns this type will "work around" errors in some
  // way, so an application doesn't strictly have to check `hadErrors`. E.g. the Unicode
  // functions replace errors with U+FFFD in the output.
  //
  // Through magic, KJ_IF_MAYBE() and KJ_{REQUIRE,ASSERT}_NONNULL() work on EncodingResult<T>
  // exactly as if it were a Maybe<T> that is null in case of errors.

  const bool hadErrors;
  // True if any errors were encountered in the input.

  inline EncodingResult(ResultType&& value, bool errorsSeen)
      : ResultType(kj::mv(value)), hadErrors(errorsSeen) {}
};
EncodingResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false);
// Convert UTF-8 text (which KJ strings use) to UTF-16 or UTF-32.
//
// If `nulTerminate` is true, an extra NUL character will be added to the end of the output.
//
// Invalid input does not cause failure: the returned data contains the Unicode replacement
// character (U+FFFD) in place of errors, and the result's `hadErrors` flag can be inspected to
// find out whether any were encountered.
//
// The returned arrays are in platform-native endianness (otherwise they wouldn't really be
// char16_t / char32_t).
EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16);
EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32);
// Convert UTF-16 or UTF-32 to UTF-8 (which KJ strings use).
//
// The input should NOT include a NUL terminator; any NUL characters in the input array will be
// preserved in the output.
//
// Invalid input does not cause failure: the returned data contains the Unicode replacement
// character (U+FFFD) in place of errors, and the result's `hadErrors` flag can be inspected to
// find out whether any were encountered.
//
// The input must be in platform-native endianness. BOMs are NOT recognized by these functions.
String encodeHex(ArrayPtr<const byte> bytes);
EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text);
// Encode/decode bytes as hex strings. decodeHex() reports problems with its input through the
// `hadErrors` flag of its result.
String encodeUriComponent(ArrayPtr<const byte> bytes);
String encodeUriComponent(ArrayPtr<const char> bytes);
EncodingResult<Array<byte>> decodeBinaryUriComponent(
ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text);
// Encode/decode URI components using % escapes. See Javascript's encodeURIComponent().
//
// decodeBinaryUriComponent() returns the decoded bytes, with one extra zero byte appended if
// `nulTerminate` is true. decodeUriComponent() returns the same bytes as a String.
//
// A '%' that is not followed by two hex digits does not stop decoding, but sets `hadErrors` in
// the result.
String encodeCEscape(ArrayPtr<const byte> bytes);
String encodeCEscape(ArrayPtr<const char> bytes);
EncodingResult<Array<byte>> decodeBinaryCEscape(
ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<String> decodeCEscape(ArrayPtr<const char> text);
// Encode/decode text using C-style backslash escapes.
//
// Encoding uses the short escapes (\a \b \f \n \r \t \v \' \" \\) where they exist and
// three-digit octal escapes for other bytes below 0x20 and for 0x7f; all other bytes are
// copied through unchanged.
//
// Decoding additionally understands \x hex escapes and \u / \U Unicode escapes. Malformed
// escapes do not stop decoding, but set `hadErrors` in the result.
//
// decodeBinaryCEscape() returns the decoded bytes, with one extra zero byte appended if
// `nulTerminate` is true. decodeCEscape() returns the same bytes as a String.
String encodeBase64(ArrayPtr<const byte> bytes, bool breakLines = false);
// Encode the given bytes as base64 text. If `breakLines` is true, line breaks will be inserted
// into the output every 72 characters (e.g. for encoding e-mail bodies).
Array<byte> decodeBase64(ArrayPtr<const char> text);
// Decode base64 text. Non-base64 characters are ignored and padding characters are not required;
// as such, this function never fails.
// =======================================================================================
// inline implementation details
namespace _ {  // private

// readMaybe() overloads for EncodingResult<T>: each yields "null" when the result had errors
// and the underlying value otherwise. Presumably these are what let KJ_IF_MAYBE() and
// KJ_{REQUIRE,ASSERT}_NONNULL() treat an EncodingResult<T> like a Maybe<T>.

template <typename T>
NullableValue<T> readMaybe(EncodingResult<T>&& value) {
  // Rvalue form: moves the value out when there were no errors.
  if (!value.hadErrors) {
    return kj::mv(value);
  }
  return nullptr;
}

template <typename T>
T* readMaybe(EncodingResult<T>& value) {
  // Mutable lvalue form: points at the value in place.
  return value.hadErrors ? nullptr : &value;
}

template <typename T>
const T* readMaybe(const EncodingResult<T>& value) {
  // Const lvalue form: points at the value in place.
  return value.hadErrors ? nullptr : &value;
}

}  // namespace _ (private)
inline String encodeUriComponent(ArrayPtr<const char> text) {
  // Text overload: reinterprets the characters as bytes and defers to the byte overload.
  const auto raw = text.asBytes();
  return encodeUriComponent(raw);
}
inline EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text) {
  // Decode to NUL-terminated bytes, then adopt that buffer as a String, passing the error
  // flag through unchanged.
  auto decoded = decodeBinaryUriComponent(text, true);
  const bool errors = decoded.hadErrors;
  return { String(decoded.releaseAsChars()), errors };
}
inline String encodeCEscape(ArrayPtr<const char> text) {
  // Text overload: reinterprets the characters as bytes and defers to the byte overload.
  const auto raw = text.asBytes();
  return encodeCEscape(raw);
}
inline EncodingResult<String> decodeCEscape(ArrayPtr<const char> text) {
  // Decode to NUL-terminated bytes, then adopt that buffer as a String, passing the error
  // flag through unchanged.
  auto decoded = decodeBinaryCEscape(text, true);
  const bool errors = decoded.hadErrors;
  return { String(decoded.releaseAsChars()), errors };
}
// If you pass a string literal to a function taking ArrayPtr<const char>, it'll include the NUL
// terminator, which is surprising. Let's add overloads that avoid that. In practice this probably
// only matters for encoding-test.c++.
template <size_t sizeWithNul>
inline EncodingResult<Array<char16_t>> encodeUtf16(
    const char (&text)[sizeWithNul], bool nulTerminate=false) {
  // Array overload: drops the array's last element (a string literal's NUL terminator).
  auto withoutNul = arrayPtr(text, sizeWithNul - 1);
  return encodeUtf16(withoutNul, nulTerminate);
}
template <size_t sizeWithNul>
inline EncodingResult<Array<char32_t>> encodeUtf32(
    const char (&text)[sizeWithNul], bool nulTerminate=false) {
  // Array overload: drops the array's last element (a string literal's NUL terminator).
  auto withoutNul = arrayPtr(text, sizeWithNul - 1);
  return encodeUtf32(withoutNul, nulTerminate);
}
template <size_t sizeWithNul>
inline EncodingResult<String> decodeUtf16(const char16_t (&utf16)[sizeWithNul]) {
  // Array overload: drops the array's last element (a string literal's NUL terminator).
  auto withoutNul = arrayPtr(utf16, sizeWithNul - 1);
  return decodeUtf16(withoutNul);
}
template <size_t sizeWithNul>
inline EncodingResult<String> decodeUtf32(const char32_t (&utf32)[sizeWithNul]) {
  // Array overload: drops the array's last element (a string literal's NUL terminator).
  auto withoutNul = arrayPtr(utf32, sizeWithNul - 1);
  return decodeUtf32(withoutNul);
}
template <size_t sizeWithNul>
inline EncodingResult<Array<byte>> decodeHex(const char (&text)[sizeWithNul]) {
  // Array overload: drops the array's last element (a string literal's NUL terminator).
  auto withoutNul = arrayPtr(text, sizeWithNul - 1);
  return decodeHex(withoutNul);
}
template <size_t sizeWithNul>
inline String encodeUriComponent(const char (&text)[sizeWithNul]) {
  // Array overload: drops the array's last element (a string literal's NUL terminator).
  auto withoutNul = arrayPtr(text, sizeWithNul - 1);
  return encodeUriComponent(withoutNul);
}
template <size_t s>
inline Array<byte> decodeBinaryUriComponent(const char (&text)[s]) {
return decodeBinaryUriComponent(arrayPtr(text, s - 1));
}
template <size_t sizeWithNul>
inline EncodingResult<String> decodeUriComponent(const char (&text)[sizeWithNul]) {
  // Array overload: drops the array's last element (a string literal's NUL terminator).
  auto withoutNul = arrayPtr(text, sizeWithNul - 1);
  return decodeUriComponent(withoutNul);
}
template <size_t sizeWithNul>
inline String encodeCEscape(const char (&text)[sizeWithNul]) {
  // Array overload: drops the array's last element (a string literal's NUL terminator).
  auto withoutNul = arrayPtr(text, sizeWithNul - 1);
  return encodeCEscape(withoutNul);
}
template <size_t sizeWithNul>
inline EncodingResult<Array<byte>> decodeBinaryCEscape(const char (&text)[sizeWithNul]) {
  // Array overload: drops the array's last element (a string literal's NUL terminator).
  auto withoutNul = arrayPtr(text, sizeWithNul - 1);
  return decodeBinaryCEscape(withoutNul);
}
template <size_t sizeWithNul>
inline EncodingResult<String> decodeCEscape(const char (&text)[sizeWithNul]) {
  // Array overload: drops the array's last element (a string literal's NUL terminator).
  auto withoutNul = arrayPtr(text, sizeWithNul - 1);
  return decodeCEscape(withoutNul);
}
template <size_t sizeWithNul>
Array<byte> decodeBase64(const char (&text)[sizeWithNul]) {
  // Array overload: drops the array's last element (a string literal's NUL terminator).
  auto withoutNul = arrayPtr(text, sizeWithNul - 1);
  return decodeBase64(withoutNul);
}
} // namespace kj
#endif // KJ_ENCODING_H_
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment