Commit df52bf86 authored by Kenton Varda's avatar Kenton Varda

Improve KJ encoding lib error handling:

- Rename UtfResult -> EncodingResult
- Make it usable like a Maybe, so that we don't need separate "try" functions.
- Check errors in hex decoding and URI decoding.
parent 03800dfa
...@@ -915,7 +915,6 @@ public: ...@@ -915,7 +915,6 @@ public:
return value; return value;
} }
private: // internal interface used by friends only
inline NullableValue() noexcept: isSet(false) {} inline NullableValue() noexcept: isSet(false) {}
inline NullableValue(T&& t) noexcept(noexcept(T(instance<T&&>()))) inline NullableValue(T&& t) noexcept(noexcept(T(instance<T&&>())))
: isSet(true) { : isSet(true) {
......
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
namespace kj { namespace kj {
namespace { namespace {
CappedArray<char, sizeof(char ) * 2 + 1> hex(byte i) { return kj::hex((uint8_t )i); }
CappedArray<char, sizeof(char ) * 2 + 1> hex(char i) { return kj::hex((uint8_t )i); } CappedArray<char, sizeof(char ) * 2 + 1> hex(char i) { return kj::hex((uint8_t )i); }
CappedArray<char, sizeof(char16_t) * 2 + 1> hex(char16_t i) { return kj::hex((uint16_t)i); } CappedArray<char, sizeof(char16_t) * 2 + 1> hex(char16_t i) { return kj::hex((uint16_t)i); }
CappedArray<char, sizeof(char32_t) * 2 + 1> hex(char32_t i) { return kj::hex((uint32_t)i); } CappedArray<char, sizeof(char32_t) * 2 + 1> hex(char32_t i) { return kj::hex((uint32_t)i); }
...@@ -34,7 +35,7 @@ CappedArray<char, sizeof(char32_t) * 2 + 1> hex(char32_t i) { return kj::hex((ui ...@@ -34,7 +35,7 @@ CappedArray<char, sizeof(char32_t) * 2 + 1> hex(char32_t i) { return kj::hex((ui
// TODO(cleanup): Should this go into string.h with the other definitions of hex()? // TODO(cleanup): Should this go into string.h with the other definitions of hex()?
template <typename T> template <typename T>
void expectUtf(UtfResult<T> result, void expectRes(EncodingResult<T> result,
ArrayPtr<const Decay<decltype(result[0])>> expected, ArrayPtr<const Decay<decltype(result[0])>> expected,
bool errors = false) { bool errors = false) {
if (errors) { if (errors) {
...@@ -50,147 +51,153 @@ void expectUtf(UtfResult<T> result, ...@@ -50,147 +51,153 @@ void expectUtf(UtfResult<T> result,
} }
template <typename T, size_t s> template <typename T, size_t s>
void expectUtf(UtfResult<T> result, void expectRes(EncodingResult<T> result,
const Decay<decltype(result[0])> (&expected)[s], const Decay<decltype(result[0])> (&expected)[s],
bool errors = false) { bool errors = false) {
expectUtf(kj::mv(result), arrayPtr(expected, s - 1), errors); expectRes(kj::mv(result), arrayPtr(expected, s - 1), errors);
}
template <typename T, size_t s>
void expectRes(EncodingResult<T> result,
byte (&expected)[s],
bool errors = false) {
expectRes(kj::mv(result), arrayPtr(expected, s), errors);
} }
KJ_TEST("encode UTF-8 to UTF-16") { KJ_TEST("encode UTF-8 to UTF-16") {
expectUtf(encodeUtf16(u8"foo"), u"foo"); expectRes(encodeUtf16(u8"foo"), u"foo");
expectUtf(encodeUtf16(u8"Здравствуйте"), u"Здравствуйте"); expectRes(encodeUtf16(u8"Здравствуйте"), u"Здравствуйте");
expectUtf(encodeUtf16(u8"中国网络"), u"中国网络"); expectRes(encodeUtf16(u8"中国网络"), u"中国网络");
expectUtf(encodeUtf16(u8"😺☁☄🐵"), u"😺☁☄🐵"); expectRes(encodeUtf16(u8"😺☁☄🐵"), u"😺☁☄🐵");
} }
KJ_TEST("invalid UTF-8 to UTF-16") { KJ_TEST("invalid UTF-8 to UTF-16") {
// Disembodied continuation byte. // Disembodied continuation byte.
expectUtf(encodeUtf16("\x80"), u"\ufffd", true); expectRes(encodeUtf16("\x80"), u"\ufffd", true);
expectUtf(encodeUtf16("f\xbfo"), u"f\ufffdo", true); expectRes(encodeUtf16("f\xbfo"), u"f\ufffdo", true);
expectUtf(encodeUtf16("f\xbf\x80\xb0o"), u"f\ufffdo", true); expectRes(encodeUtf16("f\xbf\x80\xb0o"), u"f\ufffdo", true);
// Missing continuation bytes. // Missing continuation bytes.
expectUtf(encodeUtf16("\xc2x"), u"\ufffdx", true); expectRes(encodeUtf16("\xc2x"), u"\ufffdx", true);
expectUtf(encodeUtf16("\xe0x"), u"\ufffdx", true); expectRes(encodeUtf16("\xe0x"), u"\ufffdx", true);
expectUtf(encodeUtf16("\xe0\xa0x"), u"\ufffdx", true); expectRes(encodeUtf16("\xe0\xa0x"), u"\ufffdx", true);
expectUtf(encodeUtf16("\xf0x"), u"\ufffdx", true); expectRes(encodeUtf16("\xf0x"), u"\ufffdx", true);
expectUtf(encodeUtf16("\xf0\x90x"), u"\ufffdx", true); expectRes(encodeUtf16("\xf0\x90x"), u"\ufffdx", true);
expectUtf(encodeUtf16("\xf0\x90\x80x"), u"\ufffdx", true); expectRes(encodeUtf16("\xf0\x90\x80x"), u"\ufffdx", true);
// Overlong sequences. // Overlong sequences.
expectUtf(encodeUtf16("\xc0\x80"), u"\ufffd", true); expectRes(encodeUtf16("\xc0\x80"), u"\ufffd", true);
expectUtf(encodeUtf16("\xc1\xbf"), u"\ufffd", true); expectRes(encodeUtf16("\xc1\xbf"), u"\ufffd", true);
expectUtf(encodeUtf16("\xc2\x80"), u"\u0080", false); expectRes(encodeUtf16("\xc2\x80"), u"\u0080", false);
expectUtf(encodeUtf16("\xdf\xbf"), u"\u07ff", false); expectRes(encodeUtf16("\xdf\xbf"), u"\u07ff", false);
expectUtf(encodeUtf16("\xe0\x80\x80"), u"\ufffd", true); expectRes(encodeUtf16("\xe0\x80\x80"), u"\ufffd", true);
expectUtf(encodeUtf16("\xe0\x9f\xbf"), u"\ufffd", true); expectRes(encodeUtf16("\xe0\x9f\xbf"), u"\ufffd", true);
expectUtf(encodeUtf16("\xe0\xa0\x80"), u"\u0800", false); expectRes(encodeUtf16("\xe0\xa0\x80"), u"\u0800", false);
expectUtf(encodeUtf16("\xef\xbf\xbf"), u"\uffff", false); expectRes(encodeUtf16("\xef\xbf\xbf"), u"\uffff", false);
expectUtf(encodeUtf16("\xf0\x80\x80\x80"), u"\ufffd", true); expectRes(encodeUtf16("\xf0\x80\x80\x80"), u"\ufffd", true);
expectUtf(encodeUtf16("\xf0\x8f\xbf\xbf"), u"\ufffd", true); expectRes(encodeUtf16("\xf0\x8f\xbf\xbf"), u"\ufffd", true);
expectUtf(encodeUtf16("\xf0\x90\x80\x80"), u"\U00010000", false); expectRes(encodeUtf16("\xf0\x90\x80\x80"), u"\U00010000", false);
expectUtf(encodeUtf16("\xf4\x8f\xbf\xbf"), u"\U0010ffff", false); expectRes(encodeUtf16("\xf4\x8f\xbf\xbf"), u"\U0010ffff", false);
// Out of Unicode range. // Out of Unicode range.
expectUtf(encodeUtf16("\xf5\x80\x80\x80"), u"\ufffd", true); expectRes(encodeUtf16("\xf5\x80\x80\x80"), u"\ufffd", true);
expectUtf(encodeUtf16("\xf8\xbf\x80\x80\x80"), u"\ufffd", true); expectRes(encodeUtf16("\xf8\xbf\x80\x80\x80"), u"\ufffd", true);
expectUtf(encodeUtf16("\xfc\xbf\x80\x80\x80\x80"), u"\ufffd", true); expectRes(encodeUtf16("\xfc\xbf\x80\x80\x80\x80"), u"\ufffd", true);
expectUtf(encodeUtf16("\xfe\xbf\x80\x80\x80\x80\x80"), u"\ufffd", true); expectRes(encodeUtf16("\xfe\xbf\x80\x80\x80\x80\x80"), u"\ufffd", true);
expectUtf(encodeUtf16("\xff\xbf\x80\x80\x80\x80\x80\x80"), u"\ufffd", true); expectRes(encodeUtf16("\xff\xbf\x80\x80\x80\x80\x80\x80"), u"\ufffd", true);
} }
KJ_TEST("encode UTF-8 to UTF-32") { KJ_TEST("encode UTF-8 to UTF-32") {
expectUtf(encodeUtf32(u8"foo"), U"foo"); expectRes(encodeUtf32(u8"foo"), U"foo");
expectUtf(encodeUtf32(u8"Здравствуйте"), U"Здравствуйте"); expectRes(encodeUtf32(u8"Здравствуйте"), U"Здравствуйте");
expectUtf(encodeUtf32(u8"中国网络"), U"中国网络"); expectRes(encodeUtf32(u8"中国网络"), U"中国网络");
expectUtf(encodeUtf32(u8"😺☁☄🐵"), U"😺☁☄🐵"); expectRes(encodeUtf32(u8"😺☁☄🐵"), U"😺☁☄🐵");
} }
KJ_TEST("invalid UTF-8 to UTF-32") { KJ_TEST("invalid UTF-8 to UTF-32") {
// Disembodied continuation byte. // Disembodied continuation byte.
expectUtf(encodeUtf32("\x80"), U"\ufffd", true); expectRes(encodeUtf32("\x80"), U"\ufffd", true);
expectUtf(encodeUtf32("f\xbfo"), U"f\ufffdo", true); expectRes(encodeUtf32("f\xbfo"), U"f\ufffdo", true);
expectUtf(encodeUtf32("f\xbf\x80\xb0o"), U"f\ufffdo", true); expectRes(encodeUtf32("f\xbf\x80\xb0o"), U"f\ufffdo", true);
// Missing continuation bytes. // Missing continuation bytes.
expectUtf(encodeUtf32("\xc2x"), U"\ufffdx", true); expectRes(encodeUtf32("\xc2x"), U"\ufffdx", true);
expectUtf(encodeUtf32("\xe0x"), U"\ufffdx", true); expectRes(encodeUtf32("\xe0x"), U"\ufffdx", true);
expectUtf(encodeUtf32("\xe0\xa0x"), U"\ufffdx", true); expectRes(encodeUtf32("\xe0\xa0x"), U"\ufffdx", true);
expectUtf(encodeUtf32("\xf0x"), U"\ufffdx", true); expectRes(encodeUtf32("\xf0x"), U"\ufffdx", true);
expectUtf(encodeUtf32("\xf0\x90x"), U"\ufffdx", true); expectRes(encodeUtf32("\xf0\x90x"), U"\ufffdx", true);
expectUtf(encodeUtf32("\xf0\x90\x80x"), U"\ufffdx", true); expectRes(encodeUtf32("\xf0\x90\x80x"), U"\ufffdx", true);
// Overlong sequences. // Overlong sequences.
expectUtf(encodeUtf32("\xc0\x80"), U"\ufffd", true); expectRes(encodeUtf32("\xc0\x80"), U"\ufffd", true);
expectUtf(encodeUtf32("\xc1\xbf"), U"\ufffd", true); expectRes(encodeUtf32("\xc1\xbf"), U"\ufffd", true);
expectUtf(encodeUtf32("\xc2\x80"), U"\u0080", false); expectRes(encodeUtf32("\xc2\x80"), U"\u0080", false);
expectUtf(encodeUtf32("\xdf\xbf"), U"\u07ff", false); expectRes(encodeUtf32("\xdf\xbf"), U"\u07ff", false);
expectUtf(encodeUtf32("\xe0\x80\x80"), U"\ufffd", true); expectRes(encodeUtf32("\xe0\x80\x80"), U"\ufffd", true);
expectUtf(encodeUtf32("\xe0\x9f\xbf"), U"\ufffd", true); expectRes(encodeUtf32("\xe0\x9f\xbf"), U"\ufffd", true);
expectUtf(encodeUtf32("\xe0\xa0\x80"), U"\u0800", false); expectRes(encodeUtf32("\xe0\xa0\x80"), U"\u0800", false);
expectUtf(encodeUtf32("\xef\xbf\xbf"), U"\uffff", false); expectRes(encodeUtf32("\xef\xbf\xbf"), U"\uffff", false);
expectUtf(encodeUtf32("\xf0\x80\x80\x80"), U"\ufffd", true); expectRes(encodeUtf32("\xf0\x80\x80\x80"), U"\ufffd", true);
expectUtf(encodeUtf32("\xf0\x8f\xbf\xbf"), U"\ufffd", true); expectRes(encodeUtf32("\xf0\x8f\xbf\xbf"), U"\ufffd", true);
expectUtf(encodeUtf32("\xf0\x90\x80\x80"), U"\U00010000", false); expectRes(encodeUtf32("\xf0\x90\x80\x80"), U"\U00010000", false);
expectUtf(encodeUtf32("\xf4\x8f\xbf\xbf"), U"\U0010ffff", false); expectRes(encodeUtf32("\xf4\x8f\xbf\xbf"), U"\U0010ffff", false);
// Out of Unicode range. // Out of Unicode range.
expectUtf(encodeUtf32("\xf5\x80\x80\x80"), U"\ufffd", true); expectRes(encodeUtf32("\xf5\x80\x80\x80"), U"\ufffd", true);
expectUtf(encodeUtf32("\xf8\xbf\x80\x80\x80"), U"\ufffd", true); expectRes(encodeUtf32("\xf8\xbf\x80\x80\x80"), U"\ufffd", true);
expectUtf(encodeUtf32("\xfc\xbf\x80\x80\x80\x80"), U"\ufffd", true); expectRes(encodeUtf32("\xfc\xbf\x80\x80\x80\x80"), U"\ufffd", true);
expectUtf(encodeUtf32("\xfe\xbf\x80\x80\x80\x80\x80"), U"\ufffd", true); expectRes(encodeUtf32("\xfe\xbf\x80\x80\x80\x80\x80"), U"\ufffd", true);
expectUtf(encodeUtf32("\xff\xbf\x80\x80\x80\x80\x80\x80"), U"\ufffd", true); expectRes(encodeUtf32("\xff\xbf\x80\x80\x80\x80\x80\x80"), U"\ufffd", true);
} }
KJ_TEST("decode UTF-16 to UTF-8") { KJ_TEST("decode UTF-16 to UTF-8") {
expectUtf(decodeUtf16(u"foo"), u8"foo"); expectRes(decodeUtf16(u"foo"), u8"foo");
expectUtf(decodeUtf16(u"Здравствуйте"), u8"Здравствуйте"); expectRes(decodeUtf16(u"Здравствуйте"), u8"Здравствуйте");
expectUtf(decodeUtf16(u"中国网络"), u8"中国网络"); expectRes(decodeUtf16(u"中国网络"), u8"中国网络");
expectUtf(decodeUtf16(u"😺☁☄🐵"), u8"😺☁☄🐵"); expectRes(decodeUtf16(u"😺☁☄🐵"), u8"😺☁☄🐵");
} }
KJ_TEST("invalid UTF-16 to UTF-8") { KJ_TEST("invalid UTF-16 to UTF-8") {
// Surrogates in wrong order. // Surrogates in wrong order.
expectUtf(decodeUtf16(u"\xd7ff\xdc00\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true); expectRes(decodeUtf16(u"\xd7ff\xdc00\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true);
// Missing second surrogate. // Missing second surrogate.
expectUtf(decodeUtf16(u"f\xd800"), u8"f\ufffd", true); expectRes(decodeUtf16(u"f\xd800"), u8"f\ufffd", true);
expectUtf(decodeUtf16(u"f\xd800x"), u8"f\ufffdx", true); expectRes(decodeUtf16(u"f\xd800x"), u8"f\ufffdx", true);
expectUtf(decodeUtf16(u"f\xd800\xd800x"), u8"f\ufffd\ufffdx", true); expectRes(decodeUtf16(u"f\xd800\xd800x"), u8"f\ufffd\ufffdx", true);
} }
KJ_TEST("decode UTF-32 to UTF-8") { KJ_TEST("decode UTF-32 to UTF-8") {
expectUtf(decodeUtf32(U"foo"), u8"foo"); expectRes(decodeUtf32(U"foo"), u8"foo");
expectUtf(decodeUtf32(U"Здравствуйте"), u8"Здравствуйте"); expectRes(decodeUtf32(U"Здравствуйте"), u8"Здравствуйте");
expectUtf(decodeUtf32(U"中国网络"), u8"中国网络"); expectRes(decodeUtf32(U"中国网络"), u8"中国网络");
expectUtf(decodeUtf32(U"😺☁☄🐵"), u8"😺☁☄🐵"); expectRes(decodeUtf32(U"😺☁☄🐵"), u8"😺☁☄🐵");
} }
KJ_TEST("invalid UTF-32 to UTF-8") { KJ_TEST("invalid UTF-32 to UTF-8") {
// Surrogates rejected. // Surrogates rejected.
expectUtf(decodeUtf32(U"\xd7ff\xdc00\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true); expectRes(decodeUtf32(U"\xd7ff\xdc00\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true);
// Even if it would be a valid surrogate pair in UTF-16. // Even if it would be a valid surrogate pair in UTF-16.
expectUtf(decodeUtf32(U"\xd7ff\xd800\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true); expectRes(decodeUtf32(U"\xd7ff\xd800\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true);
} }
KJ_TEST("tryEncode / tryDecode") { KJ_TEST("EncodingResult as a Maybe") {
KJ_EXPECT(tryEncodeUtf16("\x80") == nullptr); KJ_IF_MAYBE(result, encodeUtf16("\x80")) {
KJ_EXPECT(ArrayPtr<const char16_t>(KJ_ASSERT_NONNULL(tryEncodeUtf16("foo"))) KJ_FAIL_EXPECT("expected failure");
== arrayPtr(u"foo", 3)); }
KJ_EXPECT(tryEncodeUtf32("\x80") == nullptr); KJ_IF_MAYBE(result, encodeUtf16("foo")) {
KJ_EXPECT(ArrayPtr<const char32_t>(KJ_ASSERT_NONNULL(tryEncodeUtf32("foo"))) // good
== arrayPtr(U"foo", 3)); } else {
KJ_FAIL_EXPECT("expected success");
}
KJ_EXPECT(tryDecodeUtf16(u"\xd800") == nullptr); KJ_EXPECT(KJ_ASSERT_NONNULL(decodeUtf16(u"foo")) == "foo");
KJ_EXPECT(KJ_ASSERT_NONNULL(tryDecodeUtf16(u"foo")) == "foo");
KJ_EXPECT(tryDecodeUtf32(U"\xd800") == nullptr);
KJ_EXPECT(KJ_ASSERT_NONNULL(tryDecodeUtf32(U"foo")) == "foo");
} }
// ======================================================================================= // =======================================================================================
...@@ -199,7 +206,16 @@ KJ_TEST("hex encoding/decoding") { ...@@ -199,7 +206,16 @@ KJ_TEST("hex encoding/decoding") {
byte bytes[] = {0x12, 0x34, 0xab, 0xf2}; byte bytes[] = {0x12, 0x34, 0xab, 0xf2};
KJ_EXPECT(encodeHex(bytes) == "1234abf2"); KJ_EXPECT(encodeHex(bytes) == "1234abf2");
KJ_EXPECT(decodeHex("1234abf2").asPtr() == bytes);
expectRes(decodeHex("1234abf2"), bytes);
expectRes(decodeHex("1234abf21"), bytes, true);
bytes[2] = 0xa0;
expectRes(decodeHex("1234axf2"), bytes, true);
bytes[2] = 0x0b;
expectRes(decodeHex("1234xbf2"), bytes, true);
} }
KJ_TEST("URI encoding/decoding") { KJ_TEST("URI encoding/decoding") {
...@@ -208,8 +224,13 @@ KJ_TEST("URI encoding/decoding") { ...@@ -208,8 +224,13 @@ KJ_TEST("URI encoding/decoding") {
KJ_EXPECT(encodeUriComponent("\xab\xba") == "%ab%ba"); KJ_EXPECT(encodeUriComponent("\xab\xba") == "%ab%ba");
KJ_EXPECT(encodeUriComponent(StringPtr("foo\0bar", 7)) == "foo%00bar"); KJ_EXPECT(encodeUriComponent(StringPtr("foo\0bar", 7)) == "foo%00bar");
KJ_EXPECT(decodeUriComponent("foo%20bar") == "foo bar"); expectRes(decodeUriComponent("foo%20bar"), "foo bar");
KJ_EXPECT(decodeUriComponent("%ab%BA") == "\xab\xba"); expectRes(decodeUriComponent("%ab%BA"), "\xab\xba");
expectRes(decodeUriComponent("foo%1xxx"), "foo\1xxx", true);
expectRes(decodeUriComponent("foo%1"), "foo\1", true);
expectRes(decodeUriComponent("foo%xxx"), "fooxxx", true);
expectRes(decodeUriComponent("foo%"), "foo", true);
byte bytes[] = {12, 34, 56}; byte bytes[] = {12, 34, 56};
KJ_EXPECT(decodeBinaryUriComponent(encodeUriComponent(bytes)).asPtr() == bytes); KJ_EXPECT(decodeBinaryUriComponent(encodeUriComponent(bytes)).asPtr() == bytes);
...@@ -221,23 +242,23 @@ KJ_TEST("C escape encoding/decoding") { ...@@ -221,23 +242,23 @@ KJ_TEST("C escape encoding/decoding") {
KJ_EXPECT(encodeCEscape("foo\x01\x7fxxx") == KJ_EXPECT(encodeCEscape("foo\x01\x7fxxx") ==
"foo\\001\\177xxx"); "foo\\001\\177xxx");
expectUtf(decodeCEscape("fooo\\a\\b\\f\\n\\r\\t\\v\\\'\\\"\\\\bar"), expectRes(decodeCEscape("fooo\\a\\b\\f\\n\\r\\t\\v\\\'\\\"\\\\bar"),
"fooo\a\b\f\n\r\t\v\'\"\\bar"); "fooo\a\b\f\n\r\t\v\'\"\\bar");
expectUtf(decodeCEscape("foo\\x01\\x7fxxx"), "foo\x01\x7fxxx"); expectRes(decodeCEscape("foo\\x01\\x7fxxx"), "foo\x01\x7fxxx");
expectUtf(decodeCEscape("foo\\001\\177234"), "foo\001\177234"); expectRes(decodeCEscape("foo\\001\\177234"), "foo\001\177234");
expectUtf(decodeCEscape("foo\\x1"), "foo\x1"); expectRes(decodeCEscape("foo\\x1"), "foo\x1");
expectUtf(decodeCEscape("foo\\1"), "foo\1"); expectRes(decodeCEscape("foo\\1"), "foo\1");
expectUtf(decodeCEscape("foo\\u1234bar"), u8"foo\u1234bar"); expectRes(decodeCEscape("foo\\u1234bar"), u8"foo\u1234bar");
expectUtf(decodeCEscape("foo\\U00045678bar"), u8"foo\U00045678bar"); expectRes(decodeCEscape("foo\\U00045678bar"), u8"foo\U00045678bar");
// Error cases. // Error cases.
expectUtf(decodeCEscape("foo\\"), "foo", true); expectRes(decodeCEscape("foo\\"), "foo", true);
expectUtf(decodeCEscape("foo\\x123x"), u8"foo\x23x", true); expectRes(decodeCEscape("foo\\x123x"), u8"foo\x23x", true);
expectUtf(decodeCEscape("foo\\u12"), u8"foo\u0012", true); expectRes(decodeCEscape("foo\\u12"), u8"foo\u0012", true);
expectUtf(decodeCEscape("foo\\u12xxx"), u8"foo\u0012xxx", true); expectRes(decodeCEscape("foo\\u12xxx"), u8"foo\u0012xxx", true);
expectUtf(decodeCEscape("foo\\U12"), u8"foo\u0012", true); expectRes(decodeCEscape("foo\\U12"), u8"foo\u0012", true);
expectUtf(decodeCEscape("foo\\U12xxxxxxxx"), u8"foo\u0012xxxxxxxx", true); expectRes(decodeCEscape("foo\\U12xxxxxxxx"), u8"foo\u0012xxxxxxxx", true);
} }
KJ_TEST("base64 encoding/decoding") { KJ_TEST("base64 encoding/decoding") {
......
...@@ -41,7 +41,7 @@ inline void addChar32(Vector<char32_t>& vec, char32_t u) { ...@@ -41,7 +41,7 @@ inline void addChar32(Vector<char32_t>& vec, char32_t u) {
} }
template <typename T> template <typename T>
UtfResult<Array<T>> encodeUtf(ArrayPtr<const char> text, bool nulTerminate) { EncodingResult<Array<T>> encodeUtf(ArrayPtr<const char> text, bool nulTerminate) {
Vector<T> result(text.size() + nulTerminate); Vector<T> result(text.size() + nulTerminate);
bool hadErrors = false; bool hadErrors = false;
...@@ -125,33 +125,15 @@ UtfResult<Array<T>> encodeUtf(ArrayPtr<const char> text, bool nulTerminate) { ...@@ -125,33 +125,15 @@ UtfResult<Array<T>> encodeUtf(ArrayPtr<const char> text, bool nulTerminate) {
} // namespace } // namespace
UtfResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate) { EncodingResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate) {
return encodeUtf<char16_t>(text, nulTerminate); return encodeUtf<char16_t>(text, nulTerminate);
} }
UtfResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate) { EncodingResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate) {
return encodeUtf<char32_t>(text, nulTerminate); return encodeUtf<char32_t>(text, nulTerminate);
} }
Maybe<Array<char16_t>> tryEncodeUtf16(ArrayPtr<const char> text, bool nulTerminate) { EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) {
auto result = encodeUtf16(text, nulTerminate);
if (result.hadErrors) {
return nullptr;
} else {
return kj::mv(result);
}
}
Maybe<Array<char32_t>> tryEncodeUtf32(ArrayPtr<const char> text, bool nulTerminate) {
auto result = encodeUtf32(text, nulTerminate);
if (result.hadErrors) {
return nullptr;
} else {
return kj::mv(result);
}
}
UtfResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) {
Vector<char> result(utf16.size() + 1); Vector<char> result(utf16.size() + 1);
bool hadErrors = false; bool hadErrors = false;
...@@ -202,7 +184,7 @@ UtfResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) { ...@@ -202,7 +184,7 @@ UtfResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) {
return { String(result.releaseAsArray()), hadErrors }; return { String(result.releaseAsArray()), hadErrors };
} }
UtfResult<String> decodeUtf32(ArrayPtr<const char32_t> utf16) { EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf16) {
Vector<char> result(utf16.size() + 1); Vector<char> result(utf16.size() + 1);
bool hadErrors = false; bool hadErrors = false;
...@@ -247,28 +229,33 @@ UtfResult<String> decodeUtf32(ArrayPtr<const char32_t> utf16) { ...@@ -247,28 +229,33 @@ UtfResult<String> decodeUtf32(ArrayPtr<const char32_t> utf16) {
return { String(result.releaseAsArray()), hadErrors }; return { String(result.releaseAsArray()), hadErrors };
} }
Maybe<String> tryDecodeUtf16(ArrayPtr<const char16_t> utf16) { // =======================================================================================
auto result = decodeUtf16(utf16);
if (result.hadErrors) { namespace {
return nullptr;
const char HEX_DIGITS[] = "0123456789abcdef";
static Maybe<uint> tryFromHexDigit(char c) {
if ('0' <= c && c <= '9') {
return c - '0';
} else if ('a' <= c && c <= 'f') {
return c - ('a' - 10);
} else if ('A' <= c && c <= 'F') {
return c - ('A' - 10);
} else { } else {
return kj::mv(result); return nullptr;
} }
} }
Maybe<String> tryDecodeUtf32(ArrayPtr<const char32_t> utf32) {
auto result = decodeUtf32(utf32); static Maybe<uint> tryFromOctDigit(char c) {
if (result.hadErrors) { if ('0' <= c && c <= '7') {
return nullptr; return c - '0';
} else { } else {
return kj::mv(result); return nullptr;
} }
} }
// ======================================================================================= } // namespace
namespace {
const char HEX_DIGITS[] = "0123456789abcdef";
}
String encodeHex(ArrayPtr<const byte> input) { String encodeHex(ArrayPtr<const byte> input) {
return strArray(KJ_MAP(b, input) { return strArray(KJ_MAP(b, input) {
...@@ -276,27 +263,26 @@ String encodeHex(ArrayPtr<const byte> input) { ...@@ -276,27 +263,26 @@ String encodeHex(ArrayPtr<const byte> input) {
}, ""); }, "");
} }
static uint fromDigit(char c) { EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text) {
if ('0' <= c && c <= '9') {
return c - '0';
} else if ('a' <= c && c <= 'z') {
return c - ('a' - 10);
} else if ('A' <= c && c <= 'Z') {
return c - ('A' - 10);
} else {
return 0;
}
}
Array<byte> decodeHex(ArrayPtr<const char> text) {
auto result = heapArray<byte>(text.size() / 2); auto result = heapArray<byte>(text.size() / 2);
bool hadErrors = text.size() % 2;
for (auto i: kj::indices(result)) { for (auto i: kj::indices(result)) {
result[i] = (fromDigit(text[i*2]) << 4) byte b = 0;
| (fromDigit(text[i*2+1])); KJ_IF_MAYBE(d1, tryFromHexDigit(text[i*2])) {
b = *d1 << 4;
} else {
hadErrors = true;
}
KJ_IF_MAYBE(d2, tryFromHexDigit(text[i*2+1])) {
b |= *d2;
} else {
hadErrors = true;
}
result[i] = b;
} }
return result; return { kj::mv(result), hadErrors };
} }
String encodeUriComponent(ArrayPtr<const byte> bytes) { String encodeUriComponent(ArrayPtr<const byte> bytes) {
...@@ -316,26 +302,41 @@ String encodeUriComponent(ArrayPtr<const byte> bytes) { ...@@ -316,26 +302,41 @@ String encodeUriComponent(ArrayPtr<const byte> bytes) {
return String(result.releaseAsArray()); return String(result.releaseAsArray());
} }
Array<byte> decodeBinaryUriComponent(ArrayPtr<const char> text, bool nulTerminate) { EncodingResult<Array<byte>> decodeBinaryUriComponent(
ArrayPtr<const char> text, bool nulTerminate) {
Vector<byte> result(text.size() + nulTerminate); Vector<byte> result(text.size() + nulTerminate);
bool hadErrors = false;
const char* ptr = text.begin(); const char* ptr = text.begin();
const char* end = text.end(); const char* end = text.end();
while (ptr < end) { while (ptr < end) {
if (*ptr == '%') { if (*ptr == '%') {
++ptr; ++ptr;
if (ptr == end) break;
byte b = fromDigit(*ptr++) << 4; if (ptr == end) {
if (ptr == end) break; hadErrors = true;
b |= fromDigit(*ptr++); } else KJ_IF_MAYBE(d1, tryFromHexDigit(*ptr)) {
result.add(b); byte b = *d1;
++ptr;
if (ptr == end) {
hadErrors = true;
} else KJ_IF_MAYBE(d2, tryFromHexDigit(*ptr)) {
b = (b << 4) | *d2;
++ptr;
} else {
hadErrors = true;
}
result.add(b);
} else {
hadErrors = true;
}
} else { } else {
result.add(*ptr++); result.add(*ptr++);
} }
} }
if (nulTerminate) result.add(0); if (nulTerminate) result.add(0);
return result.releaseAsArray(); return { result.releaseAsArray(), hadErrors };
} }
// ======================================================================================= // =======================================================================================
...@@ -374,31 +375,7 @@ String encodeCEscape(ArrayPtr<const byte> bytes) { ...@@ -374,31 +375,7 @@ String encodeCEscape(ArrayPtr<const byte> bytes) {
return String(escaped.releaseAsArray()); return String(escaped.releaseAsArray());
} }
namespace { EncodingResult<Array<byte>> decodeBinaryCEscape(ArrayPtr<const char> text, bool nulTerminate) {
static Maybe<uint> tryFromHexDigit(char c) {
if ('0' <= c && c <= '9') {
return c - '0';
} else if ('a' <= c && c <= 'f') {
return c - ('a' - 10);
} else if ('A' <= c && c <= 'F') {
return c - ('A' - 10);
} else {
return nullptr;
}
}
static Maybe<uint> tryFromOctDigit(char c) {
if ('0' <= c && c <= '7') {
return c - '0';
} else {
return nullptr;
}
}
} // namespace
UtfResult<Array<byte>> decodeBinaryCEscape(ArrayPtr<const char> text, bool nulTerminate) {
Vector<byte> result(text.size() + nulTerminate); Vector<byte> result(text.size() + nulTerminate);
bool hadErrors = false; bool hadErrors = false;
......
...@@ -36,24 +36,24 @@ ...@@ -36,24 +36,24 @@
namespace kj { namespace kj {
template <typename ResultType> template <typename ResultType>
struct UtfResult: public ResultType { struct EncodingResult: public ResultType {
// Equivalent to ResultType (a String or wide-char array) for all intents and purposes, except // Equivalent to ResultType (a String or wide-char array) for all intents and purposes, except
// that the bool `hadErrors` can be inspected to see if any errors were encountered in the input, // that the bool `hadErrors` can be inspected to see if any errors were encountered in the input.
// resulting in instances of the replacement character (U+FFFD) in the output. // Each encoding/decoding function that returns this type will "work around" errors in some way,
// so an application doesn't strictly have to check for errors. E.g. the Unicode functions
inline UtfResult(ResultType&& result, bool hadErrors) // replace errors with U+FFFD in the output.
//
// Through magic, KJ_IF_MAYBE() and KJ_{REQUIRE,ASSERT}_NONNULL() work on EncodingResult<T>
// exactly as if it were a Maybe<T> that is null in case of errors.
inline EncodingResult(ResultType&& result, bool hadErrors)
: ResultType(kj::mv(result)), hadErrors(hadErrors) {} : ResultType(kj::mv(result)), hadErrors(hadErrors) {}
const bool hadErrors; const bool hadErrors;
// If true, then invalid sequences were detected in the input and were replaced with the Unicode
// replacement character (U+FFFD) in the output. Many applications will choose to ignore this
// boolean and continue on with the damaged data.
}; };
UtfResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false); EncodingResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false);
UtfResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false); EncodingResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false);
Maybe<Array<char16_t>> tryEncodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false);
Maybe<Array<char32_t>> tryEncodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false);
// Convert UTF-8 text (which KJ strings use) to UTF-16 or UTF-32. // Convert UTF-8 text (which KJ strings use) to UTF-16 or UTF-32.
// //
// If `nulTerminate` is true, an extra NUL character will be added to the end of the output. // If `nulTerminate` is true, an extra NUL character will be added to the end of the output.
...@@ -64,10 +64,8 @@ Maybe<Array<char32_t>> tryEncodeUtf32(ArrayPtr<const char> text, bool nulTermina ...@@ -64,10 +64,8 @@ Maybe<Array<char32_t>> tryEncodeUtf32(ArrayPtr<const char> text, bool nulTermina
// The returned arrays are in platform-native endianness (otherwise they wouldn't really be // The returned arrays are in platform-native endianness (otherwise they wouldn't really be
// char16_t / char32_t). // char16_t / char32_t).
UtfResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16); EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16);
UtfResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32); EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32);
Maybe<String> tryDecodeUtf16(ArrayPtr<const char16_t> utf16);
Maybe<String> tryDecodeUtf32(ArrayPtr<const char32_t> utf32);
// Convert UTF-16 or UTF-32 to UTF-8 (which KJ strings use). // Convert UTF-16 or UTF-32 to UTF-8 (which KJ strings use).
// //
// The input should NOT include a NUL terminator; any NUL characters in the input array will be // The input should NOT include a NUL terminator; any NUL characters in the input array will be
...@@ -79,41 +77,76 @@ Maybe<String> tryDecodeUtf32(ArrayPtr<const char32_t> utf32); ...@@ -79,41 +77,76 @@ Maybe<String> tryDecodeUtf32(ArrayPtr<const char32_t> utf32);
// The input must be in platform-native endianness. BOMs are NOT recognized by these functions. // The input must be in platform-native endianness. BOMs are NOT recognized by these functions.
String encodeHex(ArrayPtr<const byte> bytes); String encodeHex(ArrayPtr<const byte> bytes);
Array<byte> decodeHex(ArrayPtr<const char> text); EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text);
// Encode/decode bytes as hex strings. // Encode/decode bytes as hex strings.
String encodeUriComponent(ArrayPtr<const byte> bytes); String encodeUriComponent(ArrayPtr<const byte> bytes);
String encodeUriComponent(ArrayPtr<const char> bytes); String encodeUriComponent(ArrayPtr<const char> bytes);
Array<byte> decodeBinaryUriComponent(ArrayPtr<const char> text, bool nulTerminate = false); EncodingResult<Array<byte>> decodeBinaryUriComponent(
String decodeUriComponent(ArrayPtr<const char> text); ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text);
// Encode/decode URI components using % escapes. See Javascript's encodeURIComponent(). // Encode/decode URI components using % escapes. See Javascript's encodeURIComponent().
String encodeCEscape(ArrayPtr<const byte> bytes); String encodeCEscape(ArrayPtr<const byte> bytes);
String encodeCEscape(ArrayPtr<const char> bytes); String encodeCEscape(ArrayPtr<const char> bytes);
UtfResult<Array<byte>> decodeBinaryCEscape(ArrayPtr<const char> text, bool nulTerminate = false); EncodingResult<Array<byte>> decodeBinaryCEscape(
UtfResult<String> decodeCEscape(ArrayPtr<const char> text); ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<String> decodeCEscape(ArrayPtr<const char> text);
String encodeBase64(ArrayPtr<const byte> bytes, bool breakLines = false); String encodeBase64(ArrayPtr<const byte> bytes, bool breakLines = false);
// Encode the given bytes as base64 text. If `breakLines` is true, line breaks will be inserted // Encode the given bytes as base64 text. If `breakLines` is true, line breaks will be inserted
// into the output every 72 characters (e.g. for encoding e-mail bodies). // into the output every 72 characters (e.g. for encoding e-mail bodies).
Array<byte> decodeBase64(ArrayPtr<const char> text); Array<byte> decodeBase64(ArrayPtr<const char> text);
// Decode base64 text. Non-base64 characters are ignored. // Decode base64 text. Non-base64 characters are ignored and padding characters are not required;
// as such, this function never fails.
// ======================================================================================= // =======================================================================================
// inline implementation details // inline implementation details
namespace _ { // private
template <typename T>
NullableValue<T> readMaybe(EncodingResult<T>&& value) {
if (value.hadErrors) {
return nullptr;
} else {
return kj::mv(value);
}
}
template <typename T>
T* readMaybe(EncodingResult<T>& value) {
if (value.hadErrors) {
return nullptr;
} else {
return &value;
}
}
// readMaybe() overload for a const lvalue EncodingResult. Returns nullptr when the result's
// `hadErrors` flag is set; otherwise returns the address of `value` as a const T*.
// NOTE(review): the pointer conversion assumes EncodingResult<T> is convertible to T by
// address (e.g. derives from T) -- confirm against the EncodingResult definition.
template <typename T>
const T* readMaybe(const EncodingResult<T>& value) {
  if (!value.hadErrors) {
    return &value;
  }
  return nullptr;
}
} // namespace _ (private)
inline String encodeUriComponent(ArrayPtr<const char> text) { inline String encodeUriComponent(ArrayPtr<const char> text) {
return encodeUriComponent(text.asBytes()); return encodeUriComponent(text.asBytes());
} }
inline String decodeUriComponent(ArrayPtr<const char> text) { inline EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text) {
return String(decodeBinaryUriComponent(text, true).releaseAsChars()); auto result = decodeBinaryUriComponent(text, true);
return { String(result.releaseAsChars()), result.hadErrors };
} }
inline String encodeCEscape(ArrayPtr<const char> text) { inline String encodeCEscape(ArrayPtr<const char> text) {
return encodeCEscape(text.asBytes()); return encodeCEscape(text.asBytes());
} }
inline UtfResult<String> decodeCEscape(ArrayPtr<const char> text) { inline EncodingResult<String> decodeCEscape(ArrayPtr<const char> text) {
auto result = decodeBinaryCEscape(text, true); auto result = decodeBinaryCEscape(text, true);
return { String(result.releaseAsChars()), result.hadErrors }; return { String(result.releaseAsChars()), result.hadErrors };
} }
...@@ -123,39 +156,23 @@ inline UtfResult<String> decodeCEscape(ArrayPtr<const char> text) { ...@@ -123,39 +156,23 @@ inline UtfResult<String> decodeCEscape(ArrayPtr<const char> text) {
// only even matters for encoding-test.c++. // only even matters for encoding-test.c++.
template <size_t s> template <size_t s>
inline UtfResult<Array<char16_t>> encodeUtf16(const char (&text)[s], bool nulTerminate = false) { inline EncodingResult<Array<char16_t>> encodeUtf16(const char (&text)[s], bool nulTerminate=false) {
return encodeUtf16(arrayPtr(text, s - 1), nulTerminate); return encodeUtf16(arrayPtr(text, s - 1), nulTerminate);
} }
template <size_t s> template <size_t s>
inline UtfResult<Array<char32_t>> encodeUtf32(const char (&text)[s], bool nulTerminate = false) { inline EncodingResult<Array<char32_t>> encodeUtf32(const char (&text)[s], bool nulTerminate=false) {
return encodeUtf32(arrayPtr(text, s - 1), nulTerminate); return encodeUtf32(arrayPtr(text, s - 1), nulTerminate);
} }
template <size_t s> template <size_t s>
inline Maybe<Array<char16_t>> tryEncodeUtf16(const char (&text)[s], bool nulTerminate = false) { inline EncodingResult<String> decodeUtf16(const char16_t (&utf16)[s]) {
return tryEncodeUtf16(arrayPtr(text, s - 1), nulTerminate);
}
template <size_t s>
inline Maybe<Array<char32_t>> tryEncodeUtf32(const char (&text)[s], bool nulTerminate = false) {
return tryEncodeUtf32(arrayPtr(text, s - 1), nulTerminate);
}
template <size_t s>
inline UtfResult<String> decodeUtf16(const char16_t (&utf16)[s]) {
return decodeUtf16(arrayPtr(utf16, s - 1)); return decodeUtf16(arrayPtr(utf16, s - 1));
} }
template <size_t s> template <size_t s>
inline UtfResult<String> decodeUtf32(const char32_t (&utf32)[s]) { inline EncodingResult<String> decodeUtf32(const char32_t (&utf32)[s]) {
return decodeUtf32(arrayPtr(utf32, s - 1)); return decodeUtf32(arrayPtr(utf32, s - 1));
} }
template <size_t s> template <size_t s>
inline Maybe<String> tryDecodeUtf16(const char16_t (&utf16)[s]) { inline EncodingResult<Array<byte>> decodeHex(const char (&text)[s]) {
return tryDecodeUtf16(arrayPtr(utf16, s - 1));
}
template <size_t s>
inline Maybe<String> tryDecodeUtf32(const char32_t (&utf32)[s]) {
return tryDecodeUtf32(arrayPtr(utf32, s - 1));
}
template <size_t s>
inline Array<byte> decodeHex(const char (&text)[s]) {
return decodeHex(arrayPtr(text, s - 1)); return decodeHex(arrayPtr(text, s - 1));
} }
template <size_t s> template <size_t s>
...@@ -167,21 +184,20 @@ inline Array<byte> decodeBinaryUriComponent(const char (&text)[s]) { ...@@ -167,21 +184,20 @@ inline Array<byte> decodeBinaryUriComponent(const char (&text)[s]) {
return decodeBinaryUriComponent(arrayPtr(text, s - 1)); return decodeBinaryUriComponent(arrayPtr(text, s - 1));
} }
template <size_t s> template <size_t s>
inline String decodeUriComponent(const char (&text)[s]) { inline EncodingResult<String> decodeUriComponent(const char (&text)[s]) {
return String(decodeBinaryUriComponent(arrayPtr(text, s - 1), true).releaseAsChars()); return decodeUriComponent(arrayPtr(text, s-1));
} }
template <size_t s> template <size_t s>
inline String encodeCEscape(const char (&text)[s]) { inline String encodeCEscape(const char (&text)[s]) {
return encodeCEscape(arrayPtr(text, s - 1)); return encodeCEscape(arrayPtr(text, s - 1));
} }
template <size_t s> template <size_t s>
inline UtfResult<Array<byte>> decodeBinaryCEscape(const char (&text)[s]) { inline EncodingResult<Array<byte>> decodeBinaryCEscape(const char (&text)[s]) {
return decodeBinaryCEscape(arrayPtr(text, s - 1)); return decodeBinaryCEscape(arrayPtr(text, s - 1));
} }
template <size_t s> template <size_t s>
inline UtfResult<String> decodeCEscape(const char (&text)[s]) { inline EncodingResult<String> decodeCEscape(const char (&text)[s]) {
auto result = decodeBinaryCEscape(arrayPtr(text, s - 1), true); return decodeCEscape(arrayPtr(text, s-1));
return { String(result.releaseAsChars()), result.hadErrors };
} }
template <size_t s> template <size_t s>
Array<byte> decodeBase64(const char (&text)[s]) { Array<byte> decodeBase64(const char (&text)[s]) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment