Commit df52bf86 authored by Kenton Varda's avatar Kenton Varda

Improve KJ encoding lib error handling:

- Rename UtfResult -> EncodingResult
- Make it usable like a Maybe, so that we don't need separate "try" functions.
- Check errors in hex decoding and URI decoding.
parent 03800dfa
......@@ -915,7 +915,6 @@ public:
return value;
}
private: // internal interface used by friends only
inline NullableValue() noexcept: isSet(false) {}
inline NullableValue(T&& t) noexcept(noexcept(T(instance<T&&>())))
: isSet(true) {
......
This diff is collapsed.
......@@ -41,7 +41,7 @@ inline void addChar32(Vector<char32_t>& vec, char32_t u) {
}
template <typename T>
UtfResult<Array<T>> encodeUtf(ArrayPtr<const char> text, bool nulTerminate) {
EncodingResult<Array<T>> encodeUtf(ArrayPtr<const char> text, bool nulTerminate) {
Vector<T> result(text.size() + nulTerminate);
bool hadErrors = false;
......@@ -125,33 +125,15 @@ UtfResult<Array<T>> encodeUtf(ArrayPtr<const char> text, bool nulTerminate) {
} // namespace
UtfResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate) {
EncodingResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate) {
return encodeUtf<char16_t>(text, nulTerminate);
}
UtfResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate) {
EncodingResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate) {
return encodeUtf<char32_t>(text, nulTerminate);
}
Maybe<Array<char16_t>> tryEncodeUtf16(ArrayPtr<const char> text, bool nulTerminate) {
auto result = encodeUtf16(text, nulTerminate);
if (result.hadErrors) {
return nullptr;
} else {
return kj::mv(result);
}
}
Maybe<Array<char32_t>> tryEncodeUtf32(ArrayPtr<const char> text, bool nulTerminate) {
auto result = encodeUtf32(text, nulTerminate);
if (result.hadErrors) {
return nullptr;
} else {
return kj::mv(result);
}
}
UtfResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) {
EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) {
Vector<char> result(utf16.size() + 1);
bool hadErrors = false;
......@@ -202,7 +184,7 @@ UtfResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) {
return { String(result.releaseAsArray()), hadErrors };
}
UtfResult<String> decodeUtf32(ArrayPtr<const char32_t> utf16) {
EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf16) {
Vector<char> result(utf16.size() + 1);
bool hadErrors = false;
......@@ -247,28 +229,33 @@ UtfResult<String> decodeUtf32(ArrayPtr<const char32_t> utf16) {
return { String(result.releaseAsArray()), hadErrors };
}
Maybe<String> tryDecodeUtf16(ArrayPtr<const char16_t> utf16) {
auto result = decodeUtf16(utf16);
if (result.hadErrors) {
return nullptr;
// =======================================================================================
namespace {
const char HEX_DIGITS[] = "0123456789abcdef";
static Maybe<uint> tryFromHexDigit(char c) {
if ('0' <= c && c <= '9') {
return c - '0';
} else if ('a' <= c && c <= 'f') {
return c - ('a' - 10);
} else if ('A' <= c && c <= 'F') {
return c - ('A' - 10);
} else {
return kj::mv(result);
return nullptr;
}
}
Maybe<String> tryDecodeUtf32(ArrayPtr<const char32_t> utf32) {
auto result = decodeUtf32(utf32);
if (result.hadErrors) {
return nullptr;
static Maybe<uint> tryFromOctDigit(char c) {
if ('0' <= c && c <= '7') {
return c - '0';
} else {
return kj::mv(result);
return nullptr;
}
}
// =======================================================================================
namespace {
const char HEX_DIGITS[] = "0123456789abcdef";
}
} // namespace
String encodeHex(ArrayPtr<const byte> input) {
return strArray(KJ_MAP(b, input) {
......@@ -276,27 +263,26 @@ String encodeHex(ArrayPtr<const byte> input) {
}, "");
}
static uint fromDigit(char c) {
if ('0' <= c && c <= '9') {
return c - '0';
} else if ('a' <= c && c <= 'z') {
return c - ('a' - 10);
} else if ('A' <= c && c <= 'Z') {
return c - ('A' - 10);
} else {
return 0;
}
}
Array<byte> decodeHex(ArrayPtr<const char> text) {
EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text) {
auto result = heapArray<byte>(text.size() / 2);
bool hadErrors = text.size() % 2;
for (auto i: kj::indices(result)) {
result[i] = (fromDigit(text[i*2]) << 4)
| (fromDigit(text[i*2+1]));
byte b = 0;
KJ_IF_MAYBE(d1, tryFromHexDigit(text[i*2])) {
b = *d1 << 4;
} else {
hadErrors = true;
}
KJ_IF_MAYBE(d2, tryFromHexDigit(text[i*2+1])) {
b |= *d2;
} else {
hadErrors = true;
}
result[i] = b;
}
return result;
return { kj::mv(result), hadErrors };
}
String encodeUriComponent(ArrayPtr<const byte> bytes) {
......@@ -316,26 +302,41 @@ String encodeUriComponent(ArrayPtr<const byte> bytes) {
return String(result.releaseAsArray());
}
Array<byte> decodeBinaryUriComponent(ArrayPtr<const char> text, bool nulTerminate) {
EncodingResult<Array<byte>> decodeBinaryUriComponent(
ArrayPtr<const char> text, bool nulTerminate) {
Vector<byte> result(text.size() + nulTerminate);
bool hadErrors = false;
const char* ptr = text.begin();
const char* end = text.end();
while (ptr < end) {
if (*ptr == '%') {
++ptr;
if (ptr == end) break;
byte b = fromDigit(*ptr++) << 4;
if (ptr == end) break;
b |= fromDigit(*ptr++);
result.add(b);
if (ptr == end) {
hadErrors = true;
} else KJ_IF_MAYBE(d1, tryFromHexDigit(*ptr)) {
byte b = *d1;
++ptr;
if (ptr == end) {
hadErrors = true;
} else KJ_IF_MAYBE(d2, tryFromHexDigit(*ptr)) {
b = (b << 4) | *d2;
++ptr;
} else {
hadErrors = true;
}
result.add(b);
} else {
hadErrors = true;
}
} else {
result.add(*ptr++);
}
}
if (nulTerminate) result.add(0);
return result.releaseAsArray();
return { result.releaseAsArray(), hadErrors };
}
// =======================================================================================
......@@ -374,31 +375,7 @@ String encodeCEscape(ArrayPtr<const byte> bytes) {
return String(escaped.releaseAsArray());
}
namespace {
static Maybe<uint> tryFromHexDigit(char c) {
if ('0' <= c && c <= '9') {
return c - '0';
} else if ('a' <= c && c <= 'f') {
return c - ('a' - 10);
} else if ('A' <= c && c <= 'F') {
return c - ('A' - 10);
} else {
return nullptr;
}
}
static Maybe<uint> tryFromOctDigit(char c) {
if ('0' <= c && c <= '7') {
return c - '0';
} else {
return nullptr;
}
}
} // namespace
UtfResult<Array<byte>> decodeBinaryCEscape(ArrayPtr<const char> text, bool nulTerminate) {
EncodingResult<Array<byte>> decodeBinaryCEscape(ArrayPtr<const char> text, bool nulTerminate) {
Vector<byte> result(text.size() + nulTerminate);
bool hadErrors = false;
......
......@@ -36,24 +36,24 @@
namespace kj {
template <typename ResultType>
struct UtfResult: public ResultType {
struct EncodingResult: public ResultType {
// Equivalent to ResultType (a String or wide-char array) for all intents and purposes, except
// that the bool `hadErrors` can be inspected to see if any errors were encountered in the input,
// resulting in instances of the replacement character (U+FFFD) in the output.
inline UtfResult(ResultType&& result, bool hadErrors)
// that the bool `hadErrors` can be inspected to see if any errors were encountered in the input.
// Each encoding/decoding function that returns this type will "work around" errors in some way,
// so an application doesn't strictly have to check for errors. E.g. the Unicode functions
// replace errors with U+FFFD in the output.
//
// Through magic, KJ_IF_MAYBE() and KJ_{REQUIRE,ASSERT}_NONNULL() work on EncodingResult<T>
// exactly as if it were a Maybe<T> that is null in case of errors.
inline EncodingResult(ResultType&& result, bool hadErrors)
: ResultType(kj::mv(result)), hadErrors(hadErrors) {}
const bool hadErrors;
// If true, then invalid sequences were detected in the input and were replaced with the Unicode
// replacement character (U+FFFD) in the output. Many applications will choose to ignore this
// boolean and continue on with the damaged data.
};
UtfResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false);
UtfResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false);
Maybe<Array<char16_t>> tryEncodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false);
Maybe<Array<char32_t>> tryEncodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false);
// Convert UTF-8 text (which KJ strings use) to UTF-16 or UTF-32.
//
// If `nulTerminate` is true, an extra NUL character will be added to the end of the output.
......@@ -64,10 +64,8 @@ Maybe<Array<char32_t>> tryEncodeUtf32(ArrayPtr<const char> text, bool nulTermina
// The returned arrays are in platform-native endianness (otherwise they wouldn't really be
// char16_t / char32_t).
UtfResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16);
UtfResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32);
Maybe<String> tryDecodeUtf16(ArrayPtr<const char16_t> utf16);
Maybe<String> tryDecodeUtf32(ArrayPtr<const char32_t> utf32);
EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16);
EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32);
// Convert UTF-16 or UTF-32 to UTF-8 (which KJ strings use).
//
// The input should NOT include a NUL terminator; any NUL characters in the input array will be
......@@ -79,41 +77,76 @@ Maybe<String> tryDecodeUtf32(ArrayPtr<const char32_t> utf32);
// The input must be in platform-native endianness. BOMs are NOT recognized by these functions.
String encodeHex(ArrayPtr<const byte> bytes);
Array<byte> decodeHex(ArrayPtr<const char> text);
EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text);
// Encode/decode bytes as hex strings.
String encodeUriComponent(ArrayPtr<const byte> bytes);
String encodeUriComponent(ArrayPtr<const char> bytes);
Array<byte> decodeBinaryUriComponent(ArrayPtr<const char> text, bool nulTerminate = false);
String decodeUriComponent(ArrayPtr<const char> text);
EncodingResult<Array<byte>> decodeBinaryUriComponent(
ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text);
// Encode/decode URI components using % escapes. See Javascript's encodeURIComponent().
String encodeCEscape(ArrayPtr<const byte> bytes);
String encodeCEscape(ArrayPtr<const char> bytes);
UtfResult<Array<byte>> decodeBinaryCEscape(ArrayPtr<const char> text, bool nulTerminate = false);
UtfResult<String> decodeCEscape(ArrayPtr<const char> text);
EncodingResult<Array<byte>> decodeBinaryCEscape(
ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<String> decodeCEscape(ArrayPtr<const char> text);
String encodeBase64(ArrayPtr<const byte> bytes, bool breakLines = false);
// Encode the given bytes as base64 text. If `breakLines` is true, line breaks will be inserted
// into the output every 72 characters (e.g. for encoding e-mail bodies).
Array<byte> decodeBase64(ArrayPtr<const char> text);
// Decode base64 text. Non-base64 characters are ignored.
// Decode base64 text. Non-base64 characters are ignored and padding characters are not required;
// as such, this function never fails.
// =======================================================================================
// inline implementation details
namespace _ { // private
template <typename T>
NullableValue<T> readMaybe(EncodingResult<T>&& value) {
  // Rvalue overload: yields the decoded value by move when the input was clean, and a null
  // NullableValue when `hadErrors` is set.
  // NOTE(review): presumably this is the hook that lets KJ_IF_MAYBE() and
  // KJ_{REQUIRE,ASSERT}_NONNULL() accept an EncodingResult<T> -- confirm against the macros.
  if (!value.hadErrors) {
    return kj::mv(value);
  }
  return nullptr;
}
template <typename T>
T* readMaybe(EncodingResult<T>& value) {
  // Lvalue overload: returns a pointer to the result (viewed as its base type T) when the input
  // was clean, and nullptr when `hadErrors` is set. Nothing is copied or moved.
  if (!value.hadErrors) {
    return &value;
  }
  return nullptr;
}
template <typename T>
const T* readMaybe(const EncodingResult<T>& value) {
  // Const lvalue overload: returns a read-only pointer to the result (viewed as its base type T)
  // when the input was clean, and nullptr when `hadErrors` is set.
  if (!value.hadErrors) {
    return &value;
  }
  return nullptr;
}
} // namespace _ (private)
inline String encodeUriComponent(ArrayPtr<const char> text) {
  // Character-array convenience overload: view the characters as bytes and hand them to the
  // byte-array overload, which does the actual %-escaping.
  auto rawBytes = text.asBytes();
  return encodeUriComponent(rawBytes);
}
inline String decodeUriComponent(ArrayPtr<const char> text) {
return String(decodeBinaryUriComponent(text, true).releaseAsChars());
inline EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text) {
auto result = decodeBinaryUriComponent(text, true);
return { String(result.releaseAsChars()), result.hadErrors };
}
inline String encodeCEscape(ArrayPtr<const char> text) {
  // Character-array convenience overload: view the characters as bytes and hand them to the
  // byte-array overload, which does the actual escaping.
  auto rawBytes = text.asBytes();
  return encodeCEscape(rawBytes);
}
inline UtfResult<String> decodeCEscape(ArrayPtr<const char> text) {
inline EncodingResult<String> decodeCEscape(ArrayPtr<const char> text) {
auto result = decodeBinaryCEscape(text, true);
return { String(result.releaseAsChars()), result.hadErrors };
}
......@@ -123,39 +156,23 @@ inline UtfResult<String> decodeCEscape(ArrayPtr<const char> text) {
// only even matters for encoding-test.c++.
template <size_t s>
inline UtfResult<Array<char16_t>> encodeUtf16(const char (&text)[s], bool nulTerminate = false) {
inline EncodingResult<Array<char16_t>> encodeUtf16(const char (&text)[s], bool nulTerminate=false) {
return encodeUtf16(arrayPtr(text, s - 1), nulTerminate);
}
template <size_t s>
inline UtfResult<Array<char32_t>> encodeUtf32(const char (&text)[s], bool nulTerminate = false) {
inline EncodingResult<Array<char32_t>> encodeUtf32(const char (&text)[s], bool nulTerminate=false) {
return encodeUtf32(arrayPtr(text, s - 1), nulTerminate);
}
template <size_t s>
inline Maybe<Array<char16_t>> tryEncodeUtf16(const char (&text)[s], bool nulTerminate = false) {
return tryEncodeUtf16(arrayPtr(text, s - 1), nulTerminate);
}
template <size_t s>
inline Maybe<Array<char32_t>> tryEncodeUtf32(const char (&text)[s], bool nulTerminate = false) {
return tryEncodeUtf32(arrayPtr(text, s - 1), nulTerminate);
}
template <size_t s>
inline UtfResult<String> decodeUtf16(const char16_t (&utf16)[s]) {
inline EncodingResult<String> decodeUtf16(const char16_t (&utf16)[s]) {
return decodeUtf16(arrayPtr(utf16, s - 1));
}
template <size_t s>
inline UtfResult<String> decodeUtf32(const char32_t (&utf32)[s]) {
inline EncodingResult<String> decodeUtf32(const char32_t (&utf32)[s]) {
return decodeUtf32(arrayPtr(utf32, s - 1));
}
template <size_t s>
inline Maybe<String> tryDecodeUtf16(const char16_t (&utf16)[s]) {
return tryDecodeUtf16(arrayPtr(utf16, s - 1));
}
template <size_t s>
inline Maybe<String> tryDecodeUtf32(const char32_t (&utf32)[s]) {
return tryDecodeUtf32(arrayPtr(utf32, s - 1));
}
template <size_t s>
inline Array<byte> decodeHex(const char (&text)[s]) {
inline EncodingResult<Array<byte>> decodeHex(const char (&text)[s]) {
return decodeHex(arrayPtr(text, s - 1));
}
template <size_t s>
......@@ -167,21 +184,20 @@ inline Array<byte> decodeBinaryUriComponent(const char (&text)[s]) {
return decodeBinaryUriComponent(arrayPtr(text, s - 1));
}
template <size_t s>
inline String decodeUriComponent(const char (&text)[s]) {
return String(decodeBinaryUriComponent(arrayPtr(text, s - 1), true).releaseAsChars());
inline EncodingResult<String> decodeUriComponent(const char (&text)[s]) {
return decodeUriComponent(arrayPtr(text, s-1));
}
template <size_t s>
inline String encodeCEscape(const char (&text)[s]) {
  // Fixed-size char array overload: forwards all but the last element of the array to the
  // ArrayPtr overload (assumes `text` is a NUL-terminated literal, so the dropped element is
  // the terminator -- confirm for non-literal arrays).
  auto withoutLast = arrayPtr(text, s - 1);
  return encodeCEscape(withoutLast);
}
template <size_t s>
inline UtfResult<Array<byte>> decodeBinaryCEscape(const char (&text)[s]) {
inline EncodingResult<Array<byte>> decodeBinaryCEscape(const char (&text)[s]) {
return decodeBinaryCEscape(arrayPtr(text, s - 1));
}
template <size_t s>
inline UtfResult<String> decodeCEscape(const char (&text)[s]) {
auto result = decodeBinaryCEscape(arrayPtr(text, s - 1), true);
return { String(result.releaseAsChars()), result.hadErrors };
inline EncodingResult<String> decodeCEscape(const char (&text)[s]) {
return decodeCEscape(arrayPtr(text, s-1));
}
template <size_t s>
Array<byte> decodeBase64(const char (&text)[s]) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment