Commit 5483d8f7 authored by Kenton Varda

Extend Unicode encoders to support 'WTF-8'.

This allows arbitrary char16 arrays to round-trip through UTF-8 without losing information, even if the char16 arrays are not valid UTF-16.

This is necessary e.g. for filesystem manipulation on Windows, where filenames contain 16-bit characters but valid UTF-16 is not enforced.

Invalid UTF-16 represented in UTF-8 is affectionately known as WTF-8: http://simonsapin.github.io/wtf-8/
parent d3278477
...@@ -64,6 +64,13 @@ void expectRes(EncodingResult<T> result, ...@@ -64,6 +64,13 @@ void expectRes(EncodingResult<T> result,
expectResImpl(kj::mv(result), arrayPtr<const byte>(expected, s), errors); expectResImpl(kj::mv(result), arrayPtr<const byte>(expected, s), errors);
} }
// Handy reference for surrogate pair edge cases:
//
// \ud800 -> \xed\xa0\x80
// \udc00 -> \xed\xb0\x80
// \udbff -> \xed\xaf\xbf
// \udfff -> \xed\xbf\xbf
KJ_TEST("encode UTF-8 to UTF-16") { KJ_TEST("encode UTF-8 to UTF-16") {
expectRes(encodeUtf16(u8"foo"), u"foo"); expectRes(encodeUtf16(u8"foo"), u"foo");
expectRes(encodeUtf16(u8"Здравствуйте"), u"Здравствуйте"); expectRes(encodeUtf16(u8"Здравствуйте"), u"Здравствуйте");
...@@ -113,6 +120,26 @@ KJ_TEST("invalid UTF-8 to UTF-16") { ...@@ -113,6 +120,26 @@ KJ_TEST("invalid UTF-8 to UTF-16") {
expectRes(encodeUtf16("\xfc\xbf\x80\x80\x80\x80"), u"\ufffd", true); expectRes(encodeUtf16("\xfc\xbf\x80\x80\x80\x80"), u"\ufffd", true);
expectRes(encodeUtf16("\xfe\xbf\x80\x80\x80\x80\x80"), u"\ufffd", true); expectRes(encodeUtf16("\xfe\xbf\x80\x80\x80\x80\x80"), u"\ufffd", true);
expectRes(encodeUtf16("\xff\xbf\x80\x80\x80\x80\x80\x80"), u"\ufffd", true); expectRes(encodeUtf16("\xff\xbf\x80\x80\x80\x80\x80\x80"), u"\ufffd", true);
// Surrogates encoded as separate UTF-8 code points are flagged as errors but allowed to decode
// to UTF-16 surrogate values.
expectRes(encodeUtf16(u8"\ud7ff\xed\xb0\x80\xed\xaf\xbf\ue000"),
u"\xd7ff\xdc00\xdbff\xe000", true);
expectRes(encodeUtf16(u8"\ud7ff\xed\xbf\xbf\xed\xa0\x80\ue000"),
u"\xd7ff\xdfff\xd800\xe000", true);
expectRes(encodeUtf16(u8"\ud7ff\xed\xb0\x80\xed\xbf\xbf\ue000"),
u"\xd7ff\xdc00\xdfff\xe000", true);
expectRes(encodeUtf16(u8"f\xed\xa0\x80"), u"f\xd800", true);
expectRes(encodeUtf16(u8"f\xed\xa0\x80x"), u"f\xd800x", true);
expectRes(encodeUtf16(u8"f\xed\xa0\x80\xed\xa0\x80x"), u"f\xd800\xd800x", true);
// However, if successive UTF-8 codepoints decode to a proper surrogate pair, the second
// surrogate is replaced with the Unicode replacement character to avoid creating valid UTF-16.
expectRes(encodeUtf16(u8"\ud7ff\xed\xa0\x80\xed\xbf\xbf\ue000"),
u"\xd7ff\xd800\xfffd\xe000", true);
expectRes(encodeUtf16(u8"\ud7ff\xed\xaf\xbf\xed\xb0\x80\ue000"),
u"\xd7ff\xdbff\xfffd\xe000", true);
} }
KJ_TEST("encode UTF-8 to UTF-32") { KJ_TEST("encode UTF-8 to UTF-32") {
...@@ -169,12 +196,15 @@ KJ_TEST("decode UTF-16 to UTF-8") { ...@@ -169,12 +196,15 @@ KJ_TEST("decode UTF-16 to UTF-8") {
KJ_TEST("invalid UTF-16 to UTF-8") { KJ_TEST("invalid UTF-16 to UTF-8") {
// Surrogates in wrong order. // Surrogates in wrong order.
expectRes(decodeUtf16(u"\xd7ff\xdc00\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true); expectRes(decodeUtf16(u"\xd7ff\xdc00\xdbff\xe000"),
u8"\ud7ff\xed\xb0\x80\xed\xaf\xbf\ue000", true);
expectRes(decodeUtf16(u"\xd7ff\xdfff\xd800\xe000"),
u8"\ud7ff\xed\xbf\xbf\xed\xa0\x80\ue000", true);
// Missing second surrogate. // Missing second surrogate.
expectRes(decodeUtf16(u"f\xd800"), u8"f\ufffd", true); expectRes(decodeUtf16(u"f\xd800"), u8"f\xed\xa0\x80", true);
expectRes(decodeUtf16(u"f\xd800x"), u8"f\ufffdx", true); expectRes(decodeUtf16(u"f\xd800x"), u8"f\xed\xa0\x80x", true);
expectRes(decodeUtf16(u"f\xd800\xd800x"), u8"f\ufffd\ufffdx", true); expectRes(decodeUtf16(u"f\xd800\xd800x"), u8"f\xed\xa0\x80\xed\xa0\x80x", true);
} }
KJ_TEST("decode UTF-32 to UTF-8") { KJ_TEST("decode UTF-32 to UTF-8") {
...@@ -186,10 +216,19 @@ KJ_TEST("decode UTF-32 to UTF-8") { ...@@ -186,10 +216,19 @@ KJ_TEST("decode UTF-32 to UTF-8") {
KJ_TEST("invalid UTF-32 to UTF-8") { KJ_TEST("invalid UTF-32 to UTF-8") {
// Surrogates rejected. // Surrogates rejected.
expectRes(decodeUtf32(U"\xd7ff\xdc00\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true); expectRes(decodeUtf32(U"\xd7ff\xdfff\xd800\xe000"),
u8"\ud7ff\xed\xbf\xbf\xed\xa0\x80\ue000", true);
// Even if it would be a valid surrogate pair in UTF-16. // Even if it would be a valid surrogate pair in UTF-16.
expectRes(decodeUtf32(U"\xd7ff\xd800\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true); expectRes(decodeUtf32(U"\xd7ff\xd800\xdfff\xe000"),
u8"\ud7ff\xed\xa0\x80\xed\xbf\xbf\ue000", true);
}
// Round-trip check for ill-formed UTF-16: INVALID mixes unpaired surrogates (in both orders and
// at both ends of the array) with one well-formed surrogate pair (\xd800\xdc00). The array is
// converted UTF-16 -> UTF-8 -> UTF-16, and then again with an extra UTF-8 -> UTF-32 -> UTF-8 hop
// in the middle; in both cases the result must compare equal to INVALID itself. The trailing
// `true` is expectRes()'s `errors` argument, i.e. the final conversion is expected to flag errors.
KJ_TEST("round-trip invalid UTF-16") {
const char16_t INVALID[] = u"\xdfff foo \xd800\xdc00 bar \xdc00\xd800 baz \xdbff qux \xd800";
expectRes(encodeUtf16(decodeUtf16(INVALID)), INVALID, true);
expectRes(encodeUtf16(decodeUtf32(encodeUtf32(decodeUtf16(INVALID)))), INVALID, true);
} }
KJ_TEST("EncodingResult as a Maybe") { KJ_TEST("EncodingResult as a Maybe") {
......
...@@ -79,8 +79,23 @@ EncodingResult<Array<T>> encodeUtf(ArrayPtr<const char> text, bool nulTerminate) ...@@ -79,8 +79,23 @@ EncodingResult<Array<T>> encodeUtf(ArrayPtr<const char> text, bool nulTerminate)
// Disallow overlong sequence. // Disallow overlong sequence.
GOTO_ERROR_IF(u < 0x0800); GOTO_ERROR_IF(u < 0x0800);
// Disallow surrogate pair code points. // Flag surrogate pair code points as errors, but allow them through.
GOTO_ERROR_IF((u & 0xf800) == 0xd800); if (KJ_UNLIKELY((u & 0xf800) == 0xd800)) {
if (result.size() > 0 &&
(u & 0xfc00) == 0xdc00 &&
(result.back() & 0xfc00) == 0xd800) {
// Whoops, the *previous* character was also an invalid surrogate, and if we add this
// one too, they'll form a valid surrogate pair. If we allowed this, then it would mean
// invalid UTF-8 round-tripped to UTF-16 and back could actually change meaning entirely.
// OTOH, the reason we allow dangling surrogates is to allow invalid UTF-16 to round-trip
// to UTF-8 without loss, but if the original UTF-16 had a valid surrogate pair, it would
// have been encoded as a valid single UTF-8 codepoint, not as separate UTF-8 codepoints
// for each surrogate.
goto error;
}
hadErrors = true;
}
result.add(u); result.add(u);
continue; continue;
...@@ -153,9 +168,12 @@ EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) { ...@@ -153,9 +168,12 @@ EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) {
} else if ((u & 0xf800) == 0xd800) { } else if ((u & 0xf800) == 0xd800) {
// surrogate pair // surrogate pair
char16_t u2; char16_t u2;
GOTO_ERROR_IF(i == utf16.size() // missing second half if (KJ_UNLIKELY(i == utf16.size() // missing second half
|| (u & 0x0400) != 0 // first half in wrong range || (u & 0x0400) != 0 // first half in wrong range
|| ((u2 = utf16[i]) & 0xfc00) != 0xdc00); // second half in wrong range || ((u2 = utf16[i]) & 0xfc00) != 0xdc00)) { // second half in wrong range
hadErrors = true;
goto threeByte;
}
++i; ++i;
char32_t u32 = (((u & 0x03ff) << 10) | (u2 & 0x03ff)) + 0x10000; char32_t u32 = (((u & 0x03ff) << 10) | (u2 & 0x03ff)) + 0x10000;
...@@ -167,6 +185,7 @@ EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) { ...@@ -167,6 +185,7 @@ EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) {
}); });
continue; continue;
} else { } else {
threeByte:
result.addAll<std::initializer_list<char>>({ result.addAll<std::initializer_list<char>>({
static_cast<char>(((u >> 12) ) | 0xe0), static_cast<char>(((u >> 12) ) | 0xe0),
static_cast<char>(((u >> 6) & 0x3f) | 0x80), static_cast<char>(((u >> 6) & 0x3f) | 0x80),
...@@ -174,10 +193,6 @@ EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) { ...@@ -174,10 +193,6 @@ EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) {
}); });
continue; continue;
} }
error:
result.addAll(StringPtr(u8"\ufffd"));
hadErrors = true;
} }
result.add(0); result.add(0);
...@@ -202,7 +217,10 @@ EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf16) { ...@@ -202,7 +217,10 @@ EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf16) {
}); });
continue; continue;
} else if (u < 0x10000) { } else if (u < 0x10000) {
GOTO_ERROR_IF((u & 0xfffff800) == 0xd800); // no surrogates allowed in utf-32 if (KJ_UNLIKELY((u & 0xfffff800) == 0xd800)) {
// no surrogates allowed in utf-32
hadErrors = true;
}
result.addAll<std::initializer_list<char>>({ result.addAll<std::initializer_list<char>>({
static_cast<char>(((u >> 12) ) | 0xe0), static_cast<char>(((u >> 12) ) | 0xe0),
static_cast<char>(((u >> 6) & 0x3f) | 0x80), static_cast<char>(((u >> 6) & 0x3f) | 0x80),
......
...@@ -52,17 +52,24 @@ struct EncodingResult: public ResultType { ...@@ -52,17 +52,24 @@ struct EncodingResult: public ResultType {
const bool hadErrors; const bool hadErrors;
}; };
// Stringification hook for EncodingResult<T>: the result is implicitly cast to `const T&` and
// forwarded to toCharSequence(), so an EncodingResult is rendered the same way as the T it wraps.
// Only the T part is passed along; `hadErrors` is not part of the produced character sequence.
// The return type is spelled with a trailing decltype over the same expression, so the overload
// is only viable when toCharSequence() accepts a `const T&`.
template <typename T>
inline auto KJ_STRINGIFY(const EncodingResult<T>& value)
-> decltype(toCharSequence(implicitCast<const T&>(value))) {
return toCharSequence(implicitCast<const T&>(value));
}
EncodingResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false); EncodingResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false); EncodingResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false);
// Convert UTF-8 text (which KJ strings use) to UTF-16 or UTF-32. // Convert UTF-8 text (which KJ strings use) to UTF-16 or UTF-32.
// //
// If `nulTerminate` is true, an extra NUL character will be added to the end of the output. // If `nulTerminate` is true, an extra NUL character will be added to the end of the output.
// //
// The `try` versions return null if the input is invalid; the non-`try` versions return data
// containing the Unicode replacement character (U+FFFD).
//
// The returned arrays are in platform-native endianness (otherwise they wouldn't really be // The returned arrays are in platform-native endianness (otherwise they wouldn't really be
// char16_t / char32_t). // char16_t / char32_t).
//
// Note that the KJ Unicode encoding and decoding functions actually implement
// [WTF-8 encoding](http://simonsapin.github.io/wtf-8/), which affects how invalid input is
// handled. See comments on decodeUtf16() for more info.
EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16); EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16);
EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32); EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32);
...@@ -71,10 +78,34 @@ EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32); ...@@ -71,10 +78,34 @@ EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32);
// The input should NOT include a NUL terminator; any NUL characters in the input array will be // The input should NOT include a NUL terminator; any NUL characters in the input array will be
// preserved in the output. // preserved in the output.
// //
// The `try` versions return null if the input is invalid; the non-`try` versions return data
// containing the Unicode replacement character (U+FFFD).
//
// The input must be in platform-native endianness. BOMs are NOT recognized by these functions. // The input must be in platform-native endianness. BOMs are NOT recognized by these functions.
//
// Note that the KJ Unicode encoding and decoding functions actually implement
// [WTF-8 encoding](http://simonsapin.github.io/wtf-8/). This means that if you start with an array
// of char16_t and you pass it through any number of conversions to other Unicode encodings,
// eventually returning it to UTF-16, all the while ignoring `hadErrors`, you will end up with
// exactly the same char16_t array you started with, *even if* the array is not valid UTF-16. This
// is useful because many real-world systems that were designed for UCS-2 (plain 16-bit Unicode)
// and later "upgraded" to UTF-16 do not enforce that their UTF-16 is well-formed. For example,
// file names on Windows NT are encoded using 16-bit characters, without enforcing that the
// character sequence is valid UTF-16. It is important that programs on Windows be able to handle
// such filenames, even if they choose to convert the name to UTF-8 for internal processing.
//
// Specifically, KJ's Unicode handling allows unpaired surrogate code points to round-trip through
// UTF-8 and UTF-32. Unpaired surrogates will be flagged as an error (setting `hadErrors` in the
// result), but will NOT be replaced with the Unicode replacement character as other erroneous
// sequences would be, but rather encoded as an invalid surrogate codepoint in the target encoding.
//
// KJ makes the following guarantees about invalid input:
// - A round trip from UTF-16 to other encodings and back will produce exactly the original input,
// with every leg of the trip raising the `hadErrors` flag if the original input was not valid.
// - A round trip from UTF-8 or UTF-32 to other encodings and back will either produce exactly
// the original input, or will have replaced some invalid sequences with the Unicode replacement
// character, U+FFFD. No code units will ever be removed unless they are replaced with U+FFFD,
// and no code units will ever be added except to encode U+FFFD. If the original input was not
// valid, the `hadErrors` flag will be raised on the first leg of the trip, and will also be
// raised on subsequent legs unless all invalid sequences were replaced with U+FFFD (which, after
// all, is a valid code point).
String encodeHex(ArrayPtr<const byte> bytes); String encodeHex(ArrayPtr<const byte> bytes);
EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text); EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment