Commit 5483d8f7 authored by Kenton Varda

Extend Unicode encoders to support 'WTF-8'.

This allows arbitrary char16 arrays to round-trip through UTF-8 without losing information, even if the char16 arrays are not valid UTF-16.

This is necessary e.g. for filesystem manipulation on Windows, where filenames contain 16-bit characters but valid UTF-16 is not enforced.

Invalid UTF-16 represented in UTF-8 is affectionately known as WTF-8: http://simonsapin.github.io/wtf-8/
parent d3278477
......@@ -64,6 +64,13 @@ void expectRes(EncodingResult<T> result,
expectResImpl(kj::mv(result), arrayPtr<const byte>(expected, s), errors);
}
// Handy reference for surrogate pair edge cases:
//
// \ud800 -> \xed\xa0\x80
// \udc00 -> \xed\xb0\x80
// \udbff -> \xed\xaf\xbf
// \udfff -> \xed\xbf\xbf
KJ_TEST("encode UTF-8 to UTF-16") {
expectRes(encodeUtf16(u8"foo"), u"foo");
expectRes(encodeUtf16(u8"Здравствуйте"), u"Здравствуйте");
......@@ -113,6 +120,26 @@ KJ_TEST("invalid UTF-8 to UTF-16") {
expectRes(encodeUtf16("\xfc\xbf\x80\x80\x80\x80"), u"\ufffd", true);
expectRes(encodeUtf16("\xfe\xbf\x80\x80\x80\x80\x80"), u"\ufffd", true);
expectRes(encodeUtf16("\xff\xbf\x80\x80\x80\x80\x80\x80"), u"\ufffd", true);
// Surrogates encoded as separate UTF-8 code points are flagged as errors but allowed to decode
// to UTF-16 surrogate values.
expectRes(encodeUtf16(u8"\ud7ff\xed\xb0\x80\xed\xaf\xbf\ue000"),
u"\xd7ff\xdc00\xdbff\xe000", true);
expectRes(encodeUtf16(u8"\ud7ff\xed\xbf\xbf\xed\xa0\x80\ue000"),
u"\xd7ff\xdfff\xd800\xe000", true);
expectRes(encodeUtf16(u8"\ud7ff\xed\xb0\x80\xed\xbf\xbf\ue000"),
u"\xd7ff\xdc00\xdfff\xe000", true);
expectRes(encodeUtf16(u8"f\xed\xa0\x80"), u"f\xd800", true);
expectRes(encodeUtf16(u8"f\xed\xa0\x80x"), u"f\xd800x", true);
expectRes(encodeUtf16(u8"f\xed\xa0\x80\xed\xa0\x80x"), u"f\xd800\xd800x", true);
// However, if successive UTF-8 codepoints decode to a proper surrogate pair, the second
// surrogate is replaced with the Unicode replacement character to avoid creating valid UTF-16.
expectRes(encodeUtf16(u8"\ud7ff\xed\xa0\x80\xed\xbf\xbf\ue000"),
u"\xd7ff\xd800\xfffd\xe000", true);
expectRes(encodeUtf16(u8"\ud7ff\xed\xaf\xbf\xed\xb0\x80\ue000"),
u"\xd7ff\xdbff\xfffd\xe000", true);
}
KJ_TEST("encode UTF-8 to UTF-32") {
......@@ -169,12 +196,15 @@ KJ_TEST("decode UTF-16 to UTF-8") {
KJ_TEST("invalid UTF-16 to UTF-8") {
// Feeds malformed UTF-16 to decodeUtf16(). Every case passes `true` as the final argument to
// expectRes().
//
// NOTE(review): the "missing second surrogate" inputs below appear twice with different expected
// output (U+FFFD vs. a three-byte encoding of the surrogate itself). Presumably this listing
// interleaves the pre-change and post-change expectations of the commit -- confirm which set is
// current before relying on it.

// Surrogates in wrong order.
// Expectation: each misplaced surrogate becomes U+FFFD.
expectRes(decodeUtf16(u"\xd7ff\xdc00\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true);
// Expectation: each misplaced surrogate is emitted as its own three-byte sequence.
expectRes(decodeUtf16(u"\xd7ff\xdc00\xdbff\xe000"),
u8"\ud7ff\xed\xb0\x80\xed\xaf\xbf\ue000", true);
expectRes(decodeUtf16(u"\xd7ff\xdfff\xd800\xe000"),
u8"\ud7ff\xed\xbf\xbf\xed\xa0\x80\ue000", true);
// Missing second surrogate.
// Expectation: the lone \xd800 becomes U+FFFD.
expectRes(decodeUtf16(u"f\xd800"), u8"f\ufffd", true);
expectRes(decodeUtf16(u"f\xd800x"), u8"f\ufffdx", true);
expectRes(decodeUtf16(u"f\xd800\xd800x"), u8"f\ufffd\ufffdx", true);
// Expectation: the lone \xd800 is emitted as the three bytes \xed\xa0\x80.
expectRes(decodeUtf16(u"f\xd800"), u8"f\xed\xa0\x80", true);
expectRes(decodeUtf16(u"f\xd800x"), u8"f\xed\xa0\x80x", true);
expectRes(decodeUtf16(u"f\xd800\xd800x"), u8"f\xed\xa0\x80\xed\xa0\x80x", true);
}
KJ_TEST("decode UTF-32 to UTF-8") {
......@@ -186,10 +216,19 @@ KJ_TEST("decode UTF-32 to UTF-8") {
KJ_TEST("invalid UTF-32 to UTF-8") {
// Feeds UTF-32 containing surrogate code points to decodeUtf32(). Every case passes `true` as
// the final argument to expectRes().
//
// NOTE(review): the input \xd7ff\xd800\xdfff\xe000 appears twice below with different expected
// output (U+FFFD vs. three-byte encodings of each surrogate). Presumably this listing interleaves
// the pre-change and post-change expectations of the commit -- confirm which set is current.

// Surrogates rejected.
// Expectation: each surrogate becomes U+FFFD.
expectRes(decodeUtf32(U"\xd7ff\xdc00\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true);
// Expectation: each surrogate is emitted as its own three-byte sequence.
expectRes(decodeUtf32(U"\xd7ff\xdfff\xd800\xe000"),
u8"\ud7ff\xed\xbf\xbf\xed\xa0\x80\ue000", true);
// Even if it would be a valid surrogate pair in UTF-16.
expectRes(decodeUtf32(U"\xd7ff\xd800\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true);
expectRes(decodeUtf32(U"\xd7ff\xd800\xdfff\xe000"),
u8"\ud7ff\xed\xa0\x80\xed\xbf\xbf\ue000", true);
}
KJ_TEST("round-trip invalid UTF-16") {
  // Ill-formed UTF-16 sample: a lone low surrogate, one well-formed pair, a reversed pair, and
  // two lone high surrogates (the last one at the very end of the text).
  const char16_t MALFORMED[] =
      u"\xdfff foo \xd800\xdc00 bar \xdc00\xd800 baz \xdbff qux \xd800";

  // UTF-16 -> UTF-8 -> UTF-16 must reproduce the input exactly, with errors flagged.
  expectRes(encodeUtf16(decodeUtf16(MALFORMED)), MALFORMED, true);

  // UTF-16 -> UTF-8 -> UTF-32 -> UTF-8 -> UTF-16 must reproduce it as well.
  expectRes(encodeUtf16(decodeUtf32(encodeUtf32(decodeUtf16(MALFORMED)))), MALFORMED, true);
}
KJ_TEST("EncodingResult as a Maybe") {
......
......@@ -79,8 +79,23 @@ EncodingResult<Array<T>> encodeUtf(ArrayPtr<const char> text, bool nulTerminate)
// Disallow overlong sequence.
GOTO_ERROR_IF(u < 0x0800);
// Disallow surrogate pair code points.
GOTO_ERROR_IF((u & 0xf800) == 0xd800);
// Flag surrogate pair code points as errors, but allow them through.
if (KJ_UNLIKELY((u & 0xf800) == 0xd800)) {
if (result.size() > 0 &&
(u & 0xfc00) == 0xdc00 &&
(result.back() & 0xfc00) == 0xd800) {
// Whoops, the *previous* character was also an invalid surrogate, and if we add this
// one too, they'll form a valid surrogate pair. If we allowed this, then it would mean
// invalid UTF-8 round-tripped to UTF-16 and back could actually change meaning entirely.
// OTOH, the reason we allow dangling surrogates is to allow invalid UTF-16 to round-trip
// to UTF-8 without loss, but if the original UTF-16 had a valid surrogate pair, it would
// have been encoded as a valid single UTF-8 codepoint, not as separate UTF-8 codepoints
// for each surrogate.
goto error;
}
hadErrors = true;
}
result.add(u);
continue;
......@@ -153,9 +168,12 @@ EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) {
} else if ((u & 0xf800) == 0xd800) {
// surrogate pair
char16_t u2;
GOTO_ERROR_IF(i == utf16.size() // missing second half
if (KJ_UNLIKELY(i == utf16.size() // missing second half
|| (u & 0x0400) != 0 // first half in wrong range
|| ((u2 = utf16[i]) & 0xfc00) != 0xdc00); // second half in wrong range
|| ((u2 = utf16[i]) & 0xfc00) != 0xdc00)) { // second half in wrong range
hadErrors = true;
goto threeByte;
}
++i;
char32_t u32 = (((u & 0x03ff) << 10) | (u2 & 0x03ff)) + 0x10000;
......@@ -167,6 +185,7 @@ EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) {
});
continue;
} else {
threeByte:
result.addAll<std::initializer_list<char>>({
static_cast<char>(((u >> 12) ) | 0xe0),
static_cast<char>(((u >> 6) & 0x3f) | 0x80),
......@@ -174,10 +193,6 @@ EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) {
});
continue;
}
error:
result.addAll(StringPtr(u8"\ufffd"));
hadErrors = true;
}
result.add(0);
......@@ -202,7 +217,10 @@ EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf16) {
});
continue;
} else if (u < 0x10000) {
GOTO_ERROR_IF((u & 0xfffff800) == 0xd800); // no surrogates allowed in utf-32
if (KJ_UNLIKELY((u & 0xfffff800) == 0xd800)) {
// no surrogates allowed in utf-32
hadErrors = true;
}
result.addAll<std::initializer_list<char>>({
static_cast<char>(((u >> 12) ) | 0xe0),
static_cast<char>(((u >> 6) & 0x3f) | 0x80),
......
......@@ -52,17 +52,24 @@ struct EncodingResult: public ResultType {
const bool hadErrors;
};
// KJ_STRINGIFY overload for EncodingResult<T>: casts the result to `const T&` and forwards it to
// toCharSequence(), so only the converted `T` part is stringified (`hadErrors` is not passed
// along). The trailing decltype return type means this overload is only viable when
// toCharSequence() accepts a `const T&`.
template <typename T>
inline auto KJ_STRINGIFY(const EncodingResult<T>& value)
-> decltype(toCharSequence(implicitCast<const T&>(value))) {
return toCharSequence(implicitCast<const T&>(value));
}
EncodingResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false);
// Convert UTF-8 text (which KJ strings use) to UTF-16 or UTF-32.
//
// If `nulTerminate` is true, an extra NUL character will be added to the end of the output.
//
// The `try` versions return null if the input is invalid; the non-`try` versions return data
// containing the Unicode replacement character (U+FFFD).
//
// The returned arrays are in platform-native endianness (otherwise they wouldn't really be
// char16_t / char32_t).
//
// Note that the KJ Unicode encoding and decoding functions actually implement
// [WTF-8 encoding](http://simonsapin.github.io/wtf-8/), which affects how invalid input is
// handled. See comments on decodeUtf16() for more info.
EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16);
EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32);
......@@ -71,10 +78,34 @@ EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32);
// The input should NOT include a NUL terminator; any NUL characters in the input array will be
// preserved in the output.
//
// The `try` versions return null if the input is invalid; the non-`try` versions return data
// containing the Unicode replacement character (U+FFFD).
//
// The input must be in platform-native endianness. BOMs are NOT recognized by these functions.
//
// Note that the KJ Unicode encoding and decoding functions actually implement
// [WTF-8 encoding](http://simonsapin.github.io/wtf-8/). This means that if you start with an array
// of char16_t and you pass it through any number of conversions to other Unicode encodings,
// eventually returning it to UTF-16, all the while ignoring `hadErrors`, you will end up with
// exactly the same char16_t array you started with, *even if* the array is not valid UTF-16. This
// is useful because many real-world systems that were designed for UCS-2 (plain 16-bit Unicode)
// and later "upgraded" to UTF-16 do not enforce that their UTF-16 is well-formed. For example,
// file names on Windows NT are encoded using 16-bit characters, without enforcing that the
// character sequence is valid UTF-16. It is important that programs on Windows be able to handle
// such filenames, even if they choose to convert the name to UTF-8 for internal processing.
//
// Specifically, KJ's Unicode handling allows unpaired surrogate code points to round-trip through
// UTF-8 and UTF-32. Unpaired surrogates will be flagged as an error (setting `hadErrors` in the
// result), but unlike other erroneous sequences they will NOT be replaced with the Unicode
// replacement character; instead, they are encoded as an invalid surrogate codepoint in the
// target encoding.
//
// KJ makes the following guarantees about invalid input:
// - A round trip from UTF-16 to other encodings and back will produce exactly the original input,
// with every leg of the trip raising the `hadErrors` flag if the original input was not valid.
// - A round trip from UTF-8 or UTF-32 to other encodings and back will either produce exactly
// the original input, or will have replaced some invalid sequences with the Unicode replacement
// character, U+FFFD. No code units will ever be removed unless they are replaced with U+FFFD,
// and no code units will ever be added except to encode U+FFFD. If the original input was not
// valid, the `hadErrors` flag will be raised on the first leg of the trip, and will also be
// raised on subsequent legs unless all invalid sequences were replaced with U+FFFD (which, after
// all, is a valid code point).
String encodeHex(ArrayPtr<const byte> bytes);
EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment