// Copyright (c) 2017 Cloudflare, Inc. and contributors // Licensed under the MIT License: // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include "encoding.h" #include <kj/test.h> #include <stdint.h> namespace kj { namespace { CappedArray<char, sizeof(char ) * 2 + 1> hex(byte i) { return kj::hex((uint8_t )i); } CappedArray<char, sizeof(char ) * 2 + 1> hex(char i) { return kj::hex((uint8_t )i); } CappedArray<char, sizeof(char16_t) * 2 + 1> hex(char16_t i) { return kj::hex((uint16_t)i); } CappedArray<char, sizeof(char32_t) * 2 + 1> hex(char32_t i) { return kj::hex((uint32_t)i); } CappedArray<char, sizeof(uint32_t) * 2 + 1> hex(wchar_t i) { return kj::hex((uint32_t)i); } // Hexify chars correctly. // // TODO(cleanup): Should this go into string.h with the other definitions of hex()? template <typename T, typename U> void expectResImpl(EncodingResult<T> result, ArrayPtr<const U> expected, bool errors = false) { if (errors) { KJ_EXPECT(result.hadErrors); } else { KJ_EXPECT(!result.hadErrors); } KJ_EXPECT(result.size() == expected.size(), result.size(), expected.size()); for (auto i: kj::zeroTo(kj::min(result.size(), expected.size()))) { KJ_EXPECT(result[i] == expected[i], i, hex(result[i]), hex(expected[i])); } } template <typename T, typename U, size_t s> void expectRes(EncodingResult<T> result, const U (&expected)[s], bool errors = false) { expectResImpl(kj::mv(result), arrayPtr(expected, s - 1), errors); } template <typename T, size_t s> void expectRes(EncodingResult<T> result, byte (&expected)[s], bool errors = false) { expectResImpl(kj::mv(result), arrayPtr<const byte>(expected, s), errors); } // Handy reference for surrogate pair edge cases: // // \ud800 -> \xed\xa0\x80 // \udc00 -> \xed\xb0\x80 // \udbff -> \xed\xaf\xbf // \udfff -> \xed\xbf\xbf KJ_TEST("encode UTF-8 to UTF-16") { expectRes(encodeUtf16(u8"foo"), u"foo"); expectRes(encodeUtf16(u8"Здравствуйте"), u"Здравствуйте"); expectRes(encodeUtf16(u8"中国网络"), u"中国网络"); expectRes(encodeUtf16(u8"😺☁☄🐵"), u"😺☁☄🐵"); } KJ_TEST("invalid UTF-8 to UTF-16") { // Disembodied continuation byte. expectRes(encodeUtf16("\x80"), u"\ufffd", true); expectRes(encodeUtf16("f\xbfo"), u"f\ufffdo", true); expectRes(encodeUtf16("f\xbf\x80\xb0o"), u"f\ufffdo", true); // Missing continuation bytes. expectRes(encodeUtf16("\xc2x"), u"\ufffdx", true); expectRes(encodeUtf16("\xe0x"), u"\ufffdx", true); expectRes(encodeUtf16("\xe0\xa0x"), u"\ufffdx", true); expectRes(encodeUtf16("\xf0x"), u"\ufffdx", true); expectRes(encodeUtf16("\xf0\x90x"), u"\ufffdx", true); expectRes(encodeUtf16("\xf0\x90\x80x"), u"\ufffdx", true); // Overlong sequences. expectRes(encodeUtf16("\xc0\x80"), u"\ufffd", true); expectRes(encodeUtf16("\xc1\xbf"), u"\ufffd", true); expectRes(encodeUtf16("\xc2\x80"), u"\u0080", false); expectRes(encodeUtf16("\xdf\xbf"), u"\u07ff", false); expectRes(encodeUtf16("\xe0\x80\x80"), u"\ufffd", true); expectRes(encodeUtf16("\xe0\x9f\xbf"), u"\ufffd", true); expectRes(encodeUtf16("\xe0\xa0\x80"), u"\u0800", false); expectRes(encodeUtf16("\xef\xbf\xbe"), u"\ufffe", false); // Due to a classic off-by-one error, GCC 4.x rather hilariously encodes '\uffff' as the // "surrogate pair" 0xd7ff, 0xdfff: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=41698 if (kj::size(u"\uffff") == 2) { expectRes(encodeUtf16("\xef\xbf\xbf"), u"\uffff", false); } expectRes(encodeUtf16("\xf0\x80\x80\x80"), u"\ufffd", true); expectRes(encodeUtf16("\xf0\x8f\xbf\xbf"), u"\ufffd", true); expectRes(encodeUtf16("\xf0\x90\x80\x80"), u"\U00010000", false); expectRes(encodeUtf16("\xf4\x8f\xbf\xbf"), u"\U0010ffff", false); // Out of Unicode range. expectRes(encodeUtf16("\xf5\x80\x80\x80"), u"\ufffd", true); expectRes(encodeUtf16("\xf8\xbf\x80\x80\x80"), u"\ufffd", true); expectRes(encodeUtf16("\xfc\xbf\x80\x80\x80\x80"), u"\ufffd", true); expectRes(encodeUtf16("\xfe\xbf\x80\x80\x80\x80\x80"), u"\ufffd", true); expectRes(encodeUtf16("\xff\xbf\x80\x80\x80\x80\x80\x80"), u"\ufffd", true); // Surrogates encoded as separate UTF-8 code points are flagged as errors but allowed to decode // to UTF-16 surrogate values. expectRes(encodeUtf16("\xed\xb0\x80\xed\xaf\xbf"), u"\xdc00\xdbff", true); expectRes(encodeUtf16("\xed\xbf\xbf\xed\xa0\x80"), u"\xdfff\xd800", true); expectRes(encodeUtf16("\xed\xb0\x80\xed\xbf\xbf"), u"\xdc00\xdfff", true); expectRes(encodeUtf16("f\xed\xa0\x80"), u"f\xd800", true); expectRes(encodeUtf16("f\xed\xa0\x80x"), u"f\xd800x", true); expectRes(encodeUtf16("f\xed\xa0\x80\xed\xa0\x80x"), u"f\xd800\xd800x", true); // However, if successive UTF-8 codepoints decode to a proper surrogate pair, the second // surrogate is replaced with the Unicode replacement character to avoid creating valid UTF-16. expectRes(encodeUtf16("\xed\xa0\x80\xed\xbf\xbf"), u"\xd800\xfffd", true); expectRes(encodeUtf16("\xed\xaf\xbf\xed\xb0\x80"), u"\xdbff\xfffd", true); } KJ_TEST("encode UTF-8 to UTF-32") { expectRes(encodeUtf32(u8"foo"), U"foo"); expectRes(encodeUtf32(u8"Здравствуйте"), U"Здравствуйте"); expectRes(encodeUtf32(u8"中国网络"), U"中国网络"); expectRes(encodeUtf32(u8"😺☁☄🐵"), U"😺☁☄🐵"); } KJ_TEST("invalid UTF-8 to UTF-32") { // Disembodied continuation byte. expectRes(encodeUtf32("\x80"), U"\ufffd", true); expectRes(encodeUtf32("f\xbfo"), U"f\ufffdo", true); expectRes(encodeUtf32("f\xbf\x80\xb0o"), U"f\ufffdo", true); // Missing continuation bytes. expectRes(encodeUtf32("\xc2x"), U"\ufffdx", true); expectRes(encodeUtf32("\xe0x"), U"\ufffdx", true); expectRes(encodeUtf32("\xe0\xa0x"), U"\ufffdx", true); expectRes(encodeUtf32("\xf0x"), U"\ufffdx", true); expectRes(encodeUtf32("\xf0\x90x"), U"\ufffdx", true); expectRes(encodeUtf32("\xf0\x90\x80x"), U"\ufffdx", true); // Overlong sequences. expectRes(encodeUtf32("\xc0\x80"), U"\ufffd", true); expectRes(encodeUtf32("\xc1\xbf"), U"\ufffd", true); expectRes(encodeUtf32("\xc2\x80"), U"\u0080", false); expectRes(encodeUtf32("\xdf\xbf"), U"\u07ff", false); expectRes(encodeUtf32("\xe0\x80\x80"), U"\ufffd", true); expectRes(encodeUtf32("\xe0\x9f\xbf"), U"\ufffd", true); expectRes(encodeUtf32("\xe0\xa0\x80"), U"\u0800", false); expectRes(encodeUtf32("\xef\xbf\xbf"), U"\uffff", false); expectRes(encodeUtf32("\xf0\x80\x80\x80"), U"\ufffd", true); expectRes(encodeUtf32("\xf0\x8f\xbf\xbf"), U"\ufffd", true); expectRes(encodeUtf32("\xf0\x90\x80\x80"), U"\U00010000", false); expectRes(encodeUtf32("\xf4\x8f\xbf\xbf"), U"\U0010ffff", false); // Out of Unicode range. expectRes(encodeUtf32("\xf5\x80\x80\x80"), U"\ufffd", true); expectRes(encodeUtf32("\xf8\xbf\x80\x80\x80"), U"\ufffd", true); expectRes(encodeUtf32("\xfc\xbf\x80\x80\x80\x80"), U"\ufffd", true); expectRes(encodeUtf32("\xfe\xbf\x80\x80\x80\x80\x80"), U"\ufffd", true); expectRes(encodeUtf32("\xff\xbf\x80\x80\x80\x80\x80\x80"), U"\ufffd", true); } KJ_TEST("decode UTF-16 to UTF-8") { expectRes(decodeUtf16(u"foo"), u8"foo"); expectRes(decodeUtf16(u"Здравствуйте"), u8"Здравствуйте"); expectRes(decodeUtf16(u"中国网络"), u8"中国网络"); expectRes(decodeUtf16(u"😺☁☄🐵"), u8"😺☁☄🐵"); } KJ_TEST("invalid UTF-16 to UTF-8") { // Surrogates in wrong order. expectRes(decodeUtf16(u"\xdc00\xdbff"), "\xed\xb0\x80\xed\xaf\xbf", true); expectRes(decodeUtf16(u"\xdfff\xd800"), "\xed\xbf\xbf\xed\xa0\x80", true); // Missing second surrogate. expectRes(decodeUtf16(u"f\xd800"), "f\xed\xa0\x80", true); expectRes(decodeUtf16(u"f\xd800x"), "f\xed\xa0\x80x", true); expectRes(decodeUtf16(u"f\xd800\xd800x"), "f\xed\xa0\x80\xed\xa0\x80x", true); } KJ_TEST("decode UTF-32 to UTF-8") { expectRes(decodeUtf32(U"foo"), u8"foo"); expectRes(decodeUtf32(U"Здравствуйте"), u8"Здравствуйте"); expectRes(decodeUtf32(U"中国网络"), u8"中国网络"); expectRes(decodeUtf32(U"😺☁☄🐵"), u8"😺☁☄🐵"); } KJ_TEST("invalid UTF-32 to UTF-8") { // Surrogates rejected. expectRes(decodeUtf32(U"\xdfff\xd800"), "\xed\xbf\xbf\xed\xa0\x80", true); // Even if it would be a valid surrogate pair in UTF-16. expectRes(decodeUtf32(U"\xd800\xdfff"), "\xed\xa0\x80\xed\xbf\xbf", true); } KJ_TEST("round-trip invalid UTF-16") { const char16_t INVALID[] = u"\xdfff foo \xd800\xdc00 bar \xdc00\xd800 baz \xdbff qux \xd800"; expectRes(encodeUtf16(decodeUtf16(INVALID)), INVALID, true); expectRes(encodeUtf16(decodeUtf32(encodeUtf32(decodeUtf16(INVALID)))), INVALID, true); } KJ_TEST("EncodingResult as a Maybe") { KJ_IF_MAYBE(result, encodeUtf16("\x80")) { KJ_FAIL_EXPECT("expected failure"); } KJ_IF_MAYBE(result, encodeUtf16("foo")) { // good } else { KJ_FAIL_EXPECT("expected success"); } KJ_EXPECT(KJ_ASSERT_NONNULL(decodeUtf16(u"foo")) == "foo"); } KJ_TEST("encode to wchar_t") { expectRes(encodeWideString(u8"foo"), L"foo"); expectRes(encodeWideString(u8"Здравствуйте"), L"Здравствуйте"); expectRes(encodeWideString(u8"中国网络"), L"中国网络"); expectRes(encodeWideString(u8"😺☁☄🐵"), L"😺☁☄🐵"); } KJ_TEST("decode from wchar_t") { expectRes(decodeWideString(L"foo"), u8"foo"); expectRes(decodeWideString(L"Здравствуйте"), u8"Здравствуйте"); expectRes(decodeWideString(L"中国网络"), u8"中国网络"); expectRes(decodeWideString(L"😺☁☄🐵"), u8"😺☁☄🐵"); } // ======================================================================================= KJ_TEST("hex encoding/decoding") { byte bytes[] = {0x12, 0x34, 0xab, 0xf2}; KJ_EXPECT(encodeHex(bytes) == "1234abf2"); expectRes(decodeHex("1234abf2"), bytes); expectRes(decodeHex("1234abf21"), bytes, true); bytes[2] = 0xa0; expectRes(decodeHex("1234axf2"), bytes, true); bytes[2] = 0x0b; expectRes(decodeHex("1234xbf2"), bytes, true); } constexpr char RFC2396_FRAGMENT_SET_DIFF[] = "#$&+,/:;=?@[\\]^{|}"; // These are the characters reserved in RFC 2396, but not in the fragment percent encode set. KJ_TEST("URI encoding/decoding") { KJ_EXPECT(encodeUriComponent("foo") == "foo"); KJ_EXPECT(encodeUriComponent("foo bar") == "foo%20bar"); KJ_EXPECT(encodeUriComponent("\xab\xba") == "%AB%BA"); KJ_EXPECT(encodeUriComponent(StringPtr("foo\0bar", 7)) == "foo%00bar"); KJ_EXPECT(encodeUriComponent(RFC2396_FRAGMENT_SET_DIFF) == "%23%24%26%2B%2C%2F%3A%3B%3D%3F%40%5B%5C%5D%5E%7B%7C%7D"); // Encode characters reserved by application/x-www-form-urlencoded, but not by RFC 2396. KJ_EXPECT(encodeUriComponent("'foo'! (~)") == "'foo'!%20(~)"); expectRes(decodeUriComponent("foo%20bar"), "foo bar"); expectRes(decodeUriComponent("%ab%BA"), "\xab\xba"); expectRes(decodeUriComponent("foo%1xxx"), "foo\1xxx", true); expectRes(decodeUriComponent("foo%1"), "foo\1", true); expectRes(decodeUriComponent("foo%xxx"), "fooxxx", true); expectRes(decodeUriComponent("foo%"), "foo", true); { byte bytes[] = {12, 34, 56}; KJ_EXPECT(decodeBinaryUriComponent(encodeUriComponent(bytes)).asPtr() == bytes); // decodeBinaryUriComponent() takes a DecodeUriOptions struct as its second parameter, but it // once took a single `bool nulTerminate`. Verify that the old behavior still compiles and // works. auto bytesWithNul = decodeBinaryUriComponent(encodeUriComponent(bytes), true); KJ_ASSERT(bytesWithNul.size() == 4); KJ_EXPECT(bytesWithNul[3] == '\0'); KJ_EXPECT(bytesWithNul.slice(0, 3) == bytes); } } KJ_TEST("URL component encoding") { KJ_EXPECT(encodeUriFragment("foo") == "foo"); KJ_EXPECT(encodeUriFragment("foo bar") == "foo%20bar"); KJ_EXPECT(encodeUriFragment("\xab\xba") == "%AB%BA"); KJ_EXPECT(encodeUriFragment(StringPtr("foo\0bar", 7)) == "foo%00bar"); KJ_EXPECT(encodeUriFragment(RFC2396_FRAGMENT_SET_DIFF) == RFC2396_FRAGMENT_SET_DIFF); KJ_EXPECT(encodeUriPath("foo") == "foo"); KJ_EXPECT(encodeUriPath("foo bar") == "foo%20bar"); KJ_EXPECT(encodeUriPath("\xab\xba") == "%AB%BA"); KJ_EXPECT(encodeUriPath(StringPtr("foo\0bar", 7)) == "foo%00bar"); KJ_EXPECT(encodeUriPath(RFC2396_FRAGMENT_SET_DIFF) == "%23$&+,%2F:;=%3F@[%5C]^%7B|%7D"); KJ_EXPECT(encodeUriUserInfo("foo") == "foo"); KJ_EXPECT(encodeUriUserInfo("foo bar") == "foo%20bar"); KJ_EXPECT(encodeUriUserInfo("\xab\xba") == "%AB%BA"); KJ_EXPECT(encodeUriUserInfo(StringPtr("foo\0bar", 7)) == "foo%00bar"); KJ_EXPECT(encodeUriUserInfo(RFC2396_FRAGMENT_SET_DIFF) == "%23$&+,%2F%3A%3B%3D%3F%40%5B%5C%5D%5E%7B%7C%7D"); // NOTE: None of these functions have explicit decode equivalents. } KJ_TEST("application/x-www-form-urlencoded encoding/decoding") { KJ_EXPECT(encodeWwwForm("foo") == "foo"); KJ_EXPECT(encodeWwwForm("foo bar") == "foo+bar"); KJ_EXPECT(encodeWwwForm("\xab\xba") == "%AB%BA"); KJ_EXPECT(encodeWwwForm(StringPtr("foo\0bar", 7)) == "foo%00bar"); // Encode characters reserved by application/x-www-form-urlencoded, but not by RFC 2396. KJ_EXPECT(encodeWwwForm("'foo'! (~)") == "%27foo%27%21+%28%7E%29"); expectRes(decodeWwwForm("foo%20bar"), "foo bar"); expectRes(decodeWwwForm("foo+bar"), "foo bar"); expectRes(decodeWwwForm("%ab%BA"), "\xab\xba"); expectRes(decodeWwwForm("foo%1xxx"), "foo\1xxx", true); expectRes(decodeWwwForm("foo%1"), "foo\1", true); expectRes(decodeWwwForm("foo%xxx"), "fooxxx", true); expectRes(decodeWwwForm("foo%"), "foo", true); { byte bytes[] = {12, 34, 56}; DecodeUriOptions options { /*.nulTerminate=*/false, /*.plusToSpace=*/true }; KJ_EXPECT(decodeBinaryUriComponent(encodeWwwForm(bytes), options) == bytes); } } KJ_TEST("C escape encoding/decoding") { KJ_EXPECT(encodeCEscape("fooo\a\b\f\n\r\t\v\'\"\\bar") == "fooo\\a\\b\\f\\n\\r\\t\\v\\\'\\\"\\\\bar"); KJ_EXPECT(encodeCEscape("foo\x01\x7fxxx") == "foo\\001\\177xxx"); expectRes(decodeCEscape("fooo\\a\\b\\f\\n\\r\\t\\v\\\'\\\"\\\\bar"), "fooo\a\b\f\n\r\t\v\'\"\\bar"); expectRes(decodeCEscape("foo\\x01\\x7fxxx"), "foo\x01\x7fxxx"); expectRes(decodeCEscape("foo\\001\\177234"), "foo\001\177234"); expectRes(decodeCEscape("foo\\x1"), "foo\x1"); expectRes(decodeCEscape("foo\\1"), "foo\1"); expectRes(decodeCEscape("foo\\u1234bar"), u8"foo\u1234bar"); expectRes(decodeCEscape("foo\\U00045678bar"), u8"foo\U00045678bar"); // Error cases. expectRes(decodeCEscape("foo\\"), "foo", true); expectRes(decodeCEscape("foo\\x123x"), u8"foo\x23x", true); expectRes(decodeCEscape("foo\\u12"), u8"foo\u0012", true); expectRes(decodeCEscape("foo\\u12xxx"), u8"foo\u0012xxx", true); expectRes(decodeCEscape("foo\\U12"), u8"foo\u0012", true); expectRes(decodeCEscape("foo\\U12xxxxxxxx"), u8"foo\u0012xxxxxxxx", true); } KJ_TEST("base64 encoding/decoding") { { auto encoded = encodeBase64(StringPtr("").asBytes(), false); KJ_EXPECT(encoded == "", encoded, encoded.size()); KJ_EXPECT(heapString(decodeBase64(encoded.asArray()).asChars()) == ""); } { auto encoded = encodeBase64(StringPtr("foo").asBytes(), false); KJ_EXPECT(encoded == "Zm9v", encoded, encoded.size()); auto decoded = decodeBase64(encoded.asArray()); KJ_EXPECT(!decoded.hadErrors); KJ_EXPECT(heapString(decoded.asChars()) == "foo"); } { auto encoded = encodeBase64(StringPtr("quux").asBytes(), false); KJ_EXPECT(encoded == "cXV1eA==", encoded, encoded.size()); KJ_EXPECT(heapString(decodeBase64(encoded.asArray()).asChars()) == "quux"); } { auto encoded = encodeBase64(StringPtr("corge").asBytes(), false); KJ_EXPECT(encoded == "Y29yZ2U=", encoded); auto decoded = decodeBase64(encoded.asArray()); KJ_EXPECT(!decoded.hadErrors); KJ_EXPECT(heapString(decoded.asChars()) == "corge"); } { auto decoded = decodeBase64("Y29yZ2U"); KJ_EXPECT(!decoded.hadErrors); KJ_EXPECT(heapString(decoded.asChars()) == "corge"); } { auto decoded = decodeBase64("Y\n29y Z@2U=\n"); KJ_EXPECT(decoded.hadErrors); // @-sign is invalid base64 input. KJ_EXPECT(heapString(decoded.asChars()) == "corge"); } { auto decoded = decodeBase64("Y\n29y Z2U=\n"); KJ_EXPECT(!decoded.hadErrors); KJ_EXPECT(heapString(decoded.asChars()) == "corge"); } // Too much padding. KJ_EXPECT(decodeBase64("Y29yZ2U==").hadErrors); KJ_EXPECT(decodeBase64("Y29yZ===").hadErrors); // Non-terminal padding. KJ_EXPECT(decodeBase64("ab=c").hadErrors); { auto encoded = encodeBase64(StringPtr("corge").asBytes(), true); KJ_EXPECT(encoded == "Y29yZ2U=\n", encoded); } StringPtr fullLine = "012345678901234567890123456789012345678901234567890123"; { auto encoded = encodeBase64(fullLine.asBytes(), false); KJ_EXPECT( encoded == "MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIz", encoded); } { auto encoded = encodeBase64(fullLine.asBytes(), true); KJ_EXPECT( encoded == "MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIz\n", encoded); } String multiLine = str(fullLine, "456"); { auto encoded = encodeBase64(multiLine.asBytes(), false); KJ_EXPECT( encoded == "MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2", encoded); } { auto encoded = encodeBase64(multiLine.asBytes(), true); KJ_EXPECT( encoded == "MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIz\n" "NDU2\n", encoded); } } } // namespace } // namespace kj