// encoding-test.c++
// Copyright (c) 2017 Cloudflare, Inc. and contributors
// Licensed under the MIT License:
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#include "encoding.h"
#include <kj/test.h>
#include <stdint.h>

namespace kj {
namespace {

// Hexify chars correctly.
//
// Each overload casts its character type to the unsigned integer type of the same width and
// delegates to kj::hex(). The returned buffer is sized at two hex digits per byte plus one
// extra char. These overloads are what expectResImpl() uses to print mismatching elements.
//
// TODO(cleanup): Should this go into string.h with the other definitions of hex()?
CappedArray<char, sizeof(char) * 2 + 1> hex(byte i) {
  return kj::hex(static_cast<uint8_t>(i));
}
CappedArray<char, sizeof(char) * 2 + 1> hex(char i) {
  return kj::hex(static_cast<uint8_t>(i));
}
CappedArray<char, sizeof(char16_t) * 2 + 1> hex(char16_t i) {
  return kj::hex(static_cast<uint16_t>(i));
}
CappedArray<char, sizeof(char32_t) * 2 + 1> hex(char32_t i) {
  return kj::hex(static_cast<uint32_t>(i));
}

37
template <typename T, typename U>
Kenton Varda's avatar
Kenton Varda committed
38
void expectResImpl(EncodingResult<T> result,
39 40
                   ArrayPtr<const U> expected,
                   bool errors = false) {
41 42 43 44 45 46 47 48 49 50 51 52
  if (errors) {
    KJ_EXPECT(result.hadErrors);
  } else {
    KJ_EXPECT(!result.hadErrors);
  }

  KJ_EXPECT(result.size() == expected.size(), result.size(), expected.size());
  for (auto i: kj::zeroTo(kj::min(result.size(), expected.size()))) {
    KJ_EXPECT(result[i] == expected[i], i, hex(result[i]), hex(expected[i]));
  }
}

53
template <typename T, typename U, size_t s>
54
void expectRes(EncodingResult<T> result,
55
               const U (&expected)[s],
56
               bool errors = false) {
Kenton Varda's avatar
Kenton Varda committed
57
  expectResImpl(kj::mv(result), arrayPtr(expected, s - 1), errors);
58 59 60 61 62 63
}

template <typename T, size_t s>
void expectRes(EncodingResult<T> result,
               byte (&expected)[s],
               bool errors = false) {
  // Checks `result` against a raw (non-const) byte array.
  //
  // Unlike the string-literal overload, all `s` elements are compared: a byte array carries no
  // trailing terminator to strip. `errors` states whether an error is expected to be flagged.
  expectResImpl(kj::mv(result), arrayPtr<const byte>(expected, s), errors);
}

KJ_TEST("encode UTF-8 to UTF-16") {
  // Well-formed input must convert without errors. The samples cover ASCII, Cyrillic, CJK, and a
  // mix of BMP symbols with emoji outside the BMP (which need surrogate pairs in UTF-16).
  expectRes(encodeUtf16(u8"foo"), u"foo");
  expectRes(encodeUtf16(u8"Здравствуйте"), u"Здравствуйте");
  expectRes(encodeUtf16(u8"中国网络"), u"中国网络");
  expectRes(encodeUtf16(u8"😺☁☄🐵"), u"😺☁☄🐵");
}

KJ_TEST("invalid UTF-8 to UTF-16") {
  // Every malformed sequence below is expected to set the error flag and to be replaced by
  // U+FFFD; the well-formed boundary cases mixed in must decode cleanly (errors == false).

  // Disembodied continuation byte.
  // (A run of consecutive stray continuation bytes is expected to yield a single U+FFFD.)
  expectRes(encodeUtf16("\x80"), u"\ufffd", true);
  expectRes(encodeUtf16("f\xbfo"), u"f\ufffdo", true);
  expectRes(encodeUtf16("f\xbf\x80\xb0o"), u"f\ufffdo", true);

  // Missing continuation bytes.
  // (The truncated sequence becomes one U+FFFD and the following 'x' is preserved.)
  expectRes(encodeUtf16("\xc2x"), u"\ufffdx", true);
  expectRes(encodeUtf16("\xe0x"), u"\ufffdx", true);
  expectRes(encodeUtf16("\xe0\xa0x"), u"\ufffdx", true);
  expectRes(encodeUtf16("\xf0x"), u"\ufffdx", true);
  expectRes(encodeUtf16("\xf0\x90x"), u"\ufffdx", true);
  expectRes(encodeUtf16("\xf0\x90\x80x"), u"\ufffdx", true);

  // Overlong sequences.
  // Two-byte forms: anything below U+0080 is overlong; U+0080 and U+07FF are the valid limits.
  expectRes(encodeUtf16("\xc0\x80"), u"\ufffd", true);
  expectRes(encodeUtf16("\xc1\xbf"), u"\ufffd", true);
  expectRes(encodeUtf16("\xc2\x80"), u"\u0080", false);
  expectRes(encodeUtf16("\xdf\xbf"), u"\u07ff", false);

  // Three-byte forms: anything below U+0800 is overlong; U+0800 is the smallest valid value.
  expectRes(encodeUtf16("\xe0\x80\x80"), u"\ufffd", true);
  expectRes(encodeUtf16("\xe0\x9f\xbf"), u"\ufffd", true);
  expectRes(encodeUtf16("\xe0\xa0\x80"), u"\u0800", false);
  expectRes(encodeUtf16("\xef\xbf\xbe"), u"\ufffe", false);

  // Due to a classic off-by-one error, GCC 4.x rather hilariously encodes '\uffff' as the
  // "surrogate pair" 0xd7ff, 0xdfff: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=41698
  if (kj::size(u"\uffff") == 2) {
    expectRes(encodeUtf16("\xef\xbf\xbf"), u"\uffff", false);
  }

  // Four-byte forms: anything below U+10000 is overlong; U+10000 and U+10FFFF are the limits.
  expectRes(encodeUtf16("\xf0\x80\x80\x80"), u"\ufffd", true);
  expectRes(encodeUtf16("\xf0\x8f\xbf\xbf"), u"\ufffd", true);
  expectRes(encodeUtf16("\xf0\x90\x80\x80"), u"\U00010000", false);
  expectRes(encodeUtf16("\xf4\x8f\xbf\xbf"), u"\U0010ffff", false);

  // Out of Unicode range.
  // (Values above U+10FFFF, 5- and 6-byte lead bytes, and the never-valid bytes 0xfe / 0xff.
  // Each whole sequence is expected to collapse into a single U+FFFD.)
  expectRes(encodeUtf16("\xf5\x80\x80\x80"), u"\ufffd", true);
  expectRes(encodeUtf16("\xf8\xbf\x80\x80\x80"), u"\ufffd", true);
  expectRes(encodeUtf16("\xfc\xbf\x80\x80\x80\x80"), u"\ufffd", true);
  expectRes(encodeUtf16("\xfe\xbf\x80\x80\x80\x80\x80"), u"\ufffd", true);
  expectRes(encodeUtf16("\xff\xbf\x80\x80\x80\x80\x80\x80"), u"\ufffd", true);
}

KJ_TEST("encode UTF-8 to UTF-32") {
  // Same well-formed samples as the UTF-16 test (ASCII, Cyrillic, CJK, symbols + emoji), this
  // time expecting one UTF-32 code unit per code point and no errors.
  expectRes(encodeUtf32(u8"foo"), U"foo");
  expectRes(encodeUtf32(u8"Здравствуйте"), U"Здравствуйте");
  expectRes(encodeUtf32(u8"中国网络"), U"中国网络");
  expectRes(encodeUtf32(u8"😺☁☄🐵"), U"😺☁☄🐵");
}

KJ_TEST("invalid UTF-8 to UTF-32") {
  // Mirrors the UTF-16 malformed-input test: each bad sequence must set the error flag and be
  // replaced by U+FFFD, while the valid boundary cases must decode cleanly.

  // Disembodied continuation byte.
  // (A run of consecutive stray continuation bytes is expected to yield a single U+FFFD.)
  expectRes(encodeUtf32("\x80"), U"\ufffd", true);
  expectRes(encodeUtf32("f\xbfo"), U"f\ufffdo", true);
  expectRes(encodeUtf32("f\xbf\x80\xb0o"), U"f\ufffdo", true);

  // Missing continuation bytes.
  // (The truncated sequence becomes one U+FFFD and the following 'x' is preserved.)
  expectRes(encodeUtf32("\xc2x"), U"\ufffdx", true);
  expectRes(encodeUtf32("\xe0x"), U"\ufffdx", true);
  expectRes(encodeUtf32("\xe0\xa0x"), U"\ufffdx", true);
  expectRes(encodeUtf32("\xf0x"), U"\ufffdx", true);
  expectRes(encodeUtf32("\xf0\x90x"), U"\ufffdx", true);
  expectRes(encodeUtf32("\xf0\x90\x80x"), U"\ufffdx", true);

  // Overlong sequences.
  // Two-byte forms: anything below U+0080 is overlong; U+0080 and U+07FF are the valid limits.
  expectRes(encodeUtf32("\xc0\x80"), U"\ufffd", true);
  expectRes(encodeUtf32("\xc1\xbf"), U"\ufffd", true);
  expectRes(encodeUtf32("\xc2\x80"), U"\u0080", false);
  expectRes(encodeUtf32("\xdf\xbf"), U"\u07ff", false);

  // Three-byte forms: anything below U+0800 is overlong; U+0800 and U+FFFF are the valid limits.
  expectRes(encodeUtf32("\xe0\x80\x80"), U"\ufffd", true);
  expectRes(encodeUtf32("\xe0\x9f\xbf"), U"\ufffd", true);
  expectRes(encodeUtf32("\xe0\xa0\x80"), U"\u0800", false);
  expectRes(encodeUtf32("\xef\xbf\xbf"), U"\uffff", false);

  // Four-byte forms: anything below U+10000 is overlong; U+10000 and U+10FFFF are the limits.
  expectRes(encodeUtf32("\xf0\x80\x80\x80"), U"\ufffd", true);
  expectRes(encodeUtf32("\xf0\x8f\xbf\xbf"), U"\ufffd", true);
  expectRes(encodeUtf32("\xf0\x90\x80\x80"), U"\U00010000", false);
  expectRes(encodeUtf32("\xf4\x8f\xbf\xbf"), U"\U0010ffff", false);

  // Out of Unicode range.
  // (Values above U+10FFFF, 5- and 6-byte lead bytes, and the never-valid bytes 0xfe / 0xff.
  // Each whole sequence is expected to collapse into a single U+FFFD.)
  expectRes(encodeUtf32("\xf5\x80\x80\x80"), U"\ufffd", true);
  expectRes(encodeUtf32("\xf8\xbf\x80\x80\x80"), U"\ufffd", true);
  expectRes(encodeUtf32("\xfc\xbf\x80\x80\x80\x80"), U"\ufffd", true);
  expectRes(encodeUtf32("\xfe\xbf\x80\x80\x80\x80\x80"), U"\ufffd", true);
  expectRes(encodeUtf32("\xff\xbf\x80\x80\x80\x80\x80\x80"), U"\ufffd", true);
}

KJ_TEST("decode UTF-16 to UTF-8") {
  // Reverse direction of the encode test: well-formed UTF-16 (including surrogate pairs for the
  // emoji) must produce the matching UTF-8 bytes without errors.
  expectRes(decodeUtf16(u"foo"), u8"foo");
  expectRes(decodeUtf16(u"Здравствуйте"), u8"Здравствуйте");
  expectRes(decodeUtf16(u"中国网络"), u8"中国网络");
  expectRes(decodeUtf16(u"😺☁☄🐵"), u8"😺☁☄🐵");
}

KJ_TEST("invalid UTF-16 to UTF-8") {
  // The inputs are spelled with \x escapes so that individual surrogate code units can appear in
  // the literal. Each unpaired surrogate is expected to become one U+FFFD, with errors flagged.

  // Surrogates in wrong order.
  // (0xdc00 is a low surrogate with no preceding high one, and 0xdfff is a second low surrogate;
  // the neighbours 0xd7ff and 0xe000 lie just outside the surrogate range and must pass through.)
  expectRes(decodeUtf16(u"\xd7ff\xdc00\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true);

  // Missing second surrogate.
  // (High surrogate at end of input, followed by a normal character, and followed by another
  // high surrogate.)
  expectRes(decodeUtf16(u"f\xd800"), u8"f\ufffd", true);
  expectRes(decodeUtf16(u"f\xd800x"), u8"f\ufffdx", true);
  expectRes(decodeUtf16(u"f\xd800\xd800x"), u8"f\ufffd\ufffdx", true);
}

KJ_TEST("decode UTF-32 to UTF-8") {
  // Well-formed UTF-32 must produce the matching UTF-8 bytes without errors; the samples are the
  // same as in the other round-trip tests.
  expectRes(decodeUtf32(U"foo"), u8"foo");
  expectRes(decodeUtf32(U"Здравствуйте"), u8"Здравствуйте");
  expectRes(decodeUtf32(U"中国网络"), u8"中国网络");
  expectRes(decodeUtf32(U"😺☁☄🐵"), u8"😺☁☄🐵");
}

KJ_TEST("invalid UTF-32 to UTF-8") {
  // In both cases each surrogate value is expected to be replaced by its own U+FFFD, while the
  // adjacent non-surrogate values 0xd7ff and 0xe000 pass through unchanged.

  // Surrogates rejected.
  expectRes(decodeUtf32(U"\xd7ff\xdc00\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true);

  // Even if it would be a valid surrogate pair in UTF-16.
  expectRes(decodeUtf32(U"\xd7ff\xd800\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true);
}

195 196 197 198
KJ_TEST("EncodingResult as a Maybe") {
  KJ_IF_MAYBE(result, encodeUtf16("\x80")) {
    KJ_FAIL_EXPECT("expected failure");
  }
199

200 201 202 203 204
  KJ_IF_MAYBE(result, encodeUtf16("foo")) {
    // good
  } else {
    KJ_FAIL_EXPECT("expected success");
  }
205

206
  KJ_EXPECT(KJ_ASSERT_NONNULL(decodeUtf16(u"foo")) == "foo");
207 208 209 210 211 212 213 214
}

// =======================================================================================

KJ_TEST("hex encoding/decoding") {
  byte bytes[] = {0x12, 0x34, 0xab, 0xf2};

  // Encoding emits lowercase hex digits.
  KJ_EXPECT(encodeHex(bytes) == "1234abf2");

  // Clean round trip.
  expectRes(decodeHex("1234abf2"), bytes);

  // Odd number of digits: the four complete bytes are still produced, the dangling digit is
  // not, and an error is flagged.
  expectRes(decodeHex("1234abf21"), bytes, true);

  // A non-hex character is expected to contribute a zero nibble and flag an error:
  // "ax" -> 0xa0 ...
  bytes[2] = 0xa0;
  expectRes(decodeHex("1234axf2"), bytes, true);

  // ... and "xb" -> 0x0b.
  bytes[2] = 0x0b;
  expectRes(decodeHex("1234xbf2"), bytes, true);
}

KJ_TEST("URI encoding/decoding") {
  // Encoding: plain letters pass through; space, non-ASCII bytes and an embedded NUL are
  // percent-escaped using uppercase hex digits.
  KJ_EXPECT(encodeUriComponent("foo") == "foo");
  KJ_EXPECT(encodeUriComponent("foo bar") == "foo%20bar");
  KJ_EXPECT(encodeUriComponent("\xab\xba") == "%AB%BA");
  KJ_EXPECT(encodeUriComponent(StringPtr("foo\0bar", 7)) == "foo%00bar");

  // Decoding accepts both lowercase and uppercase hex digits.
  expectRes(decodeUriComponent("foo%20bar"), "foo bar");
  expectRes(decodeUriComponent("%ab%BA"), "\xab\xba");

  // Malformed escapes flag an error. A '%' followed by a single hex digit decodes to that
  // digit's value; a '%' followed by no hex digits is dropped from the output.
  expectRes(decodeUriComponent("foo%1xxx"), "foo\1xxx", true);
  expectRes(decodeUriComponent("foo%1"), "foo\1", true);
  expectRes(decodeUriComponent("foo%xxx"), "fooxxx", true);
  expectRes(decodeUriComponent("foo%"), "foo", true);

  // Binary round trip: raw bytes in, the same raw bytes back out.
  byte bytes[] = {12, 34, 56};
  KJ_EXPECT(decodeBinaryUriComponent(encodeUriComponent(bytes)).asPtr() == bytes);
}

245 246 247 248 249 250
KJ_TEST("C escape encoding/decoding") {
  KJ_EXPECT(encodeCEscape("fooo\a\b\f\n\r\t\v\'\"\\bar") ==
      "fooo\\a\\b\\f\\n\\r\\t\\v\\\'\\\"\\\\bar");
  KJ_EXPECT(encodeCEscape("foo\x01\x7fxxx") ==
      "foo\\001\\177xxx");

251
  expectRes(decodeCEscape("fooo\\a\\b\\f\\n\\r\\t\\v\\\'\\\"\\\\bar"),
252
      "fooo\a\b\f\n\r\t\v\'\"\\bar");
253 254 255 256
  expectRes(decodeCEscape("foo\\x01\\x7fxxx"), "foo\x01\x7fxxx");
  expectRes(decodeCEscape("foo\\001\\177234"), "foo\001\177234");
  expectRes(decodeCEscape("foo\\x1"), "foo\x1");
  expectRes(decodeCEscape("foo\\1"), "foo\1");
257

258 259
  expectRes(decodeCEscape("foo\\u1234bar"), u8"foo\u1234bar");
  expectRes(decodeCEscape("foo\\U00045678bar"), u8"foo\U00045678bar");
260 261

  // Error cases.
262 263 264 265 266 267
  expectRes(decodeCEscape("foo\\"), "foo", true);
  expectRes(decodeCEscape("foo\\x123x"), u8"foo\x23x", true);
  expectRes(decodeCEscape("foo\\u12"), u8"foo\u0012", true);
  expectRes(decodeCEscape("foo\\u12xxx"), u8"foo\u0012xxx", true);
  expectRes(decodeCEscape("foo\\U12"), u8"foo\u0012", true);
  expectRes(decodeCEscape("foo\\U12xxxxxxxx"), u8"foo\u0012xxxxxxxx", true);
268 269
}

270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322
KJ_TEST("base64 encoding/decoding") {
  {
    auto encoded = encodeBase64(StringPtr("foo").asBytes(), false);
    KJ_EXPECT(encoded == "Zm9v", encoded, encoded.size());
    KJ_EXPECT(heapString(decodeBase64(encoded.asArray()).asChars()) == "foo");
  }

  {
    auto encoded = encodeBase64(StringPtr("corge").asBytes(), false);
    KJ_EXPECT(encoded == "Y29yZ2U=", encoded);
    KJ_EXPECT(heapString(decodeBase64(encoded.asArray()).asChars()) == "corge");
  }

  KJ_EXPECT(heapString(decodeBase64("Y29yZ2U").asChars()) == "corge");
  KJ_EXPECT(heapString(decodeBase64("Y\n29y Z@2U=\n").asChars()) == "corge");

  {
    auto encoded = encodeBase64(StringPtr("corge").asBytes(), true);
    KJ_EXPECT(encoded == "Y29yZ2U=\n", encoded);
  }

  StringPtr fullLine = "012345678901234567890123456789012345678901234567890123";
  {
    auto encoded = encodeBase64(fullLine.asBytes(), false);
    KJ_EXPECT(
        encoded == "MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIz",
        encoded);
  }
  {
    auto encoded = encodeBase64(fullLine.asBytes(), true);
    KJ_EXPECT(
        encoded == "MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIz\n",
        encoded);
  }

  String multiLine = str(fullLine, "456");
  {
    auto encoded = encodeBase64(multiLine.asBytes(), false);
    KJ_EXPECT(
        encoded == "MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2",
        encoded);
  }
  {
    auto encoded = encodeBase64(multiLine.asBytes(), true);
    KJ_EXPECT(
        encoded == "MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIz\n"
                   "NDU2\n",
        encoded);
  }
}

}  // namespace
}  // namespace kj