// Copyright (c) 2017 Cloudflare, Inc. and contributors
// Licensed under the MIT License:
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.

#pragma once
// Functions for encoding/decoding bytes and text in common formats, including:
// - UTF-{8,16,32}
// - Hex
// - URI encoding
// - Base64

// Treat this file as a system header under GCC-compatible compilers so that warnings originating
// here are suppressed, unless KJ_HEADER_WARNINGS is set to a non-zero value.
#if defined(__GNUC__) && !KJ_HEADER_WARNINGS
#pragma GCC system_header
#endif

#include "string.h"

namespace kj {

template <typename ResultType>
struct EncodingResult: public ResultType {
  // Equivalent to ResultType (a String or wide-char array) for all intents and purposes, except
40 41 42 43 44 45 46 47 48
  // that the bool `hadErrors` can be inspected to see if any errors were encountered in the input.
  // Each encoding/decoding function that returns this type will "work around" errors in some way,
  // so an application doesn't strictly have to check for errors. E.g. the Unicode functions
  // replace errors with U+FFFD in the output.
  // Through magic, KJ_IF_MAYBE() and KJ_{REQUIRE,ASSERT}_NONNULL() work on EncodingResult<T>
  // exactly if it were a Maybe<T> that is null in case of errors.

  inline EncodingResult(ResultType&& result, bool hadErrors)
49 50 51 52 53
      : ResultType(kj::mv(result)), hadErrors(hadErrors) {}

  const bool hadErrors;

54 55 56 57 58 59
template <typename T>
inline auto KJ_STRINGIFY(const EncodingResult<T>& value)
    -> decltype(toCharSequence(implicitCast<const T&>(value))) {
  return toCharSequence(implicitCast<const T&>(value));

EncodingResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false);
// Convert UTF-8 text (which KJ strings use) to UTF-16 or UTF-32.
//
// If `nulTerminate` is true, an extra NUL character will be added to the end of the output.
//
// The returned arrays are in platform-native endianness (otherwise they wouldn't really be
// char16_t / char32_t).
//
// Note that the KJ Unicode encoding and decoding functions actually implement
// [WTF-8 encoding](http://simonsapin.github.io/wtf-8/), which affects how invalid input is
// handled. See comments on decodeUtf16() for more info.

EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16);
EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32);
// Convert UTF-16 or UTF-32 to UTF-8 (which KJ strings use).
//
// The input should NOT include a NUL terminator; any NUL characters in the input array will be
// preserved in the output.
//
// The input must be in platform-native endianness. BOMs are NOT recognized by these functions.
//
// Note that the KJ Unicode encoding and decoding functions actually implement
// [WTF-8 encoding](http://simonsapin.github.io/wtf-8/). This means that if you start with an array
// of char16_t and you pass it through any number of conversions to other Unicode encodings,
// eventually returning it to UTF-16, all the while ignoring `hadErrors`, you will end up with
// exactly the same char16_t array you started with, *even if* the array is not valid UTF-16. This
// is useful because many real-world systems that were designed for UCS-2 (plain 16-bit Unicode)
// and later "upgraded" to UTF-16 do not enforce that their UTF-16 is well-formed. For example,
// file names on Windows NT are encoded using 16-bit characters, without enforcing that the
// character sequence is valid UTF-16. It is important that programs on Windows be able to handle
// such filenames, even if they choose to convert the name to UTF-8 for internal processing.
//
// Specifically, KJ's Unicode handling allows unpaired surrogate code points to round-trip through
// UTF-8 and UTF-32. Unpaired surrogates will be flagged as an error (setting `hadErrors` in the
// result), but will NOT be replaced with the Unicode replacement character as other erroneous
// sequences would be, but rather encoded as an invalid surrogate codepoint in the target encoding.
//
// KJ makes the following guarantees about invalid input:
// - A round trip from UTF-16 to other encodings and back will produce exactly the original input,
//   with every leg of the trip raising the `hadErrors` flag if the original input was not valid.
// - A round trip from UTF-8 or UTF-32 to other encodings and back will either produce exactly
//   the original input, or will have replaced some invalid sequences with the Unicode replacement
//   character, U+FFFD. No code units will ever be removed unless they are replaced with U+FFFD,
//   and no code units will ever be added except to encode U+FFFD. If the original input was not
//   valid, the `hadErrors` flag will be raised on the first leg of the trip, and will also be
//   raised on subsequent legs unless all invalid sequences were replaced with U+FFFD (which, after
//   all, is a valid code point).

EncodingResult<Array<wchar_t>> encodeWideString(
    ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<String> decodeWideString(ArrayPtr<const wchar_t> wide);
// Encode / decode strings of wchar_t, aka "wide strings". Unfortunately, different platforms have
// different definitions for wchar_t. For example, on Windows they are 16-bit and encode UTF-16,
// but on Linux they are 32-bit and encode UTF-32. Some platforms even define wchar_t as 8-bit,
// encoding UTF-8 (e.g. BeOS did this).
//
// KJ assumes that wide strings use the UTF encoding that corresponds to the size of wchar_t on
// the target platform. So, these functions are simple aliases for encodeUtf*/decodeUtf*, above
// (or simply make a copy if wchar_t is 8 bits).

String encodeHex(ArrayPtr<const byte> bytes);
EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text);
// Encode/decode bytes as hex strings. decodeHex() reports problems with its input through the
// `hadErrors` flag of the returned EncodingResult.

String encodeUriComponent(ArrayPtr<const byte> bytes);
String encodeUriComponent(ArrayPtr<const char> bytes);
EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text);
// Encode/decode URI components using % escapes for characters listed as "reserved" in RFC 2396.
// This is the same behavior as JavaScript's `encodeURIComponent()`.
//
// See https://tools.ietf.org/html/rfc2396#section-2.3

String encodeUriFragment(ArrayPtr<const byte> bytes);
String encodeUriFragment(ArrayPtr<const char> bytes);
// Encode URL fragment components using the fragment percent encode set defined by the WHATWG URL
// specification. Use decodeUriComponent() to decode.
//
// Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent-
//   decoded data. In other words, this function is not idempotent, in contrast to the URL spec.
//
// See https://url.spec.whatwg.org/#fragment-percent-encode-set

String encodeUriPath(ArrayPtr<const byte> bytes);
String encodeUriPath(ArrayPtr<const char> bytes);
// Encode URL path components (not entire paths!) using the path percent encode set defined by the
// WHATWG URL specification. Use decodeUriComponent() to decode.
//
// Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent-
//   decoded data. In other words, this function is not idempotent, in contrast to the URL spec.
//
// Quirk: This percent-encodes '/' and '\' characters as well, which are not actually in the set
//   defined by the WHATWG URL spec. Since a conforming URL implementation will only ever call this
//   function on individual path components, and never entire paths, augmenting the character set to
//   include these separators allows this function to be used to implement a URL class that stores
//   its path components in percent-decoded form.
//
// See https://url.spec.whatwg.org/#path-percent-encode-set

String encodeUriUserInfo(ArrayPtr<const byte> bytes);
String encodeUriUserInfo(ArrayPtr<const char> bytes);
// Encode URL userinfo components using the userinfo percent encode set defined by the WHATWG URL
// specification. Use decodeUriComponent() to decode.
//
// Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent-
//   decoded data. In other words, this function is not idempotent, in contrast to the URL spec.
//
// See https://url.spec.whatwg.org/#userinfo-percent-encode-set

String encodeWwwForm(ArrayPtr<const byte> bytes);
String encodeWwwForm(ArrayPtr<const char> bytes);
EncodingResult<String> decodeWwwForm(ArrayPtr<const char> text);
// Encode/decode URI components using % escapes and '+' (for spaces) according to the
// application/x-www-form-urlencoded format defined by the WHATWG URL specification.
//
// Note: Like the fragment, path, and userinfo percent-encoding functions above, this function is
//   not idempotent: we percent-encode '%' signs. However, in this particular case the spec happens
//   to agree with us!
//
// See https://url.spec.whatwg.org/#concept-urlencoded-byte-serializer

struct DecodeUriOptions {
  // Parameter to `decodeBinaryUriComponent()`.

  // This struct is intentionally convertible from bool, in order to maintain backwards
  // compatibility with code written when `decodeBinaryUriComponent()` took a boolean second
  // parameter. Both options default to false.
  DecodeUriOptions(bool nulTerminate = false, bool plusToSpace = false)
      : nulTerminate(nulTerminate), plusToSpace(plusToSpace) {}

  bool nulTerminate;
  // Append a terminal NUL byte.

  bool plusToSpace;
  // Convert '+' to ' ' characters before percent decoding. Used to decode
  // application/x-www-form-urlencoded text, such as query strings.
};
EncodingResult<Array<byte>> decodeBinaryUriComponent(
    ArrayPtr<const char> text, DecodeUriOptions options = DecodeUriOptions());
// Decode URI components using % escapes. This is a lower-level interface used to implement both
// `decodeUriComponent()` and `decodeWwwForm()`.
//
// When `options` is omitted, a default-constructed DecodeUriOptions is used, i.e. no NUL
// terminator is appended and '+' is not converted to ' '.

String encodeCEscape(ArrayPtr<const byte> bytes);
String encodeCEscape(ArrayPtr<const char> bytes);
EncodingResult<Array<byte>> decodeBinaryCEscape(
    ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<String> decodeCEscape(ArrayPtr<const char> text);
// Encode/decode bytes or text using (presumably) C-style backslash escape sequences.
// NOTE(review): the exact set of escapes handled is defined by the implementation, which is not
//   visible in this header -- confirm before relying on specifics.
//
// decodeBinaryCEscape() yields raw bytes, optionally followed by a NUL terminator, while
// decodeCEscape() yields a String. Both report problems through `hadErrors`.

208 209 210 211
String encodeBase64(ArrayPtr<const byte> bytes, bool breakLines = false);
// Encode the given bytes as base64 text. If `breakLines` is true, line breaks will be inserted
// into the output every 72 characters (e.g. for encoding e-mail bodies).

212 213 214
EncodingResult<Array<byte>> decodeBase64(ArrayPtr<const char> text);
// Decode base64 text. This function reports errors required by the WHATWG HTML/Infra specs: see
// https://html.spec.whatwg.org/multipage/webappapis.html#atob for details.

216 217 218
String encodeBase64Url(ArrayPtr<const byte> bytes);
// Encode the given bytes as URL-safe base64 text. (RFC 4648, section 5)

// =======================================================================================
// inline implementation details

222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
namespace _ {  // private

template <typename T>
NullableValue<T> readMaybe(EncodingResult<T>&& value) {
  if (value.hadErrors) {
    return nullptr;
  } else {
    return kj::mv(value);

template <typename T>
T* readMaybe(EncodingResult<T>& value) {
  if (value.hadErrors) {
    return nullptr;
  } else {
    return &value;

template <typename T>
const T* readMaybe(const EncodingResult<T>& value) {
  if (value.hadErrors) {
    return nullptr;
  } else {
    return &value;

}  // namespace _ (private)

253 254 255
inline String encodeUriComponent(ArrayPtr<const char> text) {
  return encodeUriComponent(text.asBytes());
inline EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text) {
  // Decode to NUL-terminated bytes, then re-wrap the buffer as a String while carrying over the
  // error flag. (Presumably the terminator is what allows the buffer to be adopted by String.)
  auto result = decodeBinaryUriComponent(text, DecodeUriOptions { /*.nulTerminate=*/true });
  return { String(result.releaseAsChars()), result.hadErrors };
}

261 262 263 264 265 266 267 268 269 270
inline String encodeUriFragment(ArrayPtr<const char> text) {
  return encodeUriFragment(text.asBytes());
inline String encodeUriPath(ArrayPtr<const char> text) {
  // Text overload: view the characters as bytes and forward to the byte-array overload.
  return encodeUriPath(text.asBytes());
}
inline String encodeUriUserInfo(ArrayPtr<const char> text) {
  // Text overload: view the characters as bytes and forward to the byte-array overload.
  return encodeUriUserInfo(text.asBytes());
}

271 272 273 274 275 276
inline String encodeWwwForm(ArrayPtr<const char> text) {
  return encodeWwwForm(text.asBytes());
inline EncodingResult<String> decodeWwwForm(ArrayPtr<const char> text) {
  // Same as decodeUriComponent(), except that '+' is additionally converted to ' ' as required
  // for application/x-www-form-urlencoded text.
  auto result = decodeBinaryUriComponent(text, DecodeUriOptions { /*.nulTerminate=*/true,
                                                                  /*.plusToSpace=*/true });
  return { String(result.releaseAsChars()), result.hadErrors };
}

280 281 282
inline String encodeCEscape(ArrayPtr<const char> text) {
  return encodeCEscape(text.asBytes());
inline EncodingResult<String> decodeCEscape(ArrayPtr<const char> text) {
  // Decode to NUL-terminated bytes (second argument is `nulTerminate`), then re-wrap the buffer
  // as a String while carrying over the error flag.
  auto result = decodeBinaryCEscape(text, true);
  return { String(result.releaseAsChars()), result.hadErrors };
}

// If you pass a string literal to a function taking ArrayPtr<const char>, it'll include the NUL
// terminator, which is surprising. Let's add overloads that avoid that. In practice this probably
// only even matters for encoding-test.c++.

template <size_t s>
inline EncodingResult<Array<char16_t>> encodeUtf16(const char (&text)[s], bool nulTerminate=false) {
  // String-literal overload: `s` counts the literal's trailing NUL, so drop it before forwarding.
  return encodeUtf16(arrayPtr(text, s - 1), nulTerminate);
}
template <size_t s>
inline EncodingResult<Array<char32_t>> encodeUtf32(const char (&text)[s], bool nulTerminate=false) {
  // String-literal overload: `s` counts the literal's trailing NUL, so drop it before forwarding.
  return encodeUtf32(arrayPtr(text, s - 1), nulTerminate);
}
template <size_t s>
301 302 303 304 305
inline EncodingResult<Array<wchar_t>> encodeWideString(
    const char (&text)[s], bool nulTerminate=false) {
  return encodeWideString(arrayPtr(text, s - 1), nulTerminate);
template <size_t s>
inline EncodingResult<String> decodeUtf16(const char16_t (&utf16)[s]) {
  // Literal overload (e.g. u"..."): `s` counts the trailing NUL, so drop it before forwarding.
  return decodeUtf16(arrayPtr(utf16, s - 1));
}
template <size_t s>
inline EncodingResult<String> decodeUtf32(const char32_t (&utf32)[s]) {
  // Literal overload (e.g. U"..."): `s` counts the trailing NUL, so drop it before forwarding.
  return decodeUtf32(arrayPtr(utf32, s - 1));
}
template <size_t s>
314 315 316 317
inline EncodingResult<String> decodeWideString(const wchar_t (&utf32)[s]) {
  return decodeWideString(arrayPtr(utf32, s - 1));
template <size_t s>
inline EncodingResult<Array<byte>> decodeHex(const char (&text)[s]) {
  // String-literal overload: `s` counts the literal's trailing NUL, so drop it before forwarding.
  return decodeHex(arrayPtr(text, s - 1));
}
template <size_t s>
inline String encodeUriComponent(const char (&text)[s]) {
  // String-literal overload: `s` counts the literal's trailing NUL, so drop it before forwarding.
  return encodeUriComponent(arrayPtr(text, s - 1));
}
template <size_t s>
inline Array<byte> decodeBinaryUriComponent(const char (&text)[s]) {
  return decodeBinaryUriComponent(arrayPtr(text, s - 1));
template <size_t s>
330 331
inline EncodingResult<String> decodeUriComponent(const char (&text)[s]) {
  return decodeUriComponent(arrayPtr(text, s-1));
332 333
template <size_t s>
334 335 336 337 338 339 340 341 342 343 344 345
inline String encodeUriFragment(const char (&text)[s]) {
  return encodeUriFragment(arrayPtr(text, s - 1));
template <size_t s>
inline String encodeUriPath(const char (&text)[s]) {
  // String-literal overload: `s` counts the literal's trailing NUL, so drop it before forwarding.
  return encodeUriPath(arrayPtr(text, s - 1));
}
template <size_t s>
inline String encodeUriUserInfo(const char (&text)[s]) {
  // String-literal overload: `s` counts the literal's trailing NUL, so drop it before forwarding.
  return encodeUriUserInfo(arrayPtr(text, s - 1));
}
template <size_t s>
346 347 348 349 350 351 352 353
inline String encodeWwwForm(const char (&text)[s]) {
  return encodeWwwForm(arrayPtr(text, s - 1));
template <size_t s>
inline EncodingResult<String> decodeWwwForm(const char (&text)[s]) {
  // String-literal overload: `s` counts the literal's trailing NUL, so drop it before forwarding.
  return decodeWwwForm(arrayPtr(text, s-1));
}
template <size_t s>
354 355 356 357
inline String encodeCEscape(const char (&text)[s]) {
  return encodeCEscape(arrayPtr(text, s - 1));
template <size_t s>
inline EncodingResult<Array<byte>> decodeBinaryCEscape(const char (&text)[s]) {
  // String-literal overload: `s` counts the literal's trailing NUL, so drop it before forwarding.
  // The output is not NUL-terminated (the forwarded call uses the default `nulTerminate`).
  return decodeBinaryCEscape(arrayPtr(text, s - 1));
}
template <size_t s>
362 363
inline EncodingResult<String> decodeCEscape(const char (&text)[s]) {
  return decodeCEscape(arrayPtr(text, s-1));
364 365
template <size_t s>
inline EncodingResult<Array<byte>> decodeBase64(const char (&text)[s]) {
  // String-literal overload: `s` counts the literal's trailing NUL, so drop it before forwarding.
  // Marked `inline` for consistency with the other literal overloads in this header.
  return decodeBase64(arrayPtr(text, s - 1));
}

} // namespace kj