encoding.h 11.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
// Copyright (c) 2017 Cloudflare, Inc. and contributors
// Licensed under the MIT License:
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

22
#pragma once
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
// Functions for encoding/decoding bytes and text in common formats, including:
// - UTF-{8,16,32}
// - Hex
// - URI encoding
// - C-style escape sequences
// - Base64

#if defined(__GNUC__) && !KJ_HEADER_WARNINGS
#pragma GCC system_header
#endif

#include "string.h"

namespace kj {

template <typename ResultType>
38
struct EncodingResult: public ResultType {
39
  // Equivalent to ResultType (a String or wide-char array) for all intents and purposes, except
40 41 42 43 44 45 46 47 48
  // that the bool `hadErrors` can be inspected to see if any errors were encountered in the input.
  // Each encoding/decoding function that returns this type will "work around" errors in some way,
  // so an application doesn't strictly have to check for errors. E.g. the Unicode functions
  // replace errors with U+FFFD in the output.
  //
  // Through magic, KJ_IF_MAYBE() and KJ_{REQUIRE,ASSERT}_NONNULL() work on EncodingResult<T>
  // exactly if it were a Maybe<T> that is null in case of errors.

  inline EncodingResult(ResultType&& result, bool hadErrors)
49 50 51 52 53
      : ResultType(kj::mv(result)), hadErrors(hadErrors) {}

  const bool hadErrors;
};

54 55 56 57 58 59
template <typename T>
inline auto KJ_STRINGIFY(const EncodingResult<T>& value)
    -> decltype(toCharSequence(implicitCast<const T&>(value))) {
  return toCharSequence(implicitCast<const T&>(value));
}

60 61
EncodingResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false);
// Convert UTF-8 text (which KJ strings use) to UTF-16 or UTF-32.
//
// If `nulTerminate` is true, an extra NUL character will be added to the end of the output.
//
// The returned arrays are in platform-native endianness (otherwise they wouldn't really be
// char16_t / char32_t).
//
// Errors in the input are reported through `hadErrors` on the returned EncodingResult.
//
// Note that the KJ Unicode encoding and decoding functions actually implement
// [WTF-8 encoding](http://simonsapin.github.io/wtf-8/), which affects how invalid input is
// handled. See comments on decodeUtf16() for more info.
72

73 74
EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16);
EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32);
// Convert UTF-16 or UTF-32 to UTF-8 (which KJ strings use).
//
// The input should NOT include a NUL terminator; any NUL characters in the input array will be
// preserved in the output.
//
// The input must be in platform-native endianness. BOMs are NOT recognized by these functions.
//
// Note that the KJ Unicode encoding and decoding functions actually implement
// [WTF-8 encoding](http://simonsapin.github.io/wtf-8/). This means that if you start with an array
// of char16_t and you pass it through any number of conversions to other Unicode encodings,
// eventually returning it to UTF-16, all the while ignoring `hadErrors`, you will end up with
// exactly the same char16_t array you started with, *even if* the array is not valid UTF-16. This
// is useful because many real-world systems that were designed for UCS-2 (plain 16-bit Unicode)
// and later "upgraded" to UTF-16 do not enforce that their UTF-16 is well-formed. For example,
// file names on Windows NT are encoded using 16-bit characters, without enforcing that the
// character sequence is valid UTF-16. It is important that programs on Windows be able to handle
// such filenames, even if they choose to convert the name to UTF-8 for internal processing.
//
// Specifically, KJ's Unicode handling allows unpaired surrogate code points to round-trip through
// UTF-8 and UTF-32. Unpaired surrogates will be flagged as an error (setting `hadErrors` in the
// result), but will NOT be replaced with the Unicode replacement character as other erroneous
// sequences would be; instead they are encoded as an invalid surrogate codepoint in the target
// encoding.
//
// KJ makes the following guarantees about invalid input:
// - A round trip from UTF-16 to other encodings and back will produce exactly the original input,
//   with every leg of the trip raising the `hadErrors` flag if the original input was not valid.
// - A round trip from UTF-8 or UTF-32 to other encodings and back will either produce exactly
//   the original input, or will have replaced some invalid sequences with the Unicode replacement
//   character, U+FFFD. No code units will ever be removed unless they are replaced with U+FFFD,
//   and no code units will ever be added except to encode U+FFFD. If the original input was not
//   valid, the `hadErrors` flag will be raised on the first leg of the trip, and will also be
//   raised on subsequent legs unless all invalid sequences were replaced with U+FFFD (which, after
//   all, is a valid code point).
108

109 110 111 112 113 114 115 116 117 118 119
EncodingResult<Array<wchar_t>> encodeWideString(
    ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<String> decodeWideString(ArrayPtr<const wchar_t> wide);
// Encode / decode strings of wchar_t, aka "wide strings". Unfortunately, different platforms have
// different definitions for wchar_t. For example, on Windows they are 16-bit and encode UTF-16,
// but on Linux they are 32-bit and encode UTF-32. Some platforms even define wchar_t as 8-bit,
// encoding UTF-8 (e.g. BeOS did this).
//
// KJ assumes that wide strings use the UTF encoding that corresponds to the size of wchar_t on
// the target platform. So, these functions are simple aliases for encodeUtf*/decodeUtf*, above
// (or simply make a copy if wchar_t is 8 bits).
//
// `nulTerminate` is described on encodeUtf16()/encodeUtf32(). As with those functions, input
// errors are reported through `hadErrors` on the returned EncodingResult.
120 121

String encodeHex(ArrayPtr<const byte> bytes);
122
EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text);
123 124 125 126
// Encode/decode bytes as hex strings.

String encodeUriComponent(ArrayPtr<const byte> bytes);
String encodeUriComponent(ArrayPtr<const char> bytes);
EncodingResult<Array<byte>> decodeBinaryUriComponent(
    ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text);
// Encode/decode URI components using % escapes. See Javascript's encodeURIComponent().
//
// The `char` flavors are thin inline wrappers around the `byte` flavors: encodeUriComponent()
// forwards the text's bytes, and decodeUriComponent() wraps the NUL-terminated output of
// decodeBinaryUriComponent() in a String. Decoding problems are reported through `hadErrors` on
// the returned EncodingResult.

132 133
String encodeCEscape(ArrayPtr<const byte> bytes);
String encodeCEscape(ArrayPtr<const char> bytes);
EncodingResult<Array<byte>> decodeBinaryCEscape(
    ArrayPtr<const char> text, bool nulTerminate = false);
EncodingResult<String> decodeCEscape(ArrayPtr<const char> text);
// Encode/decode bytes or text using C-style escape sequences. (NOTE(review): the exact set of
// escapes produced and accepted is defined by the out-of-line implementation -- confirm there.)
//
// The `char` flavors are thin inline wrappers around the `byte` flavors: encodeCEscape() forwards
// the text's bytes, and decodeCEscape() wraps the NUL-terminated output of decodeBinaryCEscape()
// in a String. Decoding problems are reported through `hadErrors` on the returned EncodingResult.
137

138 139 140 141
String encodeBase64(ArrayPtr<const byte> bytes, bool breakLines = false);
// Encode the given bytes as base64 text and return it as a String. If `breakLines` is true, line
// breaks will be inserted into the output every 72 characters (e.g. for encoding e-mail bodies).
// By default (`breakLines` false) no line breaks are requested.

142 143 144
EncodingResult<Array<byte>> decodeBase64(ArrayPtr<const char> text);
// Decode base64 text. This function reports errors required by the WHATWG HTML/Infra specs: see
// https://html.spec.whatwg.org/multipage/webappapis.html#atob for details. Errors are surfaced
// through `hadErrors` on the returned EncodingResult.
145 146 147 148

// =======================================================================================
// inline implementation details

149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
namespace _ {  // private

template <typename T>
NullableValue<T> readMaybe(EncodingResult<T>&& value) {
  // Rvalue flavor: hands the result over by move when the conversion was clean, or yields a null
  // NullableValue when it flagged errors. (Presumably one of the hooks that lets KJ_IF_MAYBE()
  // etc. treat an EncodingResult<T> like a Maybe<T>.)
  if (!value.hadErrors) {
    return kj::mv(value);
  }
  return nullptr;
}

template <typename T>
T* readMaybe(EncodingResult<T>& value) {
  // Mutable-lvalue flavor: a pointer to the underlying T when the conversion was clean, or null
  // when it flagged errors.
  T* underlying = &value;
  return value.hadErrors ? nullptr : underlying;
}

template <typename T>
const T* readMaybe(const EncodingResult<T>& value) {
  // Const-lvalue flavor: a pointer to the underlying T when the conversion was clean, or null
  // when it flagged errors.
  const T* underlying = &value;
  return value.hadErrors ? nullptr : underlying;
}

}  // namespace _ (private)

180 181 182
inline String encodeUriComponent(ArrayPtr<const char> text) {
  // Text flavor: reinterpret the characters as bytes and defer to the byte overload.
  auto rawBytes = text.asBytes();
  return encodeUriComponent(rawBytes);
}
183 184 185
inline EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text) {
  // Decode to bytes with a trailing NUL requested, then adopt the buffer as a String while
  // carrying the error flag across unchanged.
  auto decoded = decodeBinaryUriComponent(text, true);
  String asText(decoded.releaseAsChars());
  return EncodingResult<String>(kj::mv(asText), decoded.hadErrors);
}

188 189 190
inline String encodeCEscape(ArrayPtr<const char> text) {
  // Text flavor: reinterpret the characters as bytes and defer to the byte overload.
  auto rawBytes = text.asBytes();
  return encodeCEscape(rawBytes);
}
191
inline EncodingResult<String> decodeCEscape(ArrayPtr<const char> text) {
  // Decode to bytes with a trailing NUL requested, then adopt the buffer as a String while
  // carrying the error flag across unchanged.
  auto decoded = decodeBinaryCEscape(text, true);
  String asText(decoded.releaseAsChars());
  return EncodingResult<String>(kj::mv(asText), decoded.hadErrors);
}

196 197 198 199 200
// If you pass a string literal to a function taking ArrayPtr<const char>, it'll include the NUL
// terminator, which is surprising. Let's add overloads that avoid that. In practice this probably
// only ever matters for encoding-test.c++.

template <size_t s>
201
inline EncodingResult<Array<char16_t>> encodeUtf16(const char (&text)[s], bool nulTerminate=false) {
202 203 204
  return encodeUtf16(arrayPtr(text, s - 1), nulTerminate);
}
template <size_t s>
205
inline EncodingResult<Array<char32_t>> encodeUtf32(const char (&text)[s], bool nulTerminate=false) {
206 207 208
  return encodeUtf32(arrayPtr(text, s - 1), nulTerminate);
}
template <size_t s>
209 210 211 212 213
inline EncodingResult<Array<wchar_t>> encodeWideString(
    const char (&text)[s], bool nulTerminate=false) {
  return encodeWideString(arrayPtr(text, s - 1), nulTerminate);
}
template <size_t s>
214
inline EncodingResult<String> decodeUtf16(const char16_t (&utf16)[s]) {
215 216 217
  return decodeUtf16(arrayPtr(utf16, s - 1));
}
template <size_t s>
218
inline EncodingResult<String> decodeUtf32(const char32_t (&utf32)[s]) {
219 220 221
  return decodeUtf32(arrayPtr(utf32, s - 1));
}
template <size_t s>
222 223 224 225
inline EncodingResult<String> decodeWideString(const wchar_t (&utf32)[s]) {
  return decodeWideString(arrayPtr(utf32, s - 1));
}
template <size_t s>
226
inline EncodingResult<Array<byte>> decodeHex(const char (&text)[s]) {
227 228 229 230 231 232 233 234 235 236 237
  return decodeHex(arrayPtr(text, s - 1));
}
template <size_t s>
inline String encodeUriComponent(const char (&text)[s]) {
  // Array overload (intended for string literals): `s` counts the trailing NUL, so only the
  // first s - 1 characters are encoded.
  auto withoutNul = arrayPtr(text, s - 1);
  return encodeUriComponent(withoutNul);
}
template <size_t s>
inline Array<byte> decodeBinaryUriComponent(const char (&text)[s]) {
  return decodeBinaryUriComponent(arrayPtr(text, s - 1));
}
template <size_t s>
238 239
inline EncodingResult<String> decodeUriComponent(const char (&text)[s]) {
  return decodeUriComponent(arrayPtr(text, s-1));
240 241
}
template <size_t s>
242 243 244 245
inline String encodeCEscape(const char (&text)[s]) {
  return encodeCEscape(arrayPtr(text, s - 1));
}
template <size_t s>
246
inline EncodingResult<Array<byte>> decodeBinaryCEscape(const char (&text)[s]) {
247 248 249
  return decodeBinaryCEscape(arrayPtr(text, s - 1));
}
template <size_t s>
250 251
inline EncodingResult<String> decodeCEscape(const char (&text)[s]) {
  return decodeCEscape(arrayPtr(text, s-1));
252 253
}
template <size_t s>
254
EncodingResult<Array<byte>> decodeBase64(const char (&text)[s]) {
255 256 257 258
  return decodeBase64(arrayPtr(text, s - 1));
}

} // namespace kj