Commit f74555b4 authored by Kenton Varda's avatar Kenton Varda

Add KJ utility functions to encode/decode blobs in common formats.

In particular: UTF-{8,16,32}, Hex, URI encoding, and Base64
parent 97aae1bb
......@@ -128,6 +128,7 @@ includekj_HEADERS = \
src/kj/vector.h \
src/kj/string.h \
src/kj/string-tree.h \
src/kj/encoding.h \
src/kj/exception.h \
src/kj/debug.h \
src/kj/arena.h \
......@@ -218,6 +219,7 @@ libkj_la_SOURCES= \
src/kj/array.c++ \
src/kj/string.c++ \
src/kj/string-tree.c++ \
src/kj/encoding.c++ \
src/kj/exception.c++ \
src/kj/debug.c++ \
src/kj/arena.c++ \
......@@ -451,6 +453,7 @@ capnp_test_SOURCES = \
src/kj/array-test.c++ \
src/kj/string-test.c++ \
src/kj/string-tree-test.c++ \
src/kj/encoding-test.c++ \
src/kj/exception-test.c++ \
src/kj/debug-test.c++ \
src/kj/arena-test.c++ \
......
......@@ -19,6 +19,7 @@ set(kj_sources_heavy
units.c++
refcount.c++
string-tree.c++
encoding.c++
parse/char.c++
)
if(NOT CAPNP_LITE)
......@@ -36,6 +37,7 @@ set(kj_headers
vector.h
string.h
string-tree.h
encoding.h
exception.h
debug.h
arena.h
......@@ -170,6 +172,7 @@ if(BUILD_TESTING)
async-io-test.c++
refcount-test.c++
string-tree-test.c++
encoding-test.c++
arena-test.c++
units-test.c++
tuple-test.c++
......
// Copyright (c) 2017 Cloudflare, Inc. and contributors
// Licensed under the MIT License:
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include "encoding.h"
#include <kj/test.h>
#include <stdint.h>
namespace kj {
namespace {
// Hexify chars correctly: each overload reinterprets the character as the unsigned integer of the
// same width before handing it to kj::hex().
//
// TODO(cleanup): Should this go into string.h with the other definitions of hex()?
CappedArray<char, sizeof(char) * 2 + 1> hex(char i) {
  return kj::hex(static_cast<uint8_t>(i));
}
CappedArray<char, sizeof(char16_t) * 2 + 1> hex(char16_t i) {
  return kj::hex(static_cast<uint16_t>(i));
}
CappedArray<char, sizeof(char32_t) * 2 + 1> hex(char32_t i) {
  return kj::hex(static_cast<uint32_t>(i));
}
template <typename T>
void expectUtf(UtfResult<T> result,
ArrayPtr<const Decay<decltype(result[0])>> expected,
bool errors = false) {
// Checks a UTF conversion result against the expected sequence of code units.
//
// `errors` states whether the conversion is expected to have set `hadErrors`. The sizes are
// compared first; the elements are then compared pairwise over the common prefix only, so a size
// mismatch never leads to out-of-range indexing. Mismatching elements are reported in hex.
if (errors) {
KJ_EXPECT(result.hadErrors);
} else {
KJ_EXPECT(!result.hadErrors);
}
KJ_EXPECT(result.size() == expected.size(), result.size(), expected.size());
for (auto i: kj::zeroTo(kj::min(result.size(), expected.size()))) {
KJ_EXPECT(result[i] == expected[i], i, hex(result[i]), hex(expected[i]));
}
}
template <typename T, size_t s>
void expectUtf(UtfResult<T> result,
               const Decay<decltype(result[0])> (&expected)[s],
               bool errors = false) {
  // Convenience overload for literals: the array extent `s` includes the literal's trailing NUL,
  // which must not take part in the comparison.
  auto expectedUnits = arrayPtr(expected, s - 1);
  expectUtf(kj::mv(result), expectedUnits, errors);
}
KJ_TEST("encode UTF-8 to UTF-16") {
// Well-formed ASCII, Cyrillic, CJK and emoji text must convert to the matching UTF-16 literal
// with no errors reported (expectUtf's `errors` argument defaults to false).
expectUtf(encodeUtf16(u8"foo"), u"foo");
expectUtf(encodeUtf16(u8"Здравствуйте"), u"Здравствуйте");
expectUtf(encodeUtf16(u8"中国网络"), u"中国网络");
expectUtf(encodeUtf16(u8"😺☁☄🐵"), u"😺☁☄🐵");
}
KJ_TEST("invalid UTF-8 to UTF-16") {
// Each row passes `true` when the input is malformed (U+FFFD expected in the output and
// hadErrors set) and `false` for the boundary values that must still be accepted.
// Disembodied continuation byte.
expectUtf(encodeUtf16("\x80"), u"\ufffd", true);
expectUtf(encodeUtf16("f\xbfo"), u"f\ufffdo", true);
// A run of stray continuation bytes collapses into a single replacement character.
expectUtf(encodeUtf16("f\xbf\x80\xb0o"), u"f\ufffdo", true);
// Missing continuation bytes.
expectUtf(encodeUtf16("\xc2x"), u"\ufffdx", true);
expectUtf(encodeUtf16("\xe0x"), u"\ufffdx", true);
expectUtf(encodeUtf16("\xe0\xa0x"), u"\ufffdx", true);
expectUtf(encodeUtf16("\xf0x"), u"\ufffdx", true);
expectUtf(encodeUtf16("\xf0\x90x"), u"\ufffdx", true);
expectUtf(encodeUtf16("\xf0\x90\x80x"), u"\ufffdx", true);
// Overlong sequences.
expectUtf(encodeUtf16("\xc0\x80"), u"\ufffd", true);
expectUtf(encodeUtf16("\xc1\xbf"), u"\ufffd", true);
expectUtf(encodeUtf16("\xc2\x80"), u"\u0080", false);
expectUtf(encodeUtf16("\xdf\xbf"), u"\u07ff", false);
expectUtf(encodeUtf16("\xe0\x80\x80"), u"\ufffd", true);
expectUtf(encodeUtf16("\xe0\x9f\xbf"), u"\ufffd", true);
expectUtf(encodeUtf16("\xe0\xa0\x80"), u"\u0800", false);
expectUtf(encodeUtf16("\xef\xbf\xbf"), u"\uffff", false);
expectUtf(encodeUtf16("\xf0\x80\x80\x80"), u"\ufffd", true);
expectUtf(encodeUtf16("\xf0\x8f\xbf\xbf"), u"\ufffd", true);
expectUtf(encodeUtf16("\xf0\x90\x80\x80"), u"\U00010000", false);
expectUtf(encodeUtf16("\xf4\x8f\xbf\xbf"), u"\U0010ffff", false);
// Out of Unicode range.
expectUtf(encodeUtf16("\xf5\x80\x80\x80"), u"\ufffd", true);
expectUtf(encodeUtf16("\xf8\xbf\x80\x80\x80"), u"\ufffd", true);
expectUtf(encodeUtf16("\xfc\xbf\x80\x80\x80\x80"), u"\ufffd", true);
expectUtf(encodeUtf16("\xfe\xbf\x80\x80\x80\x80\x80"), u"\ufffd", true);
expectUtf(encodeUtf16("\xff\xbf\x80\x80\x80\x80\x80\x80"), u"\ufffd", true);
}
KJ_TEST("encode UTF-8 to UTF-32") {
// Same well-formed samples as the UTF-16 test, compared against UTF-32 literals; no errors are
// expected.
expectUtf(encodeUtf32(u8"foo"), U"foo");
expectUtf(encodeUtf32(u8"Здравствуйте"), U"Здравствуйте");
expectUtf(encodeUtf32(u8"中国网络"), U"中国网络");
expectUtf(encodeUtf32(u8"😺☁☄🐵"), U"😺☁☄🐵");
}
KJ_TEST("invalid UTF-8 to UTF-32") {
// Mirrors the UTF-16 malformed-input cases. Rows passing `true` expect U+FFFD in the output with
// hadErrors set; rows passing `false` are boundary values that must still be accepted.
// Disembodied continuation byte.
expectUtf(encodeUtf32("\x80"), U"\ufffd", true);
expectUtf(encodeUtf32("f\xbfo"), U"f\ufffdo", true);
// A run of stray continuation bytes collapses into a single replacement character.
expectUtf(encodeUtf32("f\xbf\x80\xb0o"), U"f\ufffdo", true);
// Missing continuation bytes.
expectUtf(encodeUtf32("\xc2x"), U"\ufffdx", true);
expectUtf(encodeUtf32("\xe0x"), U"\ufffdx", true);
expectUtf(encodeUtf32("\xe0\xa0x"), U"\ufffdx", true);
expectUtf(encodeUtf32("\xf0x"), U"\ufffdx", true);
expectUtf(encodeUtf32("\xf0\x90x"), U"\ufffdx", true);
expectUtf(encodeUtf32("\xf0\x90\x80x"), U"\ufffdx", true);
// Overlong sequences.
expectUtf(encodeUtf32("\xc0\x80"), U"\ufffd", true);
expectUtf(encodeUtf32("\xc1\xbf"), U"\ufffd", true);
expectUtf(encodeUtf32("\xc2\x80"), U"\u0080", false);
expectUtf(encodeUtf32("\xdf\xbf"), U"\u07ff", false);
expectUtf(encodeUtf32("\xe0\x80\x80"), U"\ufffd", true);
expectUtf(encodeUtf32("\xe0\x9f\xbf"), U"\ufffd", true);
expectUtf(encodeUtf32("\xe0\xa0\x80"), U"\u0800", false);
expectUtf(encodeUtf32("\xef\xbf\xbf"), U"\uffff", false);
expectUtf(encodeUtf32("\xf0\x80\x80\x80"), U"\ufffd", true);
expectUtf(encodeUtf32("\xf0\x8f\xbf\xbf"), U"\ufffd", true);
expectUtf(encodeUtf32("\xf0\x90\x80\x80"), U"\U00010000", false);
expectUtf(encodeUtf32("\xf4\x8f\xbf\xbf"), U"\U0010ffff", false);
// Out of Unicode range.
expectUtf(encodeUtf32("\xf5\x80\x80\x80"), U"\ufffd", true);
expectUtf(encodeUtf32("\xf8\xbf\x80\x80\x80"), U"\ufffd", true);
expectUtf(encodeUtf32("\xfc\xbf\x80\x80\x80\x80"), U"\ufffd", true);
expectUtf(encodeUtf32("\xfe\xbf\x80\x80\x80\x80\x80"), U"\ufffd", true);
expectUtf(encodeUtf32("\xff\xbf\x80\x80\x80\x80\x80\x80"), U"\ufffd", true);
}
KJ_TEST("decode UTF-16 to UTF-8") {
// Well-formed UTF-16 literals (including emoji that need surrogate pairs) must decode to the
// matching UTF-8 literal with no errors reported.
expectUtf(decodeUtf16(u"foo"), u8"foo");
expectUtf(decodeUtf16(u"Здравствуйте"), u8"Здравствуйте");
expectUtf(decodeUtf16(u"中国网络"), u8"中国网络");
expectUtf(decodeUtf16(u"😺☁☄🐵"), u8"😺☁☄🐵");
}
KJ_TEST("invalid UTF-16 to UTF-8") {
// Every case here expects hadErrors to be set and each bad unit replaced by U+FFFD.
// Surrogates in wrong order.
expectUtf(decodeUtf16(u"\xd7ff\xdc00\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true);
// Missing second surrogate.
expectUtf(decodeUtf16(u"f\xd800"), u8"f\ufffd", true);
expectUtf(decodeUtf16(u"f\xd800x"), u8"f\ufffdx", true);
// Two lone high surrogates in a row yield two separate replacement characters.
expectUtf(decodeUtf16(u"f\xd800\xd800x"), u8"f\ufffd\ufffdx", true);
}
KJ_TEST("decode UTF-32 to UTF-8") {
// Well-formed UTF-32 literals must decode to the matching UTF-8 literal with no errors reported.
expectUtf(decodeUtf32(U"foo"), u8"foo");
expectUtf(decodeUtf32(U"Здравствуйте"), u8"Здравствуйте");
expectUtf(decodeUtf32(U"中国网络"), u8"中国网络");
expectUtf(decodeUtf32(U"😺☁☄🐵"), u8"😺☁☄🐵");
}
KJ_TEST("invalid UTF-32 to UTF-8") {
// Surrogate code points are never valid UTF-32; each one is expected to become U+FFFD while the
// neighbouring values U+D7FF and U+E000 pass through unchanged.
// Surrogates rejected.
expectUtf(decodeUtf32(U"\xd7ff\xdc00\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true);
// Even if it would be a valid surrogate pair in UTF-16.
expectUtf(decodeUtf32(U"\xd7ff\xd800\xdfff\xe000"), u8"\ud7ff\ufffd\ufffd\ue000", true);
}
KJ_TEST("tryEncode / tryDecode") {
// The try* variants must return null for malformed input and the converted value otherwise.
// Each pair below checks one malformed and one well-formed input per function.
KJ_EXPECT(tryEncodeUtf16("\x80") == nullptr);
KJ_EXPECT(ArrayPtr<const char16_t>(KJ_ASSERT_NONNULL(tryEncodeUtf16("foo")))
== arrayPtr(u"foo", 3));
KJ_EXPECT(tryEncodeUtf32("\x80") == nullptr);
KJ_EXPECT(ArrayPtr<const char32_t>(KJ_ASSERT_NONNULL(tryEncodeUtf32("foo")))
== arrayPtr(U"foo", 3));
KJ_EXPECT(tryDecodeUtf16(u"\xd800") == nullptr);
KJ_EXPECT(KJ_ASSERT_NONNULL(tryDecodeUtf16(u"foo")) == "foo");
KJ_EXPECT(tryDecodeUtf32(U"\xd800") == nullptr);
KJ_EXPECT(KJ_ASSERT_NONNULL(tryDecodeUtf32(U"foo")) == "foo");
}
// =======================================================================================
KJ_TEST("hex encoding/decoding") {
// Round trip: four bytes encode to eight lowercase hex digits, and those digits decode back to
// the same bytes.
byte bytes[] = {0x12, 0x34, 0xab, 0xf2};
KJ_EXPECT(encodeHex(bytes) == "1234abf2");
KJ_EXPECT(decodeHex("1234abf2").asPtr() == bytes);
}
KJ_TEST("URI encoding/decoding") {
// Unreserved characters pass through; everything else (space, high bytes, embedded NUL) is
// %-escaped with lowercase hex digits.
KJ_EXPECT(encodeUriComponent("foo") == "foo");
KJ_EXPECT(encodeUriComponent("foo bar") == "foo%20bar");
KJ_EXPECT(encodeUriComponent("\xab\xba") == "%ab%ba");
KJ_EXPECT(encodeUriComponent(StringPtr("foo\0bar", 7)) == "foo%00bar");
// Decoding accepts both lowercase and uppercase hex digits.
KJ_EXPECT(decodeUriComponent("foo%20bar") == "foo bar");
KJ_EXPECT(decodeUriComponent("%ab%BA") == "\xab\xba");
// Binary round trip through the byte-array overloads.
byte bytes[] = {12, 34, 56};
KJ_EXPECT(decodeBinaryUriComponent(encodeUriComponent(bytes)).asPtr() == bytes);
}
KJ_TEST("base64 encoding/decoding") {
// Covers padding, tolerant decoding, and line breaking at 72 output characters.
{
// Input length divisible by 3: no padding.
auto encoded = encodeBase64(StringPtr("foo").asBytes(), false);
KJ_EXPECT(encoded == "Zm9v", encoded, encoded.size());
KJ_EXPECT(heapString(decodeBase64(encoded.asArray()).asChars()) == "foo");
}
{
// Five input bytes: one '=' of padding.
auto encoded = encodeBase64(StringPtr("corge").asBytes(), false);
KJ_EXPECT(encoded == "Y29yZ2U=", encoded);
KJ_EXPECT(heapString(decodeBase64(encoded.asArray()).asChars()) == "corge");
}
// The decoder accepts missing padding and skips characters outside the base64 alphabet.
KJ_EXPECT(heapString(decodeBase64("Y29yZ2U").asChars()) == "corge");
KJ_EXPECT(heapString(decodeBase64("Y\n29y Z@2U=\n").asChars()) == "corge");
{
// With line breaking enabled a trailing partial line still ends in a newline.
auto encoded = encodeBase64(StringPtr("corge").asBytes(), true);
KJ_EXPECT(encoded == "Y29yZ2U=\n", encoded);
}
// 54 input bytes encode to exactly one full 72-character line.
StringPtr fullLine = "012345678901234567890123456789012345678901234567890123";
{
auto encoded = encodeBase64(fullLine.asBytes(), false);
KJ_EXPECT(
encoded == "MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIz",
encoded);
}
{
// Exactly one full line gets exactly one newline, not two.
auto encoded = encodeBase64(fullLine.asBytes(), true);
KJ_EXPECT(
encoded == "MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIz\n",
encoded);
}
// Three more bytes spill onto a second line.
String multiLine = str(fullLine, "456");
{
auto encoded = encodeBase64(multiLine.asBytes(), false);
KJ_EXPECT(
encoded == "MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2",
encoded);
}
{
auto encoded = encodeBase64(multiLine.asBytes(), true);
KJ_EXPECT(
encoded == "MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIz\n"
"NDU2\n",
encoded);
}
}
} // namespace
} // namespace kj
// Copyright (c) 2017 Cloudflare, Inc.; Sandstorm Development Group, Inc.; and contributors
// Licensed under the MIT License:
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include "encoding.h"
#include "vector.h"
#include "debug.h"
namespace kj {
namespace {
// Jumps to the `error` label of the enclosing function when `cond` holds. The condition is
// wrapped in KJ_UNLIKELY. Any function using this macro must define an `error:` label.
#define GOTO_ERROR_IF(cond) if (KJ_UNLIKELY(cond)) goto error
inline void addChar32(Vector<char16_t>& vec, char32_t u) {
  // UTF-16 output: a code point from the 4-byte UTF-8 branch is split into a high and a low
  // surrogate, each carrying ten bits of the value after subtracting 0x10000.
  const char32_t offset = u - 0x10000;
  const char32_t highSurrogate = 0xd800 | (offset >> 10);
  const char32_t lowSurrogate = 0xdc00 | (offset & 0x03ff);
  vec.add(highSurrogate);
  vec.add(lowSurrogate);
}
inline void addChar32(Vector<char32_t>& vec, char32_t u) {
  // UTF-32 output: the code point is stored as-is.
  vec.add(u);
}
template <typename T>
UtfResult<Array<T>> encodeUtf(ArrayPtr<const char> text, bool nulTerminate) {
// Shared UTF-8 decoder behind encodeUtf16() and encodeUtf32(); T is the output code unit type.
//
// Walks `text` one lead byte at a time. A well-formed sequence appends its code point (via
// addChar32() for the 4-byte case) and `continue`s. Any malformed sequence jumps to `error`,
// which appends one U+FFFD, sets hadErrors, and skips the continuation bytes that follow.
// If `nulTerminate` is set, a zero code unit is appended after the converted text.
Vector<T> result(text.size() + nulTerminate);
bool hadErrors = false;
size_t i = 0;
while (i < text.size()) {
byte c = text[i++];
if (c < 0x80) {
// 0xxxxxxx -- ASCII
result.add(c);
continue;
} else if (KJ_UNLIKELY(c < 0xc0)) {
// 10xxxxxx -- malformed continuation byte
goto error;
} else if (c < 0xe0) {
// 110xxxxx -- 2-byte
byte c2;
// The trailing `++i` only runs when the check passed, so a byte that is not a continuation
// byte is left unconsumed and handled by the next loop iteration.
GOTO_ERROR_IF(i == text.size() || ((c2 = text[i]) & 0xc0) != 0x80); ++i;
char16_t u = (static_cast<char16_t>(c & 0x1f) << 6)
| (static_cast<char16_t>(c2 & 0x3f) );
// Disallow overlong sequence.
GOTO_ERROR_IF(u < 0x80);
result.add(u);
continue;
} else if (c < 0xf0) {
// 1110xxxx -- 3-byte
byte c2, c3;
GOTO_ERROR_IF(i == text.size() || ((c2 = text[i]) & 0xc0) != 0x80); ++i;
GOTO_ERROR_IF(i == text.size() || ((c3 = text[i]) & 0xc0) != 0x80); ++i;
char16_t u = (static_cast<char16_t>(c & 0x0f) << 12)
| (static_cast<char16_t>(c2 & 0x3f) << 6)
| (static_cast<char16_t>(c3 & 0x3f) );
// Disallow overlong sequence.
GOTO_ERROR_IF(u < 0x0800);
// Disallow surrogate pair code points.
GOTO_ERROR_IF((u & 0xf800) == 0xd800);
result.add(u);
continue;
} else if (c < 0xf8) {
// 11110xxx -- 4-byte
byte c2, c3, c4;
GOTO_ERROR_IF(i == text.size() || ((c2 = text[i]) & 0xc0) != 0x80); ++i;
GOTO_ERROR_IF(i == text.size() || ((c3 = text[i]) & 0xc0) != 0x80); ++i;
GOTO_ERROR_IF(i == text.size() || ((c4 = text[i]) & 0xc0) != 0x80); ++i;
char32_t u = (static_cast<char32_t>(c & 0x07) << 18)
| (static_cast<char32_t>(c2 & 0x3f) << 12)
| (static_cast<char32_t>(c3 & 0x3f) << 6)
| (static_cast<char32_t>(c4 & 0x3f) );
// Disallow overlong sequence.
GOTO_ERROR_IF(u < 0x10000);
// Unicode ends at U+10FFFF
GOTO_ERROR_IF(u >= 0x110000);
addChar32(result, u);
continue;
} else {
// 5-byte and 6-byte sequences are not legal as they'd result in codepoints outside the
// range of Unicode.
goto error;
}
// Only reached through `goto error`; every successful branch above ends in `continue`.
error:
result.add(0xfffd);
hadErrors = true;
// Ignore all continuation bytes.
while (i < text.size() && (text[i] & 0xc0) == 0x80) {
++i;
}
}
if (nulTerminate) result.add(0);
return { result.releaseAsArray(), hadErrors };
}
} // namespace
UtfResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate) {
  // Instantiates the shared UTF-8 decoder with 16-bit output code units.
  using CodeUnit = char16_t;
  return encodeUtf<CodeUnit>(text, nulTerminate);
}
UtfResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate) {
  // Instantiates the shared UTF-8 decoder with 32-bit output code units.
  using CodeUnit = char32_t;
  return encodeUtf<CodeUnit>(text, nulTerminate);
}
Maybe<Array<char16_t>> tryEncodeUtf16(ArrayPtr<const char> text, bool nulTerminate) {
  // Strict variant of encodeUtf16(): null when the input contained any invalid sequence.
  auto encoded = encodeUtf16(text, nulTerminate);
  if (!encoded.hadErrors) {
    return kj::mv(encoded);
  }
  return nullptr;
}
Maybe<Array<char32_t>> tryEncodeUtf32(ArrayPtr<const char> text, bool nulTerminate) {
  // Strict variant of encodeUtf32(): null when the input contained any invalid sequence.
  auto encoded = encodeUtf32(text, nulTerminate);
  if (!encoded.hadErrors) {
    return kj::mv(encoded);
  }
  return nullptr;
}
UtfResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16) {
// Converts UTF-16 code units to a NUL-terminated UTF-8 String.
//
// Each code unit (or surrogate pair) is emitted as a 1- to 4-byte UTF-8 sequence. An unpaired
// or misordered surrogate jumps to `error`, which emits the UTF-8 encoding of U+FFFD and sets
// hadErrors. Only the offending unit is consumed, so the unit after it is examined afresh.
Vector<char> result(utf16.size() + 1);
bool hadErrors = false;
size_t i = 0;
while (i < utf16.size()) {
char16_t u = utf16[i++];
if (u < 0x80) {
// One byte: ASCII.
result.add(u);
continue;
} else if (u < 0x0800) {
// Two bytes: 110xxxxx 10xxxxxx.
result.addAll<std::initializer_list<char>>({
static_cast<char>(((u >> 6) ) | 0xc0),
static_cast<char>(((u ) & 0x3f) | 0x80)
});
continue;
} else if ((u & 0xf800) == 0xd800) {
// surrogate pair
char16_t u2;
GOTO_ERROR_IF(i == utf16.size() // missing second half
|| (u & 0x0400) != 0 // first half in wrong range
|| ((u2 = utf16[i]) & 0xfc00) != 0xdc00); // second half in wrong range
// The second half is consumed only after the pair has been validated.
++i;
char32_t u32 = (((u & 0x03ff) << 10) | (u2 & 0x03ff)) + 0x10000;
result.addAll<std::initializer_list<char>>({
static_cast<char>(((u32 >> 18) ) | 0xf0),
static_cast<char>(((u32 >> 12) & 0x3f) | 0x80),
static_cast<char>(((u32 >> 6) & 0x3f) | 0x80),
static_cast<char>(((u32 ) & 0x3f) | 0x80)
});
continue;
} else {
// Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
result.addAll<std::initializer_list<char>>({
static_cast<char>(((u >> 12) ) | 0xe0),
static_cast<char>(((u >> 6) & 0x3f) | 0x80),
static_cast<char>(((u ) & 0x3f) | 0x80)
});
continue;
}
// Only reached through `goto error`; every successful branch above ends in `continue`.
error:
result.addAll(StringPtr(u8"\ufffd"));
hadErrors = true;
}
// Terminating NUL for the String constructed below.
result.add(0);
return { String(result.releaseAsArray()), hadErrors };
}
UtfResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32) {
  // Converts UTF-32 code points to a NUL-terminated UTF-8 String.
  //
  // Each code point is emitted as a 1- to 4-byte UTF-8 sequence. Surrogate code points
  // (U+D800..U+DFFF) and values at or above 0x110000 are invalid: each one jumps to `error`,
  // which emits the UTF-8 encoding of U+FFFD and sets hadErrors.
  Vector<char> result(utf32.size() + 1);
  bool hadErrors = false;

  size_t i = 0;
  while (i < utf32.size()) {
    char32_t u = utf32[i++];

    if (u < 0x80) {
      // One byte: ASCII.
      result.add(u);
      continue;
    } else if (u < 0x0800) {
      // Two bytes: 110xxxxx 10xxxxxx.
      result.addAll<std::initializer_list<char>>({
        static_cast<char>(((u >>  6)       ) | 0xc0),
        static_cast<char>(((u      ) & 0x3f) | 0x80)
      });
      continue;
    } else if (u < 0x10000) {
      GOTO_ERROR_IF((u & 0xfffff800) == 0xd800);  // no surrogates allowed in utf-32
      // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
      result.addAll<std::initializer_list<char>>({
        static_cast<char>(((u >> 12)       ) | 0xe0),
        static_cast<char>(((u >>  6) & 0x3f) | 0x80),
        static_cast<char>(((u      ) & 0x3f) | 0x80)
      });
      continue;
    } else {
      GOTO_ERROR_IF(u >= 0x110000);  // outside Unicode range
      // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
      result.addAll<std::initializer_list<char>>({
        static_cast<char>(((u >> 18)       ) | 0xf0),
        static_cast<char>(((u >> 12) & 0x3f) | 0x80),
        static_cast<char>(((u >>  6) & 0x3f) | 0x80),
        static_cast<char>(((u      ) & 0x3f) | 0x80)
      });
      continue;
    }

    // Only reached through `goto error`; every successful branch above ends in `continue`.
  error:
    result.addAll(StringPtr(u8"\ufffd"));
    hadErrors = true;
  }

  // Terminating NUL for the String constructed below.
  result.add(0);
  return { String(result.releaseAsArray()), hadErrors };
}
Maybe<String> tryDecodeUtf16(ArrayPtr<const char16_t> utf16) {
  // Strict variant of decodeUtf16(): null when the input contained any invalid code unit.
  auto decoded = decodeUtf16(utf16);
  if (!decoded.hadErrors) {
    return kj::mv(decoded);
  }
  return nullptr;
}
Maybe<String> tryDecodeUtf32(ArrayPtr<const char32_t> utf32) {
  // Strict variant of decodeUtf32(): null when the input contained any invalid code point.
  auto decoded = decodeUtf32(utf32);
  if (!decoded.hadErrors) {
    return kj::mv(decoded);
  }
  return nullptr;
}
// =======================================================================================
namespace {

// Lookup table mapping a nibble value (0..15) to its lowercase hex digit.
constexpr char HEX_DIGITS[] = "0123456789abcdef";

}  // namespace
String encodeHex(ArrayPtr<const byte> input) {
  // Encodes each byte as two lowercase hex digits, high nibble first.
  //
  // The output length is known up front, so the string is allocated once and filled in place
  // instead of building one temporary heap array per input byte and joining them.
  auto result = heapString(input.size() * 2);
  char* out = result.begin();
  for (byte b: input) {
    *out++ = HEX_DIGITS[b / 16];
    *out++ = HEX_DIGITS[b % 16];
  }
  return result;
}
static uint fromDigit(char c) {
  // Returns the value (0..15) of a single hexadecimal digit, accepting either letter case.
  //
  // Any character that is not a hex digit yields 0. Letters beyond 'f'/'F' are deliberately
  // excluded: accepting them would produce values above 15, which would corrupt neighbouring
  // bits once the caller shifts and ORs two digits into one byte.
  if ('0' <= c && c <= '9') {
    return c - '0';
  } else if ('a' <= c && c <= 'f') {
    return c - ('a' - 10);
  } else if ('A' <= c && c <= 'F') {
    return c - ('A' - 10);
  } else {
    return 0;
  }
}
Array<byte> decodeHex(ArrayPtr<const char> text) {
  // Decodes pairs of hex digits into bytes, high nibble first. When `text` has odd length the
  // final unpaired character is ignored, because the byte count is size / 2 rounded down.
  const size_t byteCount = text.size() / 2;
  auto decoded = heapArray<byte>(byteCount);

  for (size_t n = 0; n < byteCount; ++n) {
    const uint high = fromDigit(text[2 * n]);
    const uint low = fromDigit(text[2 * n + 1]);
    decoded[n] = (high << 4) | low;
  }

  return decoded;
}
String encodeUriComponent(ArrayPtr<const byte> bytes) {
  // Percent-escapes every byte except ASCII letters, digits, and the punctuation - _ . ! ~ * ' ( )
  // using lowercase hex digits.
  Vector<char> escaped(bytes.size() + 1);

  for (byte b: bytes) {
    bool passThrough;
    switch (b) {
      case '-': case '_': case '.': case '!': case '~':
      case '*': case '\'': case '(': case ')':
        passThrough = true;
        break;
      default:
        passThrough = ('A' <= b && b <= 'Z') || ('a' <= b && b <= 'z') || ('0' <= b && b <= '9');
        break;
    }

    if (!passThrough) {
      escaped.add('%');
      escaped.add(HEX_DIGITS[b / 16]);
      escaped.add(HEX_DIGITS[b % 16]);
      continue;
    }
    escaped.add(b);
  }

  // Terminating NUL for the String constructed below.
  escaped.add('\0');
  return String(escaped.releaseAsArray());
}
Array<byte> decodeBinaryUriComponent(ArrayPtr<const char> text, bool nulTerminate) {
  // Reverses %-escaping: "%XY" becomes the byte with hex value XY, everything else is copied.
  // A '%' with fewer than two characters after it ends decoding; the truncated escape is dropped.
  Vector<byte> decoded(text.size() + nulTerminate);

  const size_t length = text.size();
  size_t pos = 0;
  while (pos < length) {
    const char c = text[pos++];
    if (c != '%') {
      decoded.add(c);
      continue;
    }

    // An escape needs two more characters.
    if (length - pos < 2) break;
    byte b = fromDigit(text[pos++]) << 4;
    b |= fromDigit(text[pos++]);
    decoded.add(b);
  }

  if (nulTerminate) decoded.add(0);
  return decoded.releaseAsArray();
}
// =======================================================================================
// This code is derived from libb64 which has been placed in the public domain.
// For details, see http://sourceforge.net/projects/libb64
// -------------------------------------------------------------------
// Encoder
namespace {

// Position of the next input byte within its group of three.
enum base64_encodestep {
  step_A, step_B, step_C
};

// Streaming encoder state carried between calls to base64_encode_block().
struct base64_encodestate {
  base64_encodestep step;  // where in the 3-byte group the next input byte falls
  char result;             // bits left over from the previous byte, not yet emitted
  int stepcount;           // 4-character groups emitted on the current output line
};

const int CHARS_PER_LINE = 72;

void base64_init_encodestate(base64_encodestate* state_in) {
  // Start of a fresh stream: no pending bits, nothing on the current line.
  state_in->stepcount = 0;
  state_in->result = 0;
  state_in->step = step_A;
}

char base64_encode_value(char value_in) {
  // Maps a 6-bit value to its base64 character; anything above 63 maps to '='.
  static const char* const alphabet =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  return value_in > 63 ? '=' : alphabet[static_cast<int>(value_in)];
}

int base64_encode_block(const char* plaintext_in, int length_in,
                        char* code_out, base64_encodestate* state_in, bool breakLines) {
  // Encodes `length_in` bytes, resuming from and saving to `state_in`. Returns the number of
  // characters written to `code_out`. Bits that do not yet fill a character stay in the state
  // until more input arrives or base64_encode_blockend() flushes them.
  const char* src = plaintext_in;
  const char* const srcEnd = plaintext_in + length_in;
  char* dst = code_out;
  char pending = state_in->result;
  base64_encodestep step = state_in->step;

  while (src != srcEnd) {
    const char fragment = *src++;
    switch (step) {
      case step_A:
        // First byte of a group: emit its top six bits, keep the low two.
        pending = (fragment & 0x0fc) >> 2;
        *dst++ = base64_encode_value(pending);
        pending = (fragment & 0x003) << 4;
        step = step_B;
        break;

      case step_B:
        // Second byte: complete the pending character, keep the low four bits.
        pending |= (fragment & 0x0f0) >> 4;
        *dst++ = base64_encode_value(pending);
        pending = (fragment & 0x00f) << 2;
        step = step_C;
        break;

      case step_C:
        // Third byte: complete the pending character, then emit the low six bits.
        pending |= (fragment & 0x0c0) >> 6;
        *dst++ = base64_encode_value(pending);
        pending = (fragment & 0x03f) >> 0;
        *dst++ = base64_encode_value(pending);

        ++(state_in->stepcount);
        if (breakLines && state_in->stepcount == CHARS_PER_LINE / 4) {
          *dst++ = '\n';
          state_in->stepcount = 0;
        }
        step = step_A;
        break;
    }
  }

  state_in->result = pending;
  state_in->step = step;
  return dst - code_out;
}

int base64_encode_blockend(char* code_out, base64_encodestate* state_in, bool breakLines) {
  // Flushes a partial final group with '=' padding and, when line breaking is on, terminates a
  // non-empty last line with a newline. Returns the number of characters written.
  char* dst = code_out;

  if (state_in->step != step_A) {
    *dst++ = base64_encode_value(state_in->result);
    *dst++ = '=';
    if (state_in->step == step_B) {
      // Only one input byte in the final group: two padding characters.
      *dst++ = '=';
    }
    ++state_in->stepcount;
  }

  if (breakLines && state_in->stepcount > 0) {
    *dst++ = '\n';
  }

  return dst - code_out;
}

}  // namespace
String encodeBase64(ArrayPtr<const byte> input, bool breakLines) {
  // Encodes `input` as base64 text, optionally inserting a newline after every CHARS_PER_LINE
  // output characters and after a trailing partial line.

  // Every three input bytes, rounded up, produce four output characters.
  auto numChars = (input.size() + 2) / 3 * 4;

  if (breakLines) {
    // One newline per output line, counting a trailing partial line as a line.
    uint lineCount = numChars / CHARS_PER_LINE + (numChars % CHARS_PER_LINE > 0 ? 1 : 0);
    numChars = numChars + lineCount;
  }

  auto output = heapString(numChars);

  base64_encodestate s;
  base64_init_encodestate(&s);

  char* cursor = output.begin();
  size_t total = 0;

  // Encode the whole input in one call ...
  int written = base64_encode_block(
      reinterpret_cast<const char*>(input.begin()), input.size(), cursor, &s, breakLines);
  cursor += written;
  total += written;

  // ... then flush padding and the final newline, since no more input follows.
  written = base64_encode_blockend(cursor, &s, breakLines);
  total += written;

  // The precomputed size must match what the encoder actually produced.
  KJ_ASSERT(total == output.size(), total, output.size());

  return output;
}
// -------------------------------------------------------------------
// Decoder
namespace {

// Position of the next base64 character within its group of four.
typedef enum {
  step_a, step_b, step_c, step_d
} base64_decodestep;

// Streaming decoder state carried between calls to base64_decode_block().
typedef struct {
  base64_decodestep step;  // where in the 4-character group the next character falls
  char plainchar;          // partially assembled output byte, meaningful unless step == step_a
} base64_decodestate;

int base64_decode_value(char value_in) {
  // Returns the 6-bit value of a base64 character, -2 for the padding character '=', and -1 for
  // any character outside the base64 alphabet.
  //
  // The table covers the characters '+' (43) through 'z' (122). It is explicitly signed so the
  // negative sentinels survive on platforms where plain `char` is unsigned.
  static const signed char decoding[] = {
    62,-1,-1,-1,63,52,53,54,55,56,57,58,59,60,61,-1,-1,-1,-2,-1,-1,-1,
    0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,-1,
    26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51};
  static const int decoding_size = sizeof(decoding);

  // Index in `int` so the subtraction cannot wrap, and reject index == decoding_size: the
  // previous `>` comparison let '{' (123) read one element past the end of the table.
  const int index = static_cast<int>(static_cast<unsigned char>(value_in)) - 43;
  if (index < 0 || index >= decoding_size) return -1;
  return decoding[index];
}

void base64_init_decodestate(base64_decodestate* state_in) {
  state_in->step = step_a;
  state_in->plainchar = 0;
}

int base64_decode_block(const char* code_in, const int length_in,
                        char* plaintext_out, base64_decodestate* state_in) {
  // Decodes `length_in` characters, resuming from and saving to `state_in`. Characters for which
  // base64_decode_value() is negative (padding and non-alphabet characters) are skipped.
  // Returns the number of complete bytes written to `plaintext_out`.
  //
  // The switch jumps into the middle of the loop so that decoding resumes at the step where the
  // previous call ran out of input.
  const char* codechar = code_in;
  const char* const codeend = code_in + length_in;
  char* plainchar = plaintext_out;
  int fragment;

  // Restore the partially assembled byte only when there is one. In step_a nothing is pending,
  // and writing here unconditionally would touch the output even when it has zero capacity
  // (e.g. empty input).
  if (state_in->step != step_a) {
    *plainchar = state_in->plainchar;
  }

  switch (state_in->step)
  {
    while (1)
    {
  case step_a:
      do {
        if (codechar == codeend) {
          state_in->step = step_a;
          // No byte is in progress at this step, so there is nothing to save. Reading
          // *plainchar here would read one past the last decoded byte.
          state_in->plainchar = 0;
          return plainchar - plaintext_out;
        }
        fragment = base64_decode_value(*codechar++);
      } while (fragment < 0);
      *plainchar = (fragment & 0x03f) << 2;
  case step_b:
      do {
        if (codechar == codeend) {
          state_in->step = step_b;
          state_in->plainchar = *plainchar;
          return plainchar - plaintext_out;
        }
        fragment = base64_decode_value(*codechar++);
      } while (fragment < 0);
      *plainchar++ |= (fragment & 0x030) >> 4;
      *plainchar = (fragment & 0x00f) << 4;
  case step_c:
      do {
        if (codechar == codeend) {
          state_in->step = step_c;
          state_in->plainchar = *plainchar;
          return plainchar - plaintext_out;
        }
        fragment = base64_decode_value(*codechar++);
      } while (fragment < 0);
      *plainchar++ |= (fragment & 0x03c) >> 2;
      *plainchar = (fragment & 0x003) << 6;
  case step_d:
      do {
        if (codechar == codeend) {
          state_in->step = step_d;
          state_in->plainchar = *plainchar;
          return plainchar - plaintext_out;
        }
        fragment = base64_decode_value(*codechar++);
      } while (fragment < 0);
      *plainchar++ |= (fragment & 0x03f);
    }
  }
  /* control should not reach here */
  return plainchar - plaintext_out;
}

}  // namespace
Array<byte> decodeBase64(ArrayPtr<const char> input) {
  // Decodes base64 text into bytes. The buffer is first sized for the case where every input
  // character carries six bits; if fewer bytes were produced (padding or skipped characters),
  // the result is copied into an array of the exact size.
  base64_decodestate decoder;
  base64_init_decodestate(&decoder);

  const size_t maxBytes = (input.size() * 6 + 7) / 8;
  auto decoded = heapArray<byte>(maxBytes);

  const size_t actualBytes = base64_decode_block(
      input.begin(), input.size(), reinterpret_cast<char*>(decoded.begin()), &decoder);

  if (actualBytes >= decoded.size()) {
    return decoded;
  }

  auto trimmed = heapArray<byte>(actualBytes);
  memcpy(trimmed.begin(), decoded.begin(), actualBytes);
  return trimmed;
}
} // namespace kj
// Copyright (c) 2017 Cloudflare, Inc. and contributors
// Licensed under the MIT License:
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef KJ_ENCODING_H_
#define KJ_ENCODING_H_
// Functions for encoding/decoding bytes and text in common formats, including:
// - UTF-{8,16,32}
// - Hex
// - URI encoding
// - Base64
#if defined(__GNUC__) && !KJ_HEADER_WARNINGS
#pragma GCC system_header
#endif
#include "string.h"
namespace kj {
template <typename ResultType>
struct UtfResult: public ResultType {
  // Behaves like ResultType (a String or wide-char array) in every respect, and additionally
  // carries `hadErrors`, which tells whether any invalid input was encountered and replaced by
  // the replacement character (U+FFFD) in the output.

  inline UtfResult(ResultType&& value, bool sawErrors)
      : ResultType(kj::mv(value)),
        hadErrors(sawErrors) {}

  const bool hadErrors;
  // If true, then invalid sequences were detected in the input and were replaced with the Unicode
  // replacement character (U+FFFD) in the output. Many applications will choose to ignore this
  // boolean and continue on with the damaged data.
};
UtfResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false);
UtfResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false);
Maybe<Array<char16_t>> tryEncodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false);
Maybe<Array<char32_t>> tryEncodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false);
// Convert UTF-8 text (which KJ strings use) to UTF-16 or UTF-32.
//
// If `nulTerminate` is true, an extra NUL character will be added to the end of the output.
//
// The `try` versions return null if the input is invalid; the non-`try` versions return data
// containing the Unicode replacement character (U+FFFD).
//
// The returned arrays are in platform-native endianness (otherwise they wouldn't really be
// char16_t / char32_t).
UtfResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16);
UtfResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32);
Maybe<String> tryDecodeUtf16(ArrayPtr<const char16_t> utf16);
Maybe<String> tryDecodeUtf32(ArrayPtr<const char32_t> utf32);
// Convert UTF-16 or UTF-32 to UTF-8 (which KJ strings use).
//
// The input should NOT include a NUL terminator; any NUL characters in the input array will be
// preserved in the output.
//
// The `try` versions return null if the input is invalid; the non-`try` versions return data
// containing the Unicode replacement character (U+FFFD).
//
// The input must be in platform-native endianness. BOMs are NOT recognized by these functions.
String encodeHex(ArrayPtr<const byte> bytes);
Array<byte> decodeHex(ArrayPtr<const char> text);
// Encode/decode bytes as hex strings.
String encodeUriComponent(ArrayPtr<const byte> bytes);
String encodeUriComponent(ArrayPtr<const char> bytes);
Array<byte> decodeBinaryUriComponent(ArrayPtr<const char> text, bool nulTerminate = false);
String decodeUriComponent(ArrayPtr<const char> text);
// Encode/decode URI components using % escapes. See Javascript's encodeURIComponent().
String encodeBase64(ArrayPtr<const byte> bytes, bool breakLines = false);
// Encode the given bytes as base64 text. If `breakLines` is true, line breaks will be inserted
// into the output every 72 characters (e.g. for encoding e-mail bodies).
Array<byte> decodeBase64(ArrayPtr<const char> text);
// Decode base64 text. Non-base64 characters are ignored.
// =======================================================================================
// inline implementation details
inline String encodeUriComponent(ArrayPtr<const char> text) {
  // Text overload: view the characters as raw bytes and defer to the byte-array overload.
  ArrayPtr<const byte> raw = text.asBytes();
  return encodeUriComponent(raw);
}
inline String decodeUriComponent(ArrayPtr<const char> text) {
  // Decode with a trailing NUL requested so the resulting bytes can be adopted as a String.
  auto decoded = decodeBinaryUriComponent(text, true);
  return String(decoded.releaseAsChars());
}
// If you pass a string literal to a function taking ArrayPtr<const char>, it'll include the NUL
// terminator, which is surprising. Let's add overloads that avoid that. In practice this probably
// only even matters for encoding-test.c++.
template <size_t s>
inline UtfResult<Array<char16_t>> encodeUtf16(const char (&text)[s], bool nulTerminate = false) {
  // `s` counts the literal's NUL terminator; leave it out of the input.
  ArrayPtr<const char> withoutNul = arrayPtr(text, s - 1);
  return encodeUtf16(withoutNul, nulTerminate);
}
template <size_t s>
inline UtfResult<Array<char32_t>> encodeUtf32(const char (&text)[s], bool nulTerminate = false) {
  // `s` counts the literal's NUL terminator; leave it out of the input.
  ArrayPtr<const char> withoutNul = arrayPtr(text, s - 1);
  return encodeUtf32(withoutNul, nulTerminate);
}
template <size_t s>
inline Maybe<Array<char16_t>> tryEncodeUtf16(const char (&text)[s], bool nulTerminate = false) {
  // `s` counts the literal's NUL terminator; leave it out of the input.
  ArrayPtr<const char> withoutNul = arrayPtr(text, s - 1);
  return tryEncodeUtf16(withoutNul, nulTerminate);
}
template <size_t s>
inline Maybe<Array<char32_t>> tryEncodeUtf32(const char (&text)[s], bool nulTerminate = false) {
  // `s` counts the literal's NUL terminator; leave it out of the input.
  ArrayPtr<const char> withoutNul = arrayPtr(text, s - 1);
  return tryEncodeUtf32(withoutNul, nulTerminate);
}
template <size_t s>
inline UtfResult<String> decodeUtf16(const char16_t (&utf16)[s]) {
  // `s` counts the literal's NUL terminator; leave it out of the input.
  ArrayPtr<const char16_t> withoutNul = arrayPtr(utf16, s - 1);
  return decodeUtf16(withoutNul);
}
template <size_t s>
inline UtfResult<String> decodeUtf32(const char32_t (&utf32)[s]) {
  // `s` counts the literal's NUL terminator; leave it out of the input.
  ArrayPtr<const char32_t> withoutNul = arrayPtr(utf32, s - 1);
  return decodeUtf32(withoutNul);
}
template <size_t s>
inline Maybe<String> tryDecodeUtf16(const char16_t (&utf16)[s]) {
  // `s` counts the literal's NUL terminator; leave it out of the input.
  ArrayPtr<const char16_t> withoutNul = arrayPtr(utf16, s - 1);
  return tryDecodeUtf16(withoutNul);
}
template <size_t s>
inline Maybe<String> tryDecodeUtf32(const char32_t (&utf32)[s]) {
  // `s` counts the literal's NUL terminator; leave it out of the input.
  ArrayPtr<const char32_t> withoutNul = arrayPtr(utf32, s - 1);
  return tryDecodeUtf32(withoutNul);
}
template <size_t s>
inline Array<byte> decodeHex(const char (&text)[s]) {
  // `s` counts the literal's NUL terminator; leave it out of the input.
  ArrayPtr<const char> withoutNul = arrayPtr(text, s - 1);
  return decodeHex(withoutNul);
}
template <size_t s>
inline String encodeUriComponent(const char (&text)[s]) {
  // `s` counts the literal's NUL terminator; leave it out of the input.
  ArrayPtr<const char> withoutNul = arrayPtr(text, s - 1);
  return encodeUriComponent(withoutNul);
}
template <size_t s>
inline Array<byte> decodeBinaryUriComponent(const char (&text)[s], bool nulTerminate = false) {
  // Literal overload: `s` counts the literal's NUL terminator, which is left out of the input.
  //
  // `nulTerminate` mirrors the ArrayPtr overload. Without it, a two-argument call with a literal
  // would bypass this overload, convert the whole array (terminator included) to an ArrayPtr,
  // and decode the terminator as data.
  return decodeBinaryUriComponent(arrayPtr(text, s - 1), nulTerminate);
}
template <size_t s>
inline String decodeUriComponent(const char (&text)[s]) {
  // `s` counts the literal's NUL terminator; leave it out of the input. A trailing NUL is
  // requested from the decoder so the resulting bytes can be adopted as a String.
  ArrayPtr<const char> withoutNul = arrayPtr(text, s - 1);
  auto decoded = decodeBinaryUriComponent(withoutNul, true);
  return String(decoded.releaseAsChars());
}
template <size_t s>
inline Array<byte> decodeBase64(const char (&text)[s]) {
  // Literal overload: `s` counts the literal's NUL terminator, which is left out of the input.
  // Declared `inline` like every other literal overload in this header.
  return decodeBase64(arrayPtr(text, s - 1));
}
} // namespace kj
#endif // KJ_ENCODING_H_
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment