Commit f3e0ed22 authored by Harris Hancock

decodeBase64() reports errors required by HTML spec

This change modifies decodeBase64() to report errors as required by the WHATWG HTML spec's atob() JavaScript function. Notably, it reports errors for non-whitespace characters outside of the valid base64 character range ([+/0-9A-Za-z=]), and performs sanity checks on padding and input length.

I took care to keep the algorithm single-pass, and to support streaming via multiple calls of base64_decode_block(), though we don't currently expose that functionality.
parent 9306bc07
......@@ -277,7 +277,9 @@ KJ_TEST("base64 encoding/decoding") {
{
auto encoded = encodeBase64(StringPtr("foo").asBytes(), false);
KJ_EXPECT(encoded == "Zm9v", encoded, encoded.size());
KJ_EXPECT(heapString(decodeBase64(encoded.asArray()).asChars()) == "foo");
auto decoded = decodeBase64(encoded.asArray());
KJ_EXPECT(!decoded.hadErrors);
KJ_EXPECT(heapString(decoded.asChars()) == "foo");
}
{
......@@ -289,11 +291,35 @@ KJ_TEST("base64 encoding/decoding") {
{
auto encoded = encodeBase64(StringPtr("corge").asBytes(), false);
KJ_EXPECT(encoded == "Y29yZ2U=", encoded);
KJ_EXPECT(heapString(decodeBase64(encoded.asArray()).asChars()) == "corge");
auto decoded = decodeBase64(encoded.asArray());
KJ_EXPECT(!decoded.hadErrors);
KJ_EXPECT(heapString(decoded.asChars()) == "corge");
}
KJ_EXPECT(heapString(decodeBase64("Y29yZ2U").asChars()) == "corge");
KJ_EXPECT(heapString(decodeBase64("Y\n29y Z@2U=\n").asChars()) == "corge");
{
auto decoded = decodeBase64("Y29yZ2U");
KJ_EXPECT(!decoded.hadErrors);
KJ_EXPECT(heapString(decoded.asChars()) == "corge");
}
{
auto decoded = decodeBase64("Y\n29y Z@2U=\n");
KJ_EXPECT(decoded.hadErrors); // @-sign is invalid base64 input.
KJ_EXPECT(heapString(decoded.asChars()) == "corge");
}
{
auto decoded = decodeBase64("Y\n29y Z2U=\n");
KJ_EXPECT(!decoded.hadErrors);
KJ_EXPECT(heapString(decoded.asChars()) == "corge");
}
// Too much padding.
KJ_EXPECT(decodeBase64("Y29yZ2U==").hadErrors);
KJ_EXPECT(decodeBase64("Y29yZ===").hadErrors);
// Non-terminal padding.
KJ_EXPECT(decodeBase64("ab=c").hadErrors);
{
auto encoded = encodeBase64(StringPtr("corge").asBytes(), true);
......
......@@ -661,26 +661,37 @@ typedef enum {
} base64_decodestep;
// Decoder state carried across calls to base64_decode_block(), so that input may be fed in
// chunks.
typedef struct {
base64_decodestep step;
char plainchar;
// Set when base64_decode_block() detects malformed input.
bool hadErrors = false;
// Number of padding bytes ('=') counted so far by steps C and D of the decoder.
size_t nPaddingBytesSeen = 0;
// Output state. `nPaddingBytesSeen` is not guaranteed to be correct if `hadErrors` is true. It is
// included in the state purely to preserve the streaming capability of the algorithm while still
// checking for errors correctly (consider chunk 1 = "abc=", chunk 2 = "d").
// Step at which the decoder resumes on the next call to base64_decode_block().
base64_decodestep step = step_a;
// Partially assembled output byte, saved when the decoder suspends at the end of a chunk.
char plainchar = 0;
} base64_decodestate;
int base64_decode_value(char value_in) {
  // Maps one input character to its 6-bit base64 fragment value (0-63), or to a negative
  // sentinel: -1 on whitespace, -2 on padding ('='), -3 on invalid input.
  //
  // Note that the original libb64 implementation used -1 for invalid input, -2 on padding -- this
  // new scheme allows for some simpler error checks in steps A and B.

  // Indexed directly by ASCII code, covering 0x00 through 'z' (0x7A). Declared `signed char`
  // because plain `char` may be unsigned, which would turn the negative sentinels into large
  // positive values.
  static const signed char decoding[] = {
    -3,-3,-3,-3,-3,-3,-3,-3,  -3,-1,-1,-3,-1,-1,-3,-3,
    -3,-3,-3,-3,-3,-3,-3,-3,  -3,-3,-3,-3,-3,-3,-3,-3,
    -1,-3,-3,-3,-3,-3,-3,-3,  -3,-3,-3,62,-3,-3,-3,63,
    52,53,54,55,56,57,58,59,  60,61,-3,-3,-3,-2,-3,-3,
    -3, 0, 1, 2, 3, 4, 5, 6,   7, 8, 9,10,11,12,13,14,
    15,16,17,18,19,20,21,22,  23,24,25,-3,-3,-3,-3,-3,
    -3,26,27,28,29,30,31,32,  33,34,35,36,37,38,39,40,
    41,42,43,44,45,46,47,48,  49,50,51
  };

  // Go through `unsigned char` so that bytes >= 0x80 become large indices rather than negative
  // ones, whatever the signedness of plain `char`.
  const unsigned char index = static_cast<unsigned char>(value_in);

  // Must be `>=`: an index equal to the table size ('{') is already one past the end.
  if (index >= sizeof(decoding)) return -3;
  return decoding[index];
}
// Resets `state_in` so that decoding starts from scratch at step A.
//
// Every field is reset, including the error-tracking ones, so that re-initializing a state
// object that was already used does not carry a stale `hadErrors` flag or padding count into the
// next decode.
void base64_init_decodestate(base64_decodestate* state_in) {
  state_in->hadErrors = false;
  state_in->nPaddingBytesSeen = 0;
  state_in->step = step_a;
  state_in->plainchar = 0;
}
int base64_decode_block(const char* code_in, const int length_in,
char* plaintext_out, base64_decodestate* state_in) {
const char* codechar = code_in;
......@@ -691,6 +702,8 @@ int base64_decode_block(const char* code_in, const int length_in,
*plainchar = state_in->plainchar;
}
#define ERROR_IF(predicate) state_in->hadErrors = state_in->hadErrors || (predicate)
switch (state_in->step)
{
while (1)
......@@ -703,6 +716,8 @@ int base64_decode_block(const char* code_in, const int length_in,
return plainchar - plaintext_out;
}
fragment = (char)base64_decode_value(*codechar++);
// It is an error to see invalid or padding bytes in step A.
ERROR_IF(fragment < -1);
} while (fragment < 0);
*plainchar = (fragment & 0x03f) << 2;
case step_b:
......@@ -710,9 +725,15 @@ int base64_decode_block(const char* code_in, const int length_in,
if (codechar == code_in+length_in) {
state_in->step = step_b;
state_in->plainchar = *plainchar;
// It is always an error to suspend from step B, because we don't have enough bits yet.
// TODO(someday): This actually breaks the streaming use case, if base64_decode_block() is
// to be called multiple times. We'll fix it if we ever care to support streaming.
state_in->hadErrors = true;
return plainchar - plaintext_out;
}
fragment = (char)base64_decode_value(*codechar++);
// It is an error to see invalid or padding bytes in step B.
ERROR_IF(fragment < -1);
} while (fragment < 0);
*plainchar++ |= (fragment & 0x030) >> 4;
*plainchar = (fragment & 0x00f) << 4;
......@@ -721,10 +742,18 @@ int base64_decode_block(const char* code_in, const int length_in,
if (codechar == code_in+length_in) {
state_in->step = step_c;
state_in->plainchar = *plainchar;
// It is an error to complete from step C if we have seen incomplete padding.
// TODO(someday): This actually breaks the streaming use case, if base64_decode_block() is
// to be called multiple times. We'll fix it if we ever care to support streaming.
ERROR_IF(state_in->nPaddingBytesSeen == 1);
return plainchar - plaintext_out;
}
fragment = (char)base64_decode_value(*codechar++);
// It is an error to see invalid bytes or more than two padding bytes in step C.
ERROR_IF(fragment < -2 || (fragment == -2 && ++state_in->nPaddingBytesSeen > 2));
} while (fragment < 0);
// It is an error to continue from step C after having seen any padding.
ERROR_IF(state_in->nPaddingBytesSeen > 0);
*plainchar++ |= (fragment & 0x03c) >> 2;
*plainchar = (fragment & 0x003) << 6;
case step_d:
......@@ -735,19 +764,25 @@ int base64_decode_block(const char* code_in, const int length_in,
return plainchar - plaintext_out;
}
fragment = (char)base64_decode_value(*codechar++);
// It is an error to see invalid bytes or more than one padding byte in step D.
ERROR_IF(fragment < -2 || (fragment == -2 && ++state_in->nPaddingBytesSeen > 1));
} while (fragment < 0);
// It is an error to continue from step D after having seen padding bytes.
ERROR_IF(state_in->nPaddingBytesSeen > 0);
*plainchar++ |= (fragment & 0x03f);
}
}
#undef ERROR_IF
/* control should not reach here */
return plainchar - plaintext_out;
}
} // namespace
Array<byte> decodeBase64(ArrayPtr<const char> input) {
EncodingResult<Array<byte>> decodeBase64(ArrayPtr<const char> input) {
base64_decodestate state;
base64_init_decodestate(&state);
auto output = heapArray<byte>((input.size() * 6 + 7) / 8);
......@@ -760,7 +795,7 @@ Array<byte> decodeBase64(ArrayPtr<const char> input) {
output = kj::mv(copy);
}
return output;
return EncodingResult<Array<byte>>(kj::mv(output), state.hadErrors);
}
} // namespace kj
......@@ -97,9 +97,9 @@ String encodeBase64(ArrayPtr<const byte> bytes, bool breakLines = false);
// Encode the given bytes as base64 text. If `breakLines` is true, line breaks will be inserted
// into the output every 72 characters (e.g. for encoding e-mail bodies).
Array<byte> decodeBase64(ArrayPtr<const char> text);
// Decode base64 text. Non-base64 characters are ignored and padding characters are not required;
// as such, this function never fails.
EncodingResult<Array<byte>> decodeBase64(ArrayPtr<const char> text);
// Decode base64 text. This function reports errors required by the WHATWG HTML/Infra specs: see
// https://html.spec.whatwg.org/multipage/webappapis.html#atob for details.
// =======================================================================================
// inline implementation details
......@@ -200,7 +200,7 @@ inline EncodingResult<String> decodeCEscape(const char (&text)[s]) {
return decodeCEscape(arrayPtr(text, s-1));
}
// Overload for fixed-size char arrays: forwards the first `s - 1` characters to the ArrayPtr
// overload (presumably to drop a string literal's trailing NUL -- confirm against callers).
template <size_t s>
Array<byte> decodeBase64(const char (&text)[s]) {
EncodingResult<Array<byte>> decodeBase64(const char (&text)[s]) {
return decodeBase64(arrayPtr(text, s - 1));
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment