Merge pull request #595 from capnproto/add-decode-base-64-errors

Make decodeBase64() report errors

Merge pull request #595 from capnproto/add-decode-base-64-errors
Make decodeBase64() report errors
6a59486e · Kenton Varda · GitHub · 9306bc07 · c137c9fd · 6a59486e
Unverified Commit 6a59486e authored Dec 04, 2017 by Kenton Varda Committed by GitHub Dec 04, 2017
Hide whitespace changes
Inline Side-by-side

Showing with 94 additions and 25 deletions

encoding-test.c++ c++/src/kj/encoding-test.c++ +30 -4

encoding.c++ c++/src/kj/encoding.c++ +60 -17

encoding.h c++/src/kj/encoding.h +4 -4

No files found.
--- a/c++/src/kj/encoding-test.c++
+++ b/c++/src/kj/encoding-test.c++
@@ -277,7 +277,9 @@ KJ_TEST("base64 encoding/decoding") {
  {
    auto encoded = encodeBase64(StringPtr("foo").asBytes(), false);
    KJ_EXPECT(encoded == "Zm9v", encoded, encoded.size());
-    KJ_EXPECT(heapString(decodeBase64(encoded.asArray()).asChars()) == "foo");
+    auto decoded = decodeBase64(encoded.asArray());
+    KJ_EXPECT(!decoded.hadErrors);
+    KJ_EXPECT(heapString(decoded.asChars()) == "foo");
  }
  {
@@ -289,11 +291,35 @@ KJ_TEST("base64 encoding/decoding") {
  {
    auto encoded = encodeBase64(StringPtr("corge").asBytes(), false);
    KJ_EXPECT(encoded == "Y29yZ2U=", encoded);
-    KJ_EXPECT(heapString(decodeBase64(encoded.asArray()).asChars()) == "corge");
+    auto decoded = decodeBase64(encoded.asArray());
+    KJ_EXPECT(!decoded.hadErrors);
+    KJ_EXPECT(heapString(decoded.asChars()) == "corge");
  }
-  KJ_EXPECT(heapString(decodeBase64("Y29yZ2U").asChars()) == "corge");
+  {
-  KJ_EXPECT(heapString(decodeBase64("Y\n29y Z@2U=\n").asChars()) == "corge");
+    auto decoded = decodeBase64("Y29yZ2U");
+    KJ_EXPECT(!decoded.hadErrors);
+    KJ_EXPECT(heapString(decoded.asChars()) == "corge");
+  }
+  {
+    auto decoded = decodeBase64("Y\n29y Z@2U=\n");
+    KJ_EXPECT(decoded.hadErrors);  // @-sign is invalid base64 input.
+    KJ_EXPECT(heapString(decoded.asChars()) == "corge");
+  }
+  {
+    auto decoded = decodeBase64("Y\n29y Z2U=\n");
+    KJ_EXPECT(!decoded.hadErrors);
+    KJ_EXPECT(heapString(decoded.asChars()) == "corge");
+  }
+  // Too much padding.
+  KJ_EXPECT(decodeBase64("Y29yZ2U==").hadErrors);
+  KJ_EXPECT(decodeBase64("Y29yZ===").hadErrors);
+  // Non-terminal padding.
+  KJ_EXPECT(decodeBase64("ab=c").hadErrors);
  {
    auto encoded = encodeBase64(StringPtr("corge").asBytes(), true);

--- a/c++/src/kj/encoding.c++
+++ b/c++/src/kj/encoding.c++
@@ -661,24 +661,43 @@ typedef enum {
 } base64_decodestep;
 typedef struct {
-  base64_decodestep step;
+  bool hadErrors = false;
-  char plainchar;
+  size_t nPaddingBytesSeen = 0;
+  // Output state. `nPaddingBytesSeen` is not guaranteed to be correct if `hadErrors` is true. It is
+  // included in the state purely to preserve the streaming capability of the algorithm while still
+  // checking for errors correctly (consider chunk 1 = "abc=", chunk 2 = "d").
+  base64_decodestep step = step_a;
+  char plainchar = 0;
 } base64_decodestate;
 int base64_decode_value(char value_in) {
-  static const char decoding[] = {
+  // Returns either the fragment value or: -1 on whitespace, -2 on padding, -3 on invalid input.
-    62,-1,-1,-1,63,52,53,54,55,56,57,58,59,60,61,-1,-1,-1,-2,-1,-1,-1,
+  //
-    0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,-1,
+  // Note that the original libb64 implementation used -1 for invalid input, -2 on padding -- this
-    26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51};
+  // new scheme allows for some simpler error checks in steps A and B.
-  static const char decoding_size = sizeof(decoding);
-  value_in -= 43;
-  if (value_in < 0 || value_in > decoding_size) return -1;
-  return decoding[(int)value_in];
-}
-void base64_init_decodestate(base64_decodestate* state_in) {
+  static const char decoding[] = {
-  state_in->step = step_a;
+    -3,-3,-3,-3,-3,-3,-3,-3,  -3,-1,-1,-3,-1,-1,-3,-3,
-  state_in->plainchar = 0;
+    -3,-3,-3,-3,-3,-3,-3,-3,  -3,-3,-3,-3,-3,-3,-3,-3,
+    -1,-3,-3,-3,-3,-3,-3,-3,  -3,-3,-3,62,-3,-3,-3,63,
+    52,53,54,55,56,57,58,59,  60,61,-3,-3,-3,-2,-3,-3,
+    -3, 0, 1, 2, 3, 4, 5, 6,   7, 8, 9,10,11,12,13,14,
+    15,16,17,18,19,20,21,22,  23,24,25,-3,-3,-3,-3,-3,
+    -3,26,27,28,29,30,31,32,  33,34,35,36,37,38,39,40,
+    41,42,43,44,45,46,47,48,  49,50,51,-3,-3,-3,-3,-3,
+    -3,-3,-3,-3,-3,-3,-3,-3,  -3,-3,-3,-3,-3,-3,-3,-3,
+    -3,-3,-3,-3,-3,-3,-3,-3,  -3,-3,-3,-3,-3,-3,-3,-3,
+    -3,-3,-3,-3,-3,-3,-3,-3,  -3,-3,-3,-3,-3,-3,-3,-3,
+    -3,-3,-3,-3,-3,-3,-3,-3,  -3,-3,-3,-3,-3,-3,-3,-3,
+    -3,-3,-3,-3,-3,-3,-3,-3,  -3,-3,-3,-3,-3,-3,-3,-3,
+    -3,-3,-3,-3,-3,-3,-3,-3,  -3,-3,-3,-3,-3,-3,-3,-3,
+    -3,-3,-3,-3,-3,-3,-3,-3,  -3,-3,-3,-3,-3,-3,-3,-3,
+    -3,-3,-3,-3,-3,-3,-3,-3,  -3,-3,-3,-3,-3,-3,-3,-3,
+  };
+  static_assert(sizeof(decoding) == 256, "base64 decoding table size error");
+  return decoding[(unsigned char)value_in];
 }
 int base64_decode_block(const char* code_in, const int length_in,
@@ -691,6 +710,8 @@ int base64_decode_block(const char* code_in, const int length_in,
    *plainchar = state_in->plainchar;
  }
+#define ERROR_IF(predicate) state_in->hadErrors = state_in->hadErrors || (predicate)
  switch (state_in->step)
  {
    while (1)
@@ -703,6 +724,8 @@ int base64_decode_block(const char* code_in, const int length_in,
          return plainchar - plaintext_out;
        }
        fragment = (char)base64_decode_value(*codechar++);
+        // It is an error to see invalid or padding bytes in step A.
+        ERROR_IF(fragment < -1);
      } while (fragment < 0);
      *plainchar    = (fragment & 0x03f) << 2;
  case step_b:
@@ -710,9 +733,15 @@ int base64_decode_block(const char* code_in, const int length_in,
        if (codechar == code_in+length_in) {
          state_in->step = step_b;
          state_in->plainchar = *plainchar;
+          // It is always an error to suspend from step B, because we don't have enough bits yet.
+          // TODO(someday): This actually breaks the streaming use case, if base64_decode_block() is
+          //   to be called multiple times. We'll fix it if we ever care to support streaming.
+          state_in->hadErrors = true;
          return plainchar - plaintext_out;
        }
        fragment = (char)base64_decode_value(*codechar++);
+        // It is an error to see invalid or padding bytes in step B.
+        ERROR_IF(fragment < -1);
      } while (fragment < 0);
      *plainchar++ |= (fragment & 0x030) >> 4;
      *plainchar    = (fragment & 0x00f) << 4;
@@ -721,10 +750,18 @@ int base64_decode_block(const char* code_in, const int length_in,
        if (codechar == code_in+length_in) {
          state_in->step = step_c;
          state_in->plainchar = *plainchar;
+          // It is an error to complete from step C if we have seen incomplete padding.
+          // TODO(someday): This actually breaks the streaming use case, if base64_decode_block() is
+          //   to be called multiple times. We'll fix it if we ever care to support streaming.
+          ERROR_IF(state_in->nPaddingBytesSeen == 1);
          return plainchar - plaintext_out;
        }
        fragment = (char)base64_decode_value(*codechar++);
+        // It is an error to see invalid bytes or more than two padding bytes in step C.
+        ERROR_IF(fragment < -2 || (fragment == -2 && ++state_in->nPaddingBytesSeen > 2));
      } while (fragment < 0);
+      // It is an error to continue from step C after having seen any padding.
+      ERROR_IF(state_in->nPaddingBytesSeen > 0);
      *plainchar++ |= (fragment & 0x03c) >> 2;
      *plainchar    = (fragment & 0x003) << 6;
  case step_d:
@@ -735,19 +772,25 @@ int base64_decode_block(const char* code_in, const int length_in,
          return plainchar - plaintext_out;
        }
        fragment = (char)base64_decode_value(*codechar++);
+        // It is an error to see invalid bytes or more than one padding byte in step D.
+        ERROR_IF(fragment < -2 || (fragment == -2 && ++state_in->nPaddingBytesSeen > 1));
      } while (fragment < 0);
+      // It is an error to continue from step D after having seen padding bytes.
+      ERROR_IF(state_in->nPaddingBytesSeen > 0);
      *plainchar++   |= (fragment & 0x03f);
    }
  }
+#undef ERROR_IF
  /* control should not reach here */
  return plainchar - plaintext_out;
 }
 }  // namespace
-Array<byte> decodeBase64(ArrayPtr<const char> input) {
+EncodingResult<Array<byte>> decodeBase64(ArrayPtr<const char> input) {
  base64_decodestate state;
-  base64_init_decodestate(&state);
  auto output = heapArray<byte>((input.size() * 6 + 7) / 8);
@@ -760,7 +803,7 @@ Array<byte> decodeBase64(ArrayPtr<const char> input) {
    output = kj::mv(copy);
  }
-  return output;
+  return EncodingResult<Array<byte>>(kj::mv(output), state.hadErrors);
 }
 } // namespace kj
--- a/c++/src/kj/encoding.h
+++ b/c++/src/kj/encoding.h
@@ -97,9 +97,9 @@ String encodeBase64(ArrayPtr<const byte> bytes, bool breakLines = false);
 // Encode the given bytes as base64 text. If `breakLines` is true, line breaks will be inserted
 // into the output every 72 characters (e.g. for encoding e-mail bodies).
-Array<byte> decodeBase64(ArrayPtr<const char> text);
+EncodingResult<Array<byte>> decodeBase64(ArrayPtr<const char> text);
-// Decode base64 text. Non-base64 characters are ignored and padding characters are not requried;
+// Decode base64 text. This function reports errors required by the WHATWG HTML/Infra specs: see
-// as such, this function never fails.
+// https://html.spec.whatwg.org/multipage/webappapis.html#atob for details.
 // =======================================================================================
 // inline implementation details
@@ -200,7 +200,7 @@ inline EncodingResult<String> decodeCEscape(const char (&text)[s]) {
  return decodeCEscape(arrayPtr(text, s-1));
 }
 template <size_t s>
-Array<byte> decodeBase64(const char (&text)[s]) {
+EncodingResult<Array<byte>> decodeBase64(const char (&text)[s]) {
  return decodeBase64(arrayPtr(text, s - 1));
 }