Support all JSON escape codes (including \u) for parsing & text gen.

Bug: 16624362 Change-Id: Ia09ea404c0c11dd1dc6993a8cbd155bf8152b65f Tested: on Windows & Linux.

Support all JSON escape codes (including \u) for parsing & text gen.
Bug: 16624362 Change-Id: Ia09ea404c0c11dd1dc6993a8cbd155bf8152b65f Tested: on Windows & Linux.
ebac1e19 · Wouter van Oortmerssen · f7b0d130 · ebac1e19 · ebac1e19 · ebac1e19
Commit ebac1e19 authored Aug 20, 2014 by Wouter van Oortmerssen
8 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -35,3 +35,7 @@ flatsampletext
 snapshot.sh
 tests/go_gen
 CMakeLists.txt.user
+CMakeScripts/**
+build/Xcode/FlatBuffers.xcodeproj/project.xcworkspace/**
+build/Xcode/FlatBuffers.xcodeproj/xcuserdata/**
--- a/docs/html/md__schemas.html
+++ b/docs/html/md__schemas.html
@@ -144,6 +144,20 @@ root_type Monster;
 <li>It accepts field names with and without quotes, like many JSON parsers already do. It outputs them without quotes as well, though can be made to output them using the <code>strict_json</code> flag.</li>
 <li>If a field has an enum type, the parser will recognize symbolic enum values (with or without quotes) instead of numbers, e.g. <code>field: EnumVal</code>. If a field is of integral type, you can still use symbolic names, but values need to be prefixed with their type and need to be quoted, e.g. <code>field: "Enum.EnumVal"</code>. For enums representing flags, you may place multiple inside a string separated by spaces to OR them, e.g. <code>field: "EnumVal1 EnumVal2"</code> or <code>field: "Enum.EnumVal1 Enum.EnumVal2"</code>.</li>
 </ul>
+<p>When parsing JSON, it recognizes the following escape codes in strings:</p>
+<ul>
+<li><code>\n</code> - linefeed.</li>
+<li><code>\t</code> - tab.</li>
+<li><code>\r</code> - carriage return.</li>
+<li><code>\b</code> - backspace.</li>
+<li><code>\f</code> - form feed.</li>
+<li><code>\"</code> - double quote.</li>
+<li><code>\\</code> - backslash.</li>
+<li><code>\/</code> - forward slash.</li>
+<li><code>\uXXXX</code> - 16-bit unicode code point, converted to the equivalent UTF-8 representation.</li>
+<li><code>\xXX</code> - 8-bit binary hexadecimal number XX. This is the only one that is not in the JSON spec (see <a href="http://json.org/">http://json.org/</a>), but is needed to be able to encode arbitrary binary in strings to text and back without losing information (e.g. the byte 0xFF can't be represented in standard JSON).</li>
+</ul>
+<p>It also generates these escape codes back again when generating JSON from a binary representation.</p>
 <h2>Gotchas</h2>
 <h3>Schemas and version control</h3>
 <p>FlatBuffers relies on new field declarations being added at the end, and earlier declarations to not be removed, but be marked deprecated when needed. We think this is an improvement over the manual number assignment that happens in Protocol Buffers (and which is still an option using the <code>id</code> attribute mentioned above).</p>

--- a/docs/source/Schemas.md
+++ b/docs/source/Schemas.md
@@ -268,6 +268,26 @@ JSON:
    separated by spaces to OR them, e.g.
    `field: "EnumVal1 EnumVal2"` or `field: "Enum.EnumVal1 Enum.EnumVal2"`.
+When parsing JSON, it recognizes the following escape codes in strings:
+-   `\n` - linefeed.
+-   `\t` - tab.
+-   `\r` - carriage return.
+-   `\b` - backspace.
+-   `\f` - form feed.
+-   `\"` - double quote.
+-   `\\` - backslash.
+-   `\/` - forward slash.
+-   `\uXXXX` - 16-bit unicode code point, converted to the equivalent UTF-8
+    representation.
+-   `\xXX` - 8-bit binary hexadecimal number XX. This is the only one that is
+    not in the JSON spec (see http://json.org/), but is needed to be able to
+    encode arbitrary binary in strings to text and back without losing
+    information (e.g. the byte 0xFF can't be represented in standard JSON).
+It also generates these escape codes back again when generating JSON from a
+binary representation.
 ## Gotchas
 ### Schemas and version control

--- a/include/flatbuffers/idl.h
+++ b/include/flatbuffers/idl.h
@@ -276,6 +276,7 @@ class Parser {
  void MarkGenerated();
 private:
+  int64_t ParseHexNum(int nibbles);
  void Next();
  bool IsNext(int t);
  void Expect(int t);

--- a/include/flatbuffers/util.h
+++ b/include/flatbuffers/util.h
@@ -44,13 +44,11 @@ template<> inline std::string NumToString<unsigned char>(unsigned char t) {
 }
 // Convert an integer value to a hexadecimal string.
-// The returned string length is the number of nibbles in
+// The returned string length is always xdigits long, prefixed by 0 digits.
-// the supplied value prefixed by 0 digits.  For example,
+// For example, IntToStringHex(0x23, 8) returns the string "00000023".
-// IntToStringHex(static_cast<int>(0x23)) returns the
+inline std::string IntToStringHex(int i, int xdigits) {
-// string "00000023".
-template<typename T> std::string IntToStringHex(T i) {
  std::stringstream ss;
-  ss << std::setw(sizeof(T) * 2)
+  ss << std::setw(xdigits)
     << std::setfill('0')
     << std::hex
     << std::uppercase
@@ -59,11 +57,11 @@ template<typename T> std::string IntToStringHex(T i) {
 }
 // Portable implementation of strtoull().
-inline int64_t StringToInt(const char *str) {
+inline int64_t StringToInt(const char *str, int base = 10) {
  #ifdef _MSC_VER
-    return _strtoui64(str, nullptr, 10);
+    return _strtoui64(str, nullptr, base);
  #else
-    return strtoull(str, nullptr, 10);
+    return strtoull(str, nullptr, base);
  #endif
 }
@@ -126,6 +124,60 @@ inline std::string StripFileName(const std::string &filepath) {
  return i != std::string::npos ? filepath.substr(0, i + 1) : "";
 }
+// To and from UTF-8 unicode conversion functions
+// Convert a unicode code point into a UTF-8 representation by appending it
+// to a string. Returns the number of bytes generated (1 to 6), or -1 (after
+// asserting) if ucc has its top bit set and so fits none of the encodings.
+inline int ToUTF8(uint32_t ucc, std::string *out) {
+  assert(!(ucc & 0x80000000));  // Top bit can't be set.
+  // 6 possible encodings: http://en.wikipedia.org/wiki/UTF-8
+  for (int i = 0; i < 6; i++) {
+    // Max bits this encoding can represent: 7, 11, 16, 21, 26, 31.
+    uint32_t max_bits = 6 + i * 5 + static_cast<int>(!i);
+    // Shift an unsigned 1: max_bits reaches 31, and 1 << 31 overflows a
+    // signed int.
+    if (ucc < (1u << max_bits)) {  // does it fit?
+      // Remaining bits not encoded in the first byte, store 6 bits each
+      uint32_t remain_bits = i * 6;
+      // Store first byte: the low 8 bits of the shifted 0xFE supply the
+      // length marker (i + 1 leading 1 bits, none for single-byte codes).
+      (*out) += static_cast<char>((0xFE << (max_bits - remain_bits)) |
+                                 (ucc >> remain_bits));
+      // Store remaining bytes, most significant 6-bit group first:
+      for (int j = i - 1; j >= 0; j--) {
+        (*out) += static_cast<char>(((ucc >> (j * 6)) & 0x3F) | 0x80);
+      }
+      return i + 1;  // Return the number of bytes added.
+    }
+  }
+  assert(0);  // Only reachable if the top bit of ucc was set.
+  return -1;
+}
+// Converts whatever prefix of the incoming string corresponds to a valid
+// UTF-8 sequence into a unicode code. The incoming pointer will have been
+// advanced past all bytes parsed.
+// returns -1 upon corrupt UTF-8 encoding (ignore the incoming pointer in
+// this case). A byte that can only be a continuation byte (10xxxxxx) at the
+// start of the input counts as corrupt.
+inline int FromUTF8(const char **in) {
+  // Inspect the lead byte as unsigned: plain char may be signed, and
+  // left-shifting a negative value is undefined behaviour before C++20.
+  const unsigned char lead = static_cast<unsigned char>(**in);
+  int len = 0;
+  // Count leading 1 bits.
+  for (int mask = 0x80; mask >= 0x04; mask >>= 1) {
+    if (lead & mask) {
+      len++;
+    } else {
+      break;
+    }
+  }
+  if ((lead << len) & 0x80) return -1;  // Bit after leading 1's must be 0.
+  if (!len) return *(*in)++;
+  // Exactly one leading 1 bit marks a continuation byte, which cannot start
+  // a sequence.
+  if (len == 1) return -1;
+  // Grab initial bits of the code.
+  int ucc = static_cast<unsigned char>(*(*in)++) & ((1 << (7 - len)) - 1);
+  for (int i = 0; i < len - 1; i++) {
+    if ((**in & 0xC0) != 0x80) return -1;  // Upper bits must 1 0.
+    ucc <<= 6;
+    ucc |= *(*in)++ & 0x3F;  // Grab 6 more bits of the code.
+  }
+  return ucc;
+}
 }  // namespace flatbuffers
 #endif  // FLATBUFFERS_UTIL_H_
--- a/src/idl_gen_text.cpp
+++ b/src/idl_gen_text.cpp
@@ -28,8 +28,12 @@ static void GenStruct(const StructDef &struct_def, const Table *table,
 // If indentation is less than 0, that indicates we don't want any newlines
 // either.
-const char *NewLine(int indent_step) {
+const char *NewLine(const GeneratorOptions &opts) {
-  return indent_step >= 0 ? "\n" : "";
+  return opts.indent_step >= 0 ? "\n" : "";
+}
+int Indent(const GeneratorOptions &opts) {
+  return std::max(opts.indent_step, 0);
 }
 // Output an identifier with or without quotes depending on strictness.
@@ -65,21 +69,21 @@ template<typename T> void PrintVector(const Vector<T> &v, Type type,
                                      std::string *_text) {
  std::string &text = *_text;
  text += "[";
-  text += NewLine(opts.indent_step);
+  text += NewLine(opts);
  for (uoffset_t i = 0; i < v.Length(); i++) {
    if (i) {
      text += ",";
-      text += NewLine(opts.indent_step);
+      text += NewLine(opts);
    }
-    text.append(indent + opts.indent_step, ' ');
+    text.append(indent + Indent(opts), ' ');
    if (IsStruct(type))
      Print(v.GetStructFromOffset(i * type.struct_def->bytesize), type,
-            indent + opts.indent_step, nullptr, opts, _text);
+            indent + Indent(opts), nullptr, opts, _text);
    else
-      Print(v.Get(i), type, indent + opts.indent_step, nullptr,
+      Print(v.Get(i), type, indent + Indent(opts), nullptr,
            opts, _text);
  }
-  text += NewLine(opts.indent_step);
+  text += NewLine(opts);
  text.append(indent, ' ');
  text += "]";
 }
@@ -93,15 +97,28 @@ static void EscapeString(const String &s, std::string *_text) {
      case '\n': text += "\\n"; break;
      case '\t': text += "\\t"; break;
      case '\r': text += "\\r"; break;
+      case '\b': text += "\\b"; break;
+      case '\f': text += "\\f"; break;
      case '\"': text += "\\\""; break;
      case '\\': text += "\\\\"; break;
      default:
        if (c >= ' ' && c <= '~') {
          text += c;
        } else {
-          auto u = static_cast<unsigned char>(c);
+          // Not printable ASCII data. Let's see if it's valid UTF-8 first:
-          text += "\\x";
+          const char *utf8 = s.c_str() + i;
-          text += IntToStringHex(u);
+          int ucc = FromUTF8(&utf8);
+          if (ucc >= 0x80 && ucc <= 0xFFFF) {
+            // Parses as Unicode within JSON's \uXXXX range, so use that.
+            text += "\\u";
+            text += IntToStringHex(ucc, 4);
+            i = utf8 - s.c_str() - 1;  // Skip past characters recognized.
+          } else {
+            // It's either unprintable ASCII, arbitrary binary, or Unicode data
+            // that doesn't fit \uXXXX, so use \xXX escape code instead.
+            text += "\\x";
+            text += IntToStringHex(static_cast<uint8_t>(c), 2);
+          }
        }
        break;
    }
@@ -202,15 +219,15 @@ static void GenStruct(const StructDef &struct_def, const Table *table,
      if (fieldout++) {
        text += ",";
      }
-      text += NewLine(opts.indent_step);
+      text += NewLine(opts);
-      text.append(indent + opts.indent_step, ' ');
+      text.append(indent + Indent(opts), ' ');
      OutputIdentifier(fd.name, opts, _text);
      text += ": ";
      switch (fd.value.type.base_type) {
         #define FLATBUFFERS_TD(ENUM, IDLTYPE, CTYPE, JTYPE, GTYPE) \
           case BASE_TYPE_ ## ENUM: \
              GenField<CTYPE>(fd, table, struct_def.fixed, \
-                              opts, indent + opts.indent_step, _text); \
+                              opts, indent + Indent(opts), _text); \
              break;
          FLATBUFFERS_GEN_TYPES_SCALAR(FLATBUFFERS_TD)
        #undef FLATBUFFERS_TD
@@ -219,7 +236,7 @@ static void GenStruct(const StructDef &struct_def, const Table *table,
          case BASE_TYPE_ ## ENUM:
          FLATBUFFERS_GEN_TYPES_POINTER(FLATBUFFERS_TD)
        #undef FLATBUFFERS_TD
-            GenFieldOffset(fd, table, struct_def.fixed, indent + opts.indent_step,
+            GenFieldOffset(fd, table, struct_def.fixed, indent + Indent(opts),
                           union_sd, opts, _text);
            break;
      }
@@ -231,7 +248,7 @@ static void GenStruct(const StructDef &struct_def, const Table *table,
      }
    }
  }
-  text += NewLine(opts.indent_step);
+  text += NewLine(opts);
  text.append(indent, ' ');
  text += "}";
 }
@@ -247,7 +264,7 @@ void GenerateText(const Parser &parser, const void *flatbuffer,
            0,
            opts,
            _text);
-  text += NewLine(opts.indent_step);
+  text += NewLine(opts);
 }
 }  // namespace flatbuffers

--- a/src/idl_parser.cpp
+++ b/src/idl_parser.cpp
@@ -115,6 +115,17 @@ static std::string TokenToString(int t) {
  }
 }
+// Parses exactly nibbles worth of hex digits starting at cursor_ into a
+// number and advances cursor_ past them. Calls Error() if any of those
+// characters is not a hex digit.
+int64_t Parser::ParseHexNum(int nibbles) {
+  for (int i = 0; i < nibbles; i++) {
+    // isxdigit() needs a value representable as unsigned char, so cast in
+    // case plain char is signed.
+    if (!isxdigit(static_cast<unsigned char>(cursor_[i])))
+      Error("escape code must be followed by " + NumToString(nibbles) +
+            " hex digits");
+  }
+  // Convert only the digits that belong to this escape code. Passing cursor_
+  // itself would also consume any hex digits that follow the escape, e.g.
+  // "\x41BC" would be read as 0x41BC rather than 0x41 followed by "BC".
+  // NOTE(review): assumes Error() does not return, so all nibbles characters
+  // are known to be hex digits here - confirm.
+  std::string target(cursor_, cursor_ + nibbles);
+  auto val = StringToInt(target.c_str(), 16);
+  cursor_ += nibbles;
+  return val;
+}
 void Parser::Next() {
  doc_comment_.clear();
  bool seen_newline = false;
@@ -142,8 +153,21 @@ void Parser::Next() {
              case 'n':  attribute_ += '\n'; cursor_++; break;
              case 't':  attribute_ += '\t'; cursor_++; break;
              case 'r':  attribute_ += '\r'; cursor_++; break;
+              case 'b':  attribute_ += '\b'; cursor_++; break;
+              case 'f':  attribute_ += '\f'; cursor_++; break;
              case '\"': attribute_ += '\"'; cursor_++; break;
              case '\\': attribute_ += '\\'; cursor_++; break;
+              case '/':  attribute_ += '/';  cursor_++; break;
+              case 'x': {  // Not in the JSON standard
+                cursor_++;
+                attribute_ += static_cast<char>(ParseHexNum(2));
+                break;
+              }
+              case 'u': {
+                cursor_++;
+                ToUTF8(static_cast<int>(ParseHexNum(4)), &attribute_);
+                break;
+              }
              default: Error("unknown escape code in string constant"); break;
            }
          } else { // printable chars + UTF-8 bytes

--- a/tests/test.cpp
+++ b/tests/test.cpp
@@ -516,7 +516,19 @@ void EnumStringsTest() {
                        "{ F:[ \"E.C\", \"E.A E.B E.C\" ] }", ""), true);
 }
+// Parses a JSON string containing \uXXXX and \xXX escape codes, regenerates
+// text from the resulting buffer, and checks the same escape codes come back.
+void UnicodeTest() {
+  const char *schema_and_json =
+      "table T { F:string; }"
+      "root_type T;"
+      "{ F:\"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
+      "\\u5225\\u30B5\\u30A4\\u30C8\\x01\\x80\" }";
+  const char *expected_text =
+      "{F: \"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
+      "\\u5225\\u30B5\\u30A4\\u30C8\\x01\\x80\"}";
+  flatbuffers::Parser parser;
+  TEST_EQ(parser.Parse(schema_and_json, ""), true);
+  // A negative indent step makes the generator emit no newlines.
+  flatbuffers::GeneratorOptions text_opts;
+  text_opts.indent_step = -1;
+  std::string generated;
+  GenerateText(parser, parser.builder_.GetBufferPointer(), text_opts,
+               &generated);
+  TEST_EQ(generated == expected_text, true);
+}
 int main(int /*argc*/, const char * /*argv*/[]) {
  // Run our various test suites:
@@ -534,6 +546,7 @@ int main(int /*argc*/, const char * /*argv*/[]) {
  ErrorTest();
  ScientificTest();
  EnumStringsTest();
+  UnicodeTest();
  if (!testing_fails) {
    TEST_OUTPUT_LINE("ALL TESTS PASSED");