Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
F
flatbuffers
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
flatbuffers
Commits
867dfc59
Commit
867dfc59
authored
Aug 02, 2016
by
TGIshib
Browse files
Options
Browse Files
Download
Plain Diff
Merge remote-tracking branch 'refs/remotes/google/master'
parents
52acb4b3
73d5bf46
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
193 additions
and
14 deletions
+193
-14
idl.h
include/flatbuffers/idl.h
+2
-0
util.h
include/flatbuffers/util.h
+30
-0
flatc.cpp
src/flatc.cpp
+5
-0
idl_gen_text.cpp
src/idl_gen_text.cpp
+26
-11
idl_parser.cpp
src/idl_parser.cpp
+14
-0
test.cpp
tests/test.cpp
+116
-3
No files found.
include/flatbuffers/idl.h
View file @
867dfc59
...
...
@@ -348,6 +348,7 @@ struct IDLOptions {
bool
escape_proto_identifiers
;
bool
generate_object_based_api
;
bool
union_value_namespacing
;
bool
allow_non_utf8
;
// Possible options for the more general generator below.
enum
Language
{
kJava
,
kCSharp
,
kGo
,
kMAX
};
...
...
@@ -370,6 +371,7 @@ struct IDLOptions {
escape_proto_identifiers
(
false
),
generate_object_based_api
(
false
),
union_value_namespacing
(
true
),
allow_non_utf8
(
false
),
lang
(
IDLOptions
::
kJava
)
{}
};
...
...
include/flatbuffers/util.h
View file @
867dfc59
...
...
@@ -276,6 +276,10 @@ inline int FromUTF8(const char **in) {
}
if
((
**
in
<<
len
)
&
0x80
)
return
-
1
;
// Bit after leading 1's must be 0.
if
(
!
len
)
return
*
(
*
in
)
++
;
// UTF-8 encoded values with a length are between 2 and 4 bytes.
if
(
len
<
2
||
len
>
4
)
{
return
-
1
;
}
// Grab initial bits of the code.
int
ucc
=
*
(
*
in
)
++
&
((
1
<<
(
7
-
len
))
-
1
);
for
(
int
i
=
0
;
i
<
len
-
1
;
i
++
)
{
...
...
@@ -283,6 +287,32 @@ inline int FromUTF8(const char **in) {
ucc
<<=
6
;
ucc
|=
*
(
*
in
)
++
&
0x3F
;
// Grab 6 more bits of the code.
}
// UTF-8 cannot encode values between 0xD800 and 0xDFFF (reserved for
// UTF-16 surrogate pairs).
if
(
ucc
>=
0xD800
&&
ucc
<=
0xDFFF
)
{
return
-
1
;
}
// UTF-8 must represent code points in their shortest possible encoding.
switch
(
len
)
{
case
2
:
// Two bytes of UTF-8 can represent code points from U+0080 to U+07FF.
if
(
ucc
<
0x0080
||
ucc
>
0x07FF
)
{
return
-
1
;
}
break
;
case
3
:
// Three bytes of UTF-8 can represent code points from U+0800 to U+FFFF.
if
(
ucc
<
0x0800
||
ucc
>
0xFFFF
)
{
return
-
1
;
}
break
;
case
4
:
// Four bytes of UTF-8 can represent code points from U+10000 to U+10FFFF.
if
(
ucc
<
0x10000
||
ucc
>
0x10FFFF
)
{
return
-
1
;
}
break
;
}
return
ucc
;
}
...
...
src/flatc.cpp
View file @
867dfc59
...
...
@@ -106,6 +106,9 @@ static void Error(const std::string &err, bool usage, bool show_exe_name) {
" --version Print the version number of flatc and exit.
\n
"
" --strict-json Strict JSON: field names must be / will be quoted,
\n
"
" no trailing commas in tables/vectors.
\n
"
" --allow-non-utf8 Pass non-UTF-8 input through parser and emit nonstandard
\n
"
"
\\
x escapes in JSON. (Default is to raise parse error on
\n
"
" non-UTF-8 input.)
\n
"
" --defaults-json Output fields whose value is the default when
\n
"
" writing JSON
\n
"
" --unknown-json Allow fields in JSON that are not defined in the
\n
"
...
...
@@ -184,6 +187,8 @@ int main(int argc, const char *argv[]) {
conform_to_schema
=
argv
[
argi
];
}
else
if
(
arg
==
"--strict-json"
)
{
opts
.
strict_json
=
true
;
}
else
if
(
arg
==
"--allow-non-utf8"
)
{
opts
.
allow_non_utf8
=
true
;
}
else
if
(
arg
==
"--no-js-exports"
)
{
opts
.
skip_js_exports
=
true
;
}
else
if
(
arg
==
"--defaults-json"
)
{
...
...
src/idl_gen_text.cpp
View file @
867dfc59
...
...
@@ -93,7 +93,7 @@ template<typename T> void PrintVector(const Vector<T> &v, Type type,
text
+=
"]"
;
}
static
void
EscapeString
(
const
String
&
s
,
std
::
string
*
_text
)
{
static
void
EscapeString
(
const
String
&
s
,
std
::
string
*
_text
,
const
IDLOptions
&
opts
)
{
std
::
string
&
text
=
*
_text
;
text
+=
"
\"
"
;
for
(
uoffset_t
i
=
0
;
i
<
s
.
size
();
i
++
)
{
...
...
@@ -113,17 +113,32 @@ static void EscapeString(const String &s, std::string *_text) {
// Not printable ASCII data. Let's see if it's valid UTF-8 first:
const
char
*
utf8
=
s
.
c_str
()
+
i
;
int
ucc
=
FromUTF8
(
&
utf8
);
if
(
ucc
>=
0x80
&&
ucc
<=
0xFFFF
)
{
// Parses as Unicode within JSON's \uXXXX range, so use that.
text
+=
"
\\
u"
;
text
+=
IntToStringHex
(
ucc
,
4
);
if
(
ucc
<
0
)
{
if
(
opts
.
allow_non_utf8
)
{
text
+=
"
\\
x"
;
text
+=
IntToStringHex
(
static_cast
<
uint8_t
>
(
c
),
2
);
}
else
{
// We previously checked for non-UTF-8 and returned a parse error,
// so we shouldn't reach here.
assert
(
0
);
}
}
else
{
if
(
ucc
<=
0xFFFF
)
{
// Parses as Unicode within JSON's \uXXXX range, so use that.
text
+=
"
\\
u"
;
text
+=
IntToStringHex
(
ucc
,
4
);
}
else
if
(
ucc
<=
0x10FFFF
)
{
// Encode Unicode SMP values to a surrogate pair using two \u escapes.
uint32_t
base
=
ucc
-
0x10000
;
uint16_t
highSurrogate
=
(
base
>>
10
)
+
0xD800
;
uint16_t
lowSurrogate
=
(
base
&
0x03FF
)
+
0xDC00
;
text
+=
"
\\
u"
;
text
+=
IntToStringHex
(
highSurrogate
,
4
);
text
+=
"
\\
u"
;
text
+=
IntToStringHex
(
lowSurrogate
,
4
);
}
// Skip past characters recognized.
i
=
static_cast
<
uoffset_t
>
(
utf8
-
s
.
c_str
()
-
1
);
}
else
{
// It's either unprintable ASCII, arbitrary binary, or Unicode data
// that doesn't fit \uXXXX, so use \xXX escape code instead.
text
+=
"
\\
x"
;
text
+=
IntToStringHex
(
static_cast
<
uint8_t
>
(
c
),
2
);
}
}
break
;
...
...
@@ -157,7 +172,7 @@ template<> void Print<const void *>(const void *val,
_text
);
break
;
case
BASE_TYPE_STRING
:
{
EscapeString
(
*
reinterpret_cast
<
const
String
*>
(
val
),
_text
);
EscapeString
(
*
reinterpret_cast
<
const
String
*>
(
val
),
_text
,
opts
);
break
;
}
case
BASE_TYPE_VECTOR
:
...
...
src/idl_parser.cpp
View file @
867dfc59
...
...
@@ -61,6 +61,17 @@ static_assert(BASE_TYPE_UNION ==
#define NEXT() ECHECK(Next())
#define EXPECT(tok) ECHECK(Expect(tok))
// Reports whether `str` consists solely of byte sequences that FromUTF8
// accepts. An empty string is considered valid.
static bool ValidateUTF8(const std::string &str) {
  const char *cursor = str.c_str();
  const char *const limit = cursor + str.size();
  for (;;) {
    // Every sequence up to the end decoded cleanly.
    if (cursor >= limit) return true;
    // FromUTF8 moves `cursor` forward as it consumes bytes and yields a
    // negative value for a sequence it rejects.
    if (FromUTF8(&cursor) < 0) return false;
  }
}
CheckedError
Parser
::
Error
(
const
std
::
string
&
msg
)
{
error_
=
file_being_parsed_
.
length
()
?
AbsolutePath
(
file_being_parsed_
)
:
""
;
#ifdef _WIN32
...
...
@@ -320,6 +331,9 @@ CheckedError Parser::Next() {
"illegal Unicode sequence (unpaired high surrogate)"
);
}
cursor_
++
;
if
(
!
opts
.
allow_non_utf8
&&
!
ValidateUTF8
(
attribute_
))
{
return
Error
(
"illegal UTF-8 sequence"
);
}
token_
=
kTokenStringConstant
;
return
NoError
();
}
...
...
tests/test.cpp
View file @
867dfc59
...
...
@@ -978,15 +978,36 @@ void IntegerOutOfRangeTest() {
void
UnicodeTest
()
{
flatbuffers
::
Parser
parser
;
// Without setting allow_non_utf8 = true, we treat \x sequences as byte sequences
// which are then validated as UTF-8.
TEST_EQ
(
parser
.
Parse
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\\
u20AC
\\
u00A2
\\
u30E6
\\
u30FC
\\
u30B6
\\
u30FC"
"
\\
u5225
\\
u30B5
\\
u30A4
\\
u30C8
\\
x01
\\
x80
\"
}"
),
true
);
"
\\
u5225
\\
u30B5
\\
u30A4
\\
u30C8
\\
xE2
\\
x82
\\
xAC
\\
u0080
\\
uD83D
\\
uDE0E
\"
}"
),
true
);
std
::
string
jsongen
;
parser
.
opts
.
indent_step
=
-
1
;
GenerateText
(
parser
,
parser
.
builder_
.
GetBufferPointer
(),
&
jsongen
);
TEST_EQ
(
jsongen
==
"{F:
\"\\
u20AC
\\
u00A2
\\
u30E6
\\
u30FC
\\
u30B6
\\
u30FC"
"
\\
u5225
\\
u30B5
\\
u30A4
\\
u30C8
\\
x01
\\
x80
\"
}"
,
true
);
TEST_EQ
(
jsongen
,
std
::
string
(
"{F:
\"\\
u20AC
\\
u00A2
\\
u30E6
\\
u30FC
\\
u30B6
\\
u30FC"
"
\\
u5225
\\
u30B5
\\
u30A4
\\
u30C8
\\
u20AC
\\
u0080
\\
uD83D
\\
uDE0E
\"
}"
));
}
void
UnicodeTestAllowNonUTF8
()
{
flatbuffers
::
Parser
parser
;
parser
.
opts
.
allow_non_utf8
=
true
;
TEST_EQ
(
parser
.
Parse
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\\
u20AC
\\
u00A2
\\
u30E6
\\
u30FC
\\
u30B6
\\
u30FC"
"
\\
u5225
\\
u30B5
\\
u30A4
\\
u30C8
\\
x01
\\
x80
\\
u0080
\\
uD83D
\\
uDE0E
\"
}"
),
true
);
std
::
string
jsongen
;
parser
.
opts
.
indent_step
=
-
1
;
GenerateText
(
parser
,
parser
.
builder_
.
GetBufferPointer
(),
&
jsongen
);
TEST_EQ
(
jsongen
,
std
::
string
(
"{F:
\"\\
u20AC
\\
u00A2
\\
u30E6
\\
u30FC
\\
u30B6
\\
u30FC"
"
\\
u5225
\\
u30B5
\\
u30A4
\\
u30C8
\\
u0001
\\
x80
\\
u0080
\\
uD83D
\\
uDE0E
\"
}"
));
}
void
UnicodeSurrogatesTest
()
{
...
...
@@ -1027,6 +1048,96 @@ void UnicodeInvalidSurrogatesTest() {
"{ F:
\"\\
uDC00
\"
}"
,
"unpaired low surrogate"
);
}
void
InvalidUTF8Test
()
{
// "1 byte" pattern, under min length of 2 bytes
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\x80\"
}"
,
"illegal UTF-8 sequence"
);
// 2 byte pattern, string too short
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xDF\"
}"
,
"illegal UTF-8 sequence"
);
// 3 byte pattern, string too short
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xEF\xBF\"
}"
,
"illegal UTF-8 sequence"
);
// 4 byte pattern, string too short
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xF7\xBF\xBF\"
}"
,
"illegal UTF-8 sequence"
);
// "5 byte" pattern, string too short
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xFB\xBF\xBF\xBF\"
}"
,
"illegal UTF-8 sequence"
);
// "6 byte" pattern, string too short
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xFD\xBF\xBF\xBF\xBF\"
}"
,
"illegal UTF-8 sequence"
);
// "7 byte" pattern, string too short
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xFE\xBF\xBF\xBF\xBF\xBF\"
}"
,
"illegal UTF-8 sequence"
);
// "5 byte" pattern, over max length of 4 bytes
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xFB\xBF\xBF\xBF\xBF\"
}"
,
"illegal UTF-8 sequence"
);
// "6 byte" pattern, over max length of 4 bytes
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xFD\xBF\xBF\xBF\xBF\xBF\"
}"
,
"illegal UTF-8 sequence"
);
// "7 byte" pattern, over max length of 4 bytes
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xFE\xBF\xBF\xBF\xBF\xBF\xBF\"
}"
,
"illegal UTF-8 sequence"
);
// Three invalid encodings for U+000A (\n, aka NEWLINE)
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xC0\x8A\"
}"
,
"illegal UTF-8 sequence"
);
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xE0\x80\x8A\"
}"
,
"illegal UTF-8 sequence"
);
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xF0\x80\x80\x8A\"
}"
,
"illegal UTF-8 sequence"
);
// Two invalid encodings for U+00A9 (COPYRIGHT SYMBOL)
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xE0\x81\xA9\"
}"
,
"illegal UTF-8 sequence"
);
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xF0\x80\x81\xA9\"
}"
,
"illegal UTF-8 sequence"
);
// Invalid encoding for U+20AC (EURO SYMBOL)
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xF0\x82\x82\xAC\"
}"
,
"illegal UTF-8 sequence"
);
// UTF-16 surrogate values between U+D800 and U+DFFF cannot be encoded in UTF-8
TestError
(
"table T { F:string; }"
"root_type T;"
// U+10400 "encoded" as U+D801 U+DC00
"{ F:
\"\xED\xA0\x81\xED\xB0\x80\"
}"
,
"illegal UTF-8 sequence"
);
}
void
UnknownFieldsTest
()
{
flatbuffers
::
IDLOptions
opts
;
opts
.
skip_unexpected_fields_in_json
=
true
;
...
...
@@ -1105,8 +1216,10 @@ int main(int /*argc*/, const char * /*argv*/[]) {
EnumStringsTest
();
IntegerOutOfRangeTest
();
UnicodeTest
();
UnicodeTestAllowNonUTF8
();
UnicodeSurrogatesTest
();
UnicodeInvalidSurrogatesTest
();
InvalidUTF8Test
();
UnknownFieldsTest
();
ParseUnionTest
();
ConformTest
();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment