Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
F
flatbuffers
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
flatbuffers
Commits
f6416d84
Commit
f6416d84
authored
Aug 01, 2016
by
Ben Hamilton
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Validate UTF-8 by default when parsing IDL. Support Unicode values > U+FFFF in parse
parent
d70f5ac6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
193 additions
and
14 deletions
+193
-14
idl.h
include/flatbuffers/idl.h
+2
-0
util.h
include/flatbuffers/util.h
+30
-0
flatc.cpp
src/flatc.cpp
+5
-0
idl_gen_text.cpp
src/idl_gen_text.cpp
+26
-11
idl_parser.cpp
src/idl_parser.cpp
+14
-0
test.cpp
tests/test.cpp
+116
-3
No files found.
include/flatbuffers/idl.h
View file @
f6416d84
...
...
@@ -348,6 +348,7 @@ struct IDLOptions {
bool
escape_proto_identifiers
;
bool
generate_object_based_api
;
bool
union_value_namespacing
;
bool
allow_non_utf8
;
// Possible options for the more general generator below.
enum
Language
{
kJava
,
kCSharp
,
kGo
,
kMAX
};
...
...
@@ -370,6 +371,7 @@ struct IDLOptions {
escape_proto_identifiers
(
false
),
generate_object_based_api
(
false
),
union_value_namespacing
(
true
),
allow_non_utf8
(
false
),
lang
(
IDLOptions
::
kJava
)
{}
};
...
...
include/flatbuffers/util.h
View file @
f6416d84
...
...
@@ -276,6 +276,10 @@ inline int FromUTF8(const char **in) {
}
if
((
**
in
<<
len
)
&
0x80
)
return
-
1
;
// Bit after leading 1's must be 0.
if
(
!
len
)
return
*
(
*
in
)
++
;
// UTF-8 encoded values with a length are between 2 and 4 bytes.
if
(
len
<
2
||
len
>
4
)
{
return
-
1
;
}
// Grab initial bits of the code.
int
ucc
=
*
(
*
in
)
++
&
((
1
<<
(
7
-
len
))
-
1
);
for
(
int
i
=
0
;
i
<
len
-
1
;
i
++
)
{
...
...
@@ -283,6 +287,32 @@ inline int FromUTF8(const char **in) {
ucc
<<=
6
;
ucc
|=
*
(
*
in
)
++
&
0x3F
;
// Grab 6 more bits of the code.
}
// UTF-8 cannot encode values between 0xD800 and 0xDFFF (reserved for
// UTF-16 surrogate pairs).
if
(
ucc
>=
0xD800
&&
ucc
<=
0xDFFF
)
{
return
-
1
;
}
// UTF-8 must represent code points in their shortest possible encoding.
switch
(
len
)
{
case
2
:
// Two bytes of UTF-8 can represent code points from U+0080 to U+07FF.
if
(
ucc
<
0x0080
||
ucc
>
0x07FF
)
{
return
-
1
;
}
break
;
case
3
:
// Three bytes of UTF-8 can represent code points from U+0800 to U+FFFF.
if
(
ucc
<
0x0800
||
ucc
>
0xFFFF
)
{
return
-
1
;
}
break
;
case
4
:
// Four bytes of UTF-8 can represent code points from U+10000 to U+10FFFF.
if
(
ucc
<
0x10000
||
ucc
>
0x10FFFF
)
{
return
-
1
;
}
break
;
}
return
ucc
;
}
...
...
src/flatc.cpp
View file @
f6416d84
...
...
@@ -106,6 +106,9 @@ static void Error(const std::string &err, bool usage, bool show_exe_name) {
" --version Print the version number of flatc and exit.
\n
"
" --strict-json Strict JSON: field names must be / will be quoted,
\n
"
" no trailing commas in tables/vectors.
\n
"
" --allow-non-utf8 Pass non-UTF-8 input through parser and emit nonstandard
\n
"
"
\\
x escapes in JSON. (Default is to raise parse error on
\n
"
" non-UTF-8 input.)
\n
"
" --defaults-json Output fields whose value is the default when
\n
"
" writing JSON
\n
"
" --unknown-json Allow fields in JSON that are not defined in the
\n
"
...
...
@@ -184,6 +187,8 @@ int main(int argc, const char *argv[]) {
conform_to_schema
=
argv
[
argi
];
}
else
if
(
arg
==
"--strict-json"
)
{
opts
.
strict_json
=
true
;
}
else
if
(
arg
==
"--allow-non-utf8"
)
{
opts
.
allow_non_utf8
=
true
;
}
else
if
(
arg
==
"--no-js-exports"
)
{
opts
.
skip_js_exports
=
true
;
}
else
if
(
arg
==
"--defaults-json"
)
{
...
...
src/idl_gen_text.cpp
View file @
f6416d84
...
...
@@ -93,7 +93,7 @@ template<typename T> void PrintVector(const Vector<T> &v, Type type,
text
+=
"]"
;
}
static
void
EscapeString
(
const
String
&
s
,
std
::
string
*
_text
)
{
static
void
EscapeString
(
const
String
&
s
,
std
::
string
*
_text
,
const
IDLOptions
&
opts
)
{
std
::
string
&
text
=
*
_text
;
text
+=
"
\"
"
;
for
(
uoffset_t
i
=
0
;
i
<
s
.
size
();
i
++
)
{
...
...
@@ -113,17 +113,32 @@ static void EscapeString(const String &s, std::string *_text) {
// Not printable ASCII data. Let's see if it's valid UTF-8 first:
const
char
*
utf8
=
s
.
c_str
()
+
i
;
int
ucc
=
FromUTF8
(
&
utf8
);
if
(
ucc
>=
0x80
&&
ucc
<=
0xFFFF
)
{
// Parses as Unicode within JSON's \uXXXX range, so use that.
text
+=
"
\\
u"
;
text
+=
IntToStringHex
(
ucc
,
4
);
if
(
ucc
<
0
)
{
if
(
opts
.
allow_non_utf8
)
{
text
+=
"
\\
x"
;
text
+=
IntToStringHex
(
static_cast
<
uint8_t
>
(
c
),
2
);
}
else
{
// We previously checked for non-UTF-8 and returned a parse error,
// so we shouldn't reach here.
assert
(
0
);
}
}
else
{
if
(
ucc
<=
0xFFFF
)
{
// Parses as Unicode within JSON's \uXXXX range, so use that.
text
+=
"
\\
u"
;
text
+=
IntToStringHex
(
ucc
,
4
);
}
else
if
(
ucc
<=
0x10FFFF
)
{
// Encode Unicode SMP values to a surrogate pair using two \u escapes.
uint32_t
base
=
ucc
-
0x10000
;
uint16_t
highSurrogate
=
(
base
>>
10
)
+
0xD800
;
uint16_t
lowSurrogate
=
(
base
&
0x03FF
)
+
0xDC00
;
text
+=
"
\\
u"
;
text
+=
IntToStringHex
(
highSurrogate
,
4
);
text
+=
"
\\
u"
;
text
+=
IntToStringHex
(
lowSurrogate
,
4
);
}
// Skip past characters recognized.
i
=
static_cast
<
uoffset_t
>
(
utf8
-
s
.
c_str
()
-
1
);
}
else
{
// It's either unprintable ASCII, arbitrary binary, or Unicode data
// that doesn't fit \uXXXX, so use \xXX escape code instead.
text
+=
"
\\
x"
;
text
+=
IntToStringHex
(
static_cast
<
uint8_t
>
(
c
),
2
);
}
}
break
;
...
...
@@ -157,7 +172,7 @@ template<> void Print<const void *>(const void *val,
_text
);
break
;
case
BASE_TYPE_STRING
:
{
EscapeString
(
*
reinterpret_cast
<
const
String
*>
(
val
),
_text
);
EscapeString
(
*
reinterpret_cast
<
const
String
*>
(
val
),
_text
,
opts
);
break
;
}
case
BASE_TYPE_VECTOR
:
...
...
src/idl_parser.cpp
View file @
f6416d84
...
...
@@ -61,6 +61,17 @@ static_assert(BASE_TYPE_UNION ==
#define NEXT() ECHECK(Next())
#define EXPECT(tok) ECHECK(Expect(tok))
// Reports whether `str` decodes cleanly as UTF-8.
// The string is walked from its first byte up to str.length(); each step
// hands the cursor to FromUTF8, and a negative result from that call means
// the sequence at the cursor was rejected, so validation fails immediately.
static bool ValidateUTF8(const std::string &str) {
  const char *cursor = &str[0];
  const char *const limit = cursor + str.length();
  for (; cursor < limit;) {
    // FromUTF8 takes the cursor by address and moves it past the bytes it
    // consumed.
    const int code_point = FromUTF8(&cursor);
    if (code_point < 0) return false;
  }
  return true;
}
CheckedError
Parser
::
Error
(
const
std
::
string
&
msg
)
{
error_
=
file_being_parsed_
.
length
()
?
AbsolutePath
(
file_being_parsed_
)
:
""
;
#ifdef _WIN32
...
...
@@ -320,6 +331,9 @@ CheckedError Parser::Next() {
"illegal Unicode sequence (unpaired high surrogate)"
);
}
cursor_
++
;
if
(
!
opts
.
allow_non_utf8
&&
!
ValidateUTF8
(
attribute_
))
{
return
Error
(
"illegal UTF-8 sequence"
);
}
token_
=
kTokenStringConstant
;
return
NoError
();
}
...
...
tests/test.cpp
View file @
f6416d84
...
...
@@ -978,15 +978,36 @@ void IntegerOutOfRangeTest() {
void
UnicodeTest
()
{
flatbuffers
::
Parser
parser
;
// Without setting allow_non_utf8 = true, we treat \x sequences as byte sequences
// which are then validated as UTF-8.
TEST_EQ
(
parser
.
Parse
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\\
u20AC
\\
u00A2
\\
u30E6
\\
u30FC
\\
u30B6
\\
u30FC"
"
\\
u5225
\\
u30B5
\\
u30A4
\\
u30C8
\\
x01
\\
x80
\"
}"
),
true
);
"
\\
u5225
\\
u30B5
\\
u30A4
\\
u30C8
\\
xE2
\\
x82
\\
xAC
\\
u0080
\\
uD83D
\\
uDE0E
\"
}"
),
true
);
std
::
string
jsongen
;
parser
.
opts
.
indent_step
=
-
1
;
GenerateText
(
parser
,
parser
.
builder_
.
GetBufferPointer
(),
&
jsongen
);
TEST_EQ
(
jsongen
==
"{F:
\"\\
u20AC
\\
u00A2
\\
u30E6
\\
u30FC
\\
u30B6
\\
u30FC"
"
\\
u5225
\\
u30B5
\\
u30A4
\\
u30C8
\\
x01
\\
x80
\"
}"
,
true
);
TEST_EQ
(
jsongen
,
std
::
string
(
"{F:
\"\\
u20AC
\\
u00A2
\\
u30E6
\\
u30FC
\\
u30B6
\\
u30FC"
"
\\
u5225
\\
u30B5
\\
u30A4
\\
u30C8
\\
u20AC
\\
u0080
\\
uD83D
\\
uDE0E
\"
}"
));
}
// Round-trip test for the parser with opts.allow_non_utf8 switched on.
// The JSON input mixes \u escapes (including the surrogate pair
// \uD83D\uDE0E) with the raw byte escapes \x01 and \x80. After parsing,
// text is generated from the built buffer and compared with an expected
// string in which \x01 comes back as \u0001 while \x80 is kept as \x80.
// NOTE(review): TEST_EQ and GenerateText are defined outside this view.
void
UnicodeTestAllowNonUTF8
()
{
flatbuffers
::
Parser
parser
;
// Opt in to non-UTF-8 string contents before parsing.
parser
.
opts
.
allow_non_utf8
=
true
;
// Parsing is expected to succeed (Parse(...) compared against true).
TEST_EQ
(
parser
.
Parse
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\\
u20AC
\\
u00A2
\\
u30E6
\\
u30FC
\\
u30B6
\\
u30FC"
"
\\
u5225
\\
u30B5
\\
u30A4
\\
u30C8
\\
x01
\\
x80
\\
u0080
\\
uD83D
\\
uDE0E
\"
}"
),
true
);
// Generate text from the parsed buffer with indent_step set to -1.
std
::
string
jsongen
;
parser
.
opts
.
indent_step
=
-
1
;
GenerateText
(
parser
,
parser
.
builder_
.
GetBufferPointer
(),
&
jsongen
);
// The generated text must match the expected escaped form.
TEST_EQ
(
jsongen
,
std
::
string
(
"{F:
\"\\
u20AC
\\
u00A2
\\
u30E6
\\
u30FC
\\
u30B6
\\
u30FC"
"
\\
u5225
\\
u30B5
\\
u30A4
\\
u30C8
\\
u0001
\\
x80
\\
u0080
\\
uD83D
\\
uDE0E
\"
}"
));
}
void
UnicodeSurrogatesTest
()
{
...
...
@@ -1027,6 +1048,96 @@ void UnicodeInvalidSurrogatesTest() {
"{ F:
\"\\
uDC00
\"
}"
,
"unpaired low surrogate"
);
}
// Negative tests for UTF-8 validation of string constants.
// Every case passes the same one-field schema plus a JSON object whose
// string F holds a malformed byte sequence, together with the message text
// "illegal UTF-8 sequence". The cases cover: a lone continuation byte,
// truncated multi-byte sequences, lead bytes announcing more than 4 bytes,
// over-long (non-shortest) encodings, and UTF-16 surrogates encoded as UTF-8.
// NOTE(review): TestError is defined outside this view; presumably it
// asserts that parsing fails with an error containing the given text.
void
InvalidUTF8Test
()
{
// "1 byte" pattern, under min length of 2 bytes
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\x80\"
}"
,
"illegal UTF-8 sequence"
);
// 2 byte pattern, string too short
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xDF\"
}"
,
"illegal UTF-8 sequence"
);
// 3 byte pattern, string too short
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xEF\xBF\"
}"
,
"illegal UTF-8 sequence"
);
// 4 byte pattern, string too short
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xF7\xBF\xBF\"
}"
,
"illegal UTF-8 sequence"
);
// "5 byte" pattern, string too short
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xFB\xBF\xBF\xBF\"
}"
,
"illegal UTF-8 sequence"
);
// "6 byte" pattern, string too short
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xFD\xBF\xBF\xBF\xBF\"
}"
,
"illegal UTF-8 sequence"
);
// "7 byte" pattern, string too short
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xFE\xBF\xBF\xBF\xBF\xBF\"
}"
,
"illegal UTF-8 sequence"
);
// "5 byte" pattern, over max length of 4 bytes
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xFB\xBF\xBF\xBF\xBF\"
}"
,
"illegal UTF-8 sequence"
);
// "6 byte" pattern, over max length of 4 bytes
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xFD\xBF\xBF\xBF\xBF\xBF\"
}"
,
"illegal UTF-8 sequence"
);
// "7 byte" pattern, over max length of 4 bytes
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xFE\xBF\xBF\xBF\xBF\xBF\xBF\"
}"
,
"illegal UTF-8 sequence"
);
// Three invalid encodings for U+000A (\n, aka NEWLINE)
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xC0\x8A\"
}"
,
"illegal UTF-8 sequence"
);
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xE0\x80\x8A\"
}"
,
"illegal UTF-8 sequence"
);
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xF0\x80\x80\x8A\"
}"
,
"illegal UTF-8 sequence"
);
// Two invalid encodings for U+00A9 (COPYRIGHT SYMBOL)
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xE0\x81\xA9\"
}"
,
"illegal UTF-8 sequence"
);
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xF0\x80\x81\xA9\"
}"
,
"illegal UTF-8 sequence"
);
// Invalid encoding for U+20AC (EURO SYMBOL)
TestError
(
"table T { F:string; }"
"root_type T;"
"{ F:
\"\xF0\x82\x82\xAC\"
}"
,
"illegal UTF-8 sequence"
);
// UTF-16 surrogate values between U+D800 and U+DFFF cannot be encoded in UTF-8
TestError
(
"table T { F:string; }"
"root_type T;"
// U+10400 "encoded" as U+D801 U+DC00
"{ F:
\"\xED\xA0\x81\xED\xB0\x80\"
}"
,
"illegal UTF-8 sequence"
);
}
void
UnknownFieldsTest
()
{
flatbuffers
::
IDLOptions
opts
;
opts
.
skip_unexpected_fields_in_json
=
true
;
...
...
@@ -1105,8 +1216,10 @@ int main(int /*argc*/, const char * /*argv*/[]) {
EnumStringsTest
();
IntegerOutOfRangeTest
();
UnicodeTest
();
UnicodeTestAllowNonUTF8
();
UnicodeSurrogatesTest
();
UnicodeInvalidSurrogatesTest
();
InvalidUTF8Test
();
UnknownFieldsTest
();
ParseUnionTest
();
ConformTest
();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment