Commit 6704b19d authored by Ben Gertzfield

Handle \u-escaped surrogate pairs correctly in IDL parser

parent 208c15f2
...@@ -236,12 +236,19 @@ CheckedError Parser::Next() { ...@@ -236,12 +236,19 @@ CheckedError Parser::Next() {
if(!isdigit(static_cast<const unsigned char>(*cursor_))) return NoError(); if(!isdigit(static_cast<const unsigned char>(*cursor_))) return NoError();
return Error("floating point constant can\'t start with \".\""); return Error("floating point constant can\'t start with \".\"");
case '\"': case '\"':
case '\'': case '\'': {
int unicode_high_surrogate = -1;
while (*cursor_ != c) { while (*cursor_ != c) {
if (*cursor_ < ' ' && *cursor_ >= 0) if (*cursor_ < ' ' && *cursor_ >= 0)
return Error("illegal character in string constant"); return Error("illegal character in string constant");
if (*cursor_ == '\\') { if (*cursor_ == '\\') {
cursor_++; cursor_++;
if (unicode_high_surrogate != -1 &&
*cursor_ != 'u') {
return Error(
"illegal Unicode sequence (unpaired high surrogate)");
}
switch (*cursor_) { switch (*cursor_) {
case 'n': attribute_ += '\n'; cursor_++; break; case 'n': attribute_ += '\n'; cursor_++; break;
case 't': attribute_ += '\t'; cursor_++; break; case 't': attribute_ += '\t'; cursor_++; break;
...@@ -263,18 +270,51 @@ CheckedError Parser::Next() { ...@@ -263,18 +270,51 @@ CheckedError Parser::Next() {
cursor_++; cursor_++;
int64_t val; int64_t val;
ECHECK(ParseHexNum(4, &val)); ECHECK(ParseHexNum(4, &val));
ToUTF8(static_cast<int>(val), &attribute_); if (val >= 0xD800 && val <= 0xDBFF) {
if (unicode_high_surrogate != -1) {
return Error(
"illegal Unicode sequence (multiple high surrogates)");
} else {
unicode_high_surrogate = val;
}
} else if (val >= 0xDC00 && val <= 0xDFFF) {
if (unicode_high_surrogate == -1) {
return Error(
"illegal Unicode sequence (unpaired low surrogate)");
} else {
int code_point = 0x10000 +
((unicode_high_surrogate & 0x03FF) << 10) +
(val & 0x03FF);
ToUTF8(code_point, &attribute_);
unicode_high_surrogate = -1;
}
} else {
if (unicode_high_surrogate != -1) {
return Error(
"illegal Unicode sequence (unpaired high surrogate)");
}
ToUTF8(static_cast<int>(val), &attribute_);
}
break; break;
} }
default: return Error("unknown escape code in string constant"); default: return Error("unknown escape code in string constant");
} }
} else { // printable chars + UTF-8 bytes } else { // printable chars + UTF-8 bytes
if (unicode_high_surrogate != -1) {
return Error(
"illegal Unicode sequence (unpaired high surrogate)");
}
attribute_ += *cursor_++; attribute_ += *cursor_++;
} }
} }
if (unicode_high_surrogate != -1) {
return Error(
"illegal Unicode sequence (unpaired high surrogate)");
}
cursor_++; cursor_++;
token_ = kTokenStringConstant; token_ = kTokenStringConstant;
return NoError(); return NoError();
}
case '/': case '/':
if (*cursor_ == '/') { if (*cursor_ == '/') {
const char *start = ++cursor_; const char *start = ++cursor_;
......
...@@ -859,6 +859,44 @@ void UnicodeTest() { ...@@ -859,6 +859,44 @@ void UnicodeTest() {
"\\u5225\\u30B5\\u30A4\\u30C8\\x01\\x80\"}", true); "\\u5225\\u30B5\\u30A4\\u30C8\\x01\\x80\"}", true);
} }
void UnicodeSurrogatesTest() {
flatbuffers::Parser parser;
TEST_EQ(
parser.Parse(
"table T { F:string (id: 0); }"
"root_type T;"
"{ F:\"\\uD83D\\uDCA9\"}"), true);
auto root = flatbuffers::GetRoot<flatbuffers::Table>(
parser.builder_.GetBufferPointer());
auto string = root->GetPointer<flatbuffers::String *>(
flatbuffers::FieldIndexToOffset(0));
TEST_EQ(strcmp(string->c_str(), "\xF0\x9F\x92\xA9"), 0);
}
// Feeds the parser JSON strings with malformed \u surrogate sequences and,
// for each one, hands the input plus the expected error text to TestError
// (presumably it asserts that parsing fails with a message containing that
// text -- confirm against TestError's definition).
void UnicodeInvalidSurrogatesTest() {
  struct InvalidCase {
    const char *input;           // schema + root_type + JSON document
    const char *expected_error;  // text passed as TestError's 2nd argument
  };
  static const InvalidCase kInvalidCases[] = {
    // High surrogate immediately followed by the closing quote.
    { "table T { F:string; }"
      "root_type T;"
      "{ F:\"\\uD800\"}", "unpaired high surrogate" },
    // High surrogate followed by ordinary characters.
    { "table T { F:string; }"
      "root_type T;"
      "{ F:\"\\uD800abcd\"}", "unpaired high surrogate" },
    // High surrogate followed by a non-\u escape.
    { "table T { F:string; }"
      "root_type T;"
      "{ F:\"\\uD800\\n\"}", "unpaired high surrogate" },
    // Two high surrogates in a row.
    { "table T { F:string; }"
      "root_type T;"
      "{ F:\"\\uD800\\uD800\"}", "multiple high surrogates" },
    // Low surrogate with no preceding high surrogate.
    { "table T { F:string; }"
      "root_type T;"
      "{ F:\"\\uDC00\"}", "unpaired low surrogate" },
  };
  // Cases run in the same order as listed above.
  for (const InvalidCase &bad : kInvalidCases) {
    TestError(bad.input, bad.expected_error);
  }
}
void UnknownFieldsTest() { void UnknownFieldsTest() {
flatbuffers::IDLOptions opts; flatbuffers::IDLOptions opts;
opts.skip_unexpected_fields_in_json = true; opts.skip_unexpected_fields_in_json = true;
...@@ -907,6 +945,8 @@ int main(int /*argc*/, const char * /*argv*/[]) { ...@@ -907,6 +945,8 @@ int main(int /*argc*/, const char * /*argv*/[]) {
EnumStringsTest(); EnumStringsTest();
IntegerOutOfRangeTest(); IntegerOutOfRangeTest();
UnicodeTest(); UnicodeTest();
UnicodeSurrogatesTest();
UnicodeInvalidSurrogatesTest();
UnknownFieldsTest(); UnknownFieldsTest();
if (!testing_fails) { if (!testing_fails) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment