Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
P
protobuf
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
protobuf
Commits
c6f3d700
Commit
c6f3d700
authored
Nov 15, 2016
by
Joshua Haberman
Committed by
GitHub
Nov 15, 2016
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #1907 from evokly/js-utf8-fix
JS: Fix for high utf-8 codepoints.
parents
6e93fa41
bd850a25
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
71 additions
and
18 deletions
+71
-18
decoder.js
js/binary/decoder.js
+25
-12
decoder_test.js
js/binary/decoder_test.js
+23
-0
encoder.js
js/binary/encoder.js
+23
-6
No files found.
js/binary/decoder.js
View file @
c6f3d700
...
...
@@ -895,11 +895,9 @@ jspb.BinaryDecoder.prototype.readEnum = function() {
/**
 * Reads and parses a UTF-8 encoded unicode string from the stream.
 * The code is inspired by maps.vectortown.parse.StreamedDataViewReader.
 * Supports codepoints from U+0000 up to U+10FFFF
 * (http://en.wikipedia.org/wiki/UTF-8).
 *
 * Codepoints encoded on four bytes do not fit in one 16-bit code unit and are
 * emitted as a UTF-16 surrogate pair. Stray continuation bytes (128..191) and
 * lead bytes of 248 or more are skipped without producing output.
 *
 * A multi-byte sequence whose lead byte lies before the end of the range is
 * consumed in full, so the cursor may finish past `cursor_ + length`.
 *
 * @param {number} length The length of the string to read, counted in bytes
 *     of the underlying buffer (not in decoded characters).
 * @return {string} The decoded string.
 */
jspb.BinaryDecoder.prototype.readString = function(length) {
  // Upper bound on how many code units are handed to a single
  // String.fromCharCode.apply call. Passing an unbounded array as call
  // arguments can exceed the engine's argument limit and throw a RangeError.
  var MAX_CODE_UNITS_PER_CALL = 8192;

  var bytes = this.bytes_;
  var cursor = this.cursor_;
  var end = cursor + length;
  var codeUnits = [];
  var result = '';

  while (cursor < end) {
    var c = bytes[cursor++];
    if (c < 128) {
      // Regular 7-bit ASCII.
      codeUnits.push(c);
    } else if (c < 192) {
      // UTF-8 continuation mark. We are out of sync. This
      // might happen if we attempted to read a character
      // with more than four bytes.
      continue;
    } else if (c < 224) {
      // UTF-8 with two bytes.
      var c2 = bytes[cursor++];
      codeUnits.push(((c & 31) << 6) | (c2 & 63));
    } else if (c < 240) {
      // UTF-8 with three bytes.
      var c2 = bytes[cursor++];
      var c3 = bytes[cursor++];
      codeUnits.push(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
    } else if (c < 248) {
      // UTF-8 with 4 bytes.
      var c2 = bytes[cursor++];
      var c3 = bytes[cursor++];
      var c4 = bytes[cursor++];
      // Characters written on 4 bytes have 21 bits for a codepoint.
      // We can't fit that on 16bit characters, so we use surrogates.
      var codepoint = ((c & 7) << 18) |
                      ((c2 & 63) << 12) |
                      ((c3 & 63) << 6) |
                      (c4 & 63);
      // Surrogates formula from wikipedia.
      // 1. Subtract 0x10000 from codepoint
      codepoint -= 0x10000;
      // 2. Split this into the high 10-bit value and the low 10-bit value
      // 3. Add 0xD800 to the high value to form the high surrogate
      // 4. Add 0xDC00 to the low value to form the low surrogate:
      var low = (codepoint & 1023) + 0xDC00;
      var high = ((codepoint >> 10) & 1023) + 0xD800;
      codeUnits.push(high, low);
    }

    // Flush in bounded chunks so that apply() never receives an argument
    // list large enough to overflow the stack.
    if (codeUnits.length >= MAX_CODE_UNITS_PER_CALL) {
      result += String.fromCharCode.apply(null, codeUnits);
      codeUnits.length = 0;
    }
  }

  // String.fromCharCode.apply is faster than manually appending characters on
  // Chrome 25+, and generates no additional cons string garbage.
  result += String.fromCharCode.apply(null, codeUnits);
  this.cursor_ = cursor;
  return result;
};
...
...
js/binary/decoder_test.js
View file @
c6f3d700
...
...
@@ -209,7 +209,30 @@ describe('binaryDecoderTest', function() {
assertEquals
(
hashC
,
decoder
.
readFixedHash64
());
assertEquals
(
hashD
,
decoder
.
readFixedHash64
());
});
/**
 * Test encoding and decoding utf-8.
 *
 * Writes one ASCII string and one string for each multi-byte UTF-8 width
 * (two, three and four bytes), then reads them back in the same order and
 * checks that each one round-trips unchanged.
 */
it('testUtf8', function() {
  var encoder = new jspb.BinaryEncoder();

  var ascii = "ASCII should work in 3, 2, 1...";
  var utf8_two_bytes = "©";
  var utf8_three_bytes = "❄";
  var utf8_four_bytes = "😁";

  encoder.writeString(ascii);
  encoder.writeString(utf8_two_bytes);
  encoder.writeString(utf8_three_bytes);
  encoder.writeString(utf8_four_bytes);

  var decoder = jspb.BinaryDecoder.alloc(encoder.end());

  // readString() takes a length in encoded bytes, not in UTF-16 code units.
  // For ASCII the two counts are equal; for the other strings the encoded
  // size is passed explicitly so the test does not depend on the decoder
  // reading continuation bytes past the requested range.
  assertEquals(ascii, decoder.readString(ascii.length));
  assertEquals(utf8_two_bytes, decoder.readString(2));
  assertEquals(utf8_three_bytes, decoder.readString(3));
  assertEquals(utf8_four_bytes, decoder.readString(4));
});
/**
* Verifies that misuse of the decoder class triggers assertions.
...
...
js/binary/encoder.js
View file @
c6f3d700
...
...
@@ -409,19 +409,36 @@ jspb.BinaryEncoder.prototype.writeFixedHash64 = function(hash) {
*/
jspb
.
BinaryEncoder
.
prototype
.
writeString
=
function
(
value
)
{
var
oldLength
=
this
.
buffer_
.
length
;
// UTF16 to UTF8 conversion loop swiped from goog.crypt.stringToUtf8ByteArray.
for
(
var
i
=
0
;
i
<
value
.
length
;
i
++
)
{
var
c
=
value
.
charCodeAt
(
i
);
if
(
c
<
128
)
{
this
.
buffer_
.
push
(
c
);
}
else
if
(
c
<
2048
)
{
this
.
buffer_
.
push
((
c
>>
6
)
|
192
);
this
.
buffer_
.
push
((
c
&
63
)
|
128
);
}
else
{
this
.
buffer_
.
push
((
c
>>
12
)
|
224
);
this
.
buffer_
.
push
(((
c
>>
6
)
&
63
)
|
128
);
this
.
buffer_
.
push
((
c
&
63
)
|
128
);
}
else
if
(
c
<
65536
)
{
// Look for surrogates
if
(
c
>=
0xD800
&&
c
<=
0xDBFF
&&
i
+
1
<
value
.
length
)
{
var
second
=
value
.
charCodeAt
(
i
+
1
);
if
(
second
>=
0xDC00
&&
second
<=
0xDFFF
)
{
// low surrogate
// http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
c
=
(
c
-
0xD800
)
*
0x400
+
second
-
0xDC00
+
0x10000
;
this
.
buffer_
.
push
((
c
>>
18
)
|
240
);
this
.
buffer_
.
push
(((
c
>>
12
)
&
63
)
|
128
);
this
.
buffer_
.
push
(((
c
>>
6
)
&
63
)
|
128
);
this
.
buffer_
.
push
((
c
&
63
)
|
128
);
i
++
;
}
}
else
{
this
.
buffer_
.
push
((
c
>>
12
)
|
224
);
this
.
buffer_
.
push
(((
c
>>
6
)
&
63
)
|
128
);
this
.
buffer_
.
push
((
c
&
63
)
|
128
);
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment