Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
P
protobuf
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
protobuf
Commits
23f108d4
Commit
23f108d4
authored
Aug 02, 2016
by
Wojciech Mandrysz
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
JS: Fixed UTF-8 string encoder/decoder for high codepoints.
parent
8d8115bf
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
36 additions
and
15 deletions
+36
-15
decoder.js
js/binary/decoder.js
+28
-13
encoder.js
js/binary/encoder.js
+8
-2
No files found.
js/binary/decoder.js
View file @
23f108d4
...
...
@@ -895,11 +895,9 @@ jspb.BinaryDecoder.prototype.readEnum = function() {
/**
* Reads and parses a UTF-8 encoded unicode string from the stream.
* The code is inspired by maps.vectortown.parse.StreamedDataViewReader, with
* the exception that the implementation here does not get confused if it
* encounters characters longer than three bytes. These characters are ignored
* though, as they are extremely rare: three UTF-8 bytes cover virtually all
* characters in common use (http://en.wikipedia.org/wiki/UTF-8).
* The code is inspired by maps.vectortown.parse.StreamedDataViewReader.
* Supports codepoints from U+0000 up to U+10FFFF
* (see http://en.wikipedia.org/wiki/UTF-8).
* @param {number} length The length of the string to read.
* @return {string} The decoded string.
*/
...
...
@@ -907,30 +905,47 @@ jspb.BinaryDecoder.prototype.readString = function(length) {
var
bytes
=
this
.
bytes_
;
var
cursor
=
this
.
cursor_
;
var
end
=
cursor
+
length
;
var
c
har
s
=
[];
var
c
odepoint
s
=
[];
while
(
cursor
<
end
)
{
var
c
=
bytes
[
cursor
++
];
if
(
c
<
128
)
{
// Regular 7-bit ASCII.
c
har
s
.
push
(
c
);
c
odepoint
s
.
push
(
c
);
}
else
if
(
c
<
192
)
{
// UTF-8 continuation mark. We are out of sync. This
// might happen if we attempted to read a character
// with more than
three
bytes.
// with more than
four
bytes.
continue
;
}
else
if
(
c
<
224
)
{
// UTF-8 with two bytes.
var
c2
=
bytes
[
cursor
++
];
c
har
s
.
push
(((
c
&
31
)
<<
6
)
|
(
c2
&
63
));
c
odepoint
s
.
push
(((
c
&
31
)
<<
6
)
|
(
c2
&
63
));
}
else
if
(
c
<
240
)
{
// UTF-8 with three bytes.
var
c2
=
bytes
[
cursor
++
];
var
c3
=
bytes
[
cursor
++
];
chars
.
push
(((
c
&
15
)
<<
12
)
|
((
c2
&
63
)
<<
6
)
|
(
c3
&
63
));
codepoints
.
push
(((
c
&
15
)
<<
12
)
|
((
c2
&
63
)
<<
6
)
|
(
c3
&
63
));
}
else
if
(
c
<
248
)
{
// UTF-8 with 4 bytes.
var
c2
=
bytes
[
cursor
++
];
var
c3
=
bytes
[
cursor
++
];
var
c4
=
bytes
[
cursor
++
];
// Characters encoded in 4 bytes carry a 21-bit codepoint.
// That does not fit in a single 16-bit character, so we emit a surrogate pair.
var
codepoint
=
((
c
&
7
)
<<
18
)
|
((
c2
&
63
)
<<
12
)
|
((
c3
&
63
)
<<
6
)
|
(
c4
&
63
);
// Surrogate pair formula from Wikipedia.
// 1. Subtract 0x10000 from codepoint
codepoint
-=
65536
;
// 2. Split this into the high 10-bit value and the low 10-bit value
var
low
=
codepoint
&
1023
;
var
high
=
(
codepoint
>>
10
)
&
1023
;
// 3. Add 0xD800 to the high value to form the high surrogate
high
+=
55296
;
// 4. Add 0xDC00 to the low value to form the low surrogate
low
+=
56320
;
codepoints
.
push
(
high
);
codepoints
.
push
(
low
);
}
}
// String.fromCharCode.apply is faster than manually appending characters on
// Chrome 25+, and generates no additional cons string garbage.
var
result
=
String
.
fromCharCode
.
apply
(
null
,
chars
);
var
result
=
String
.
fromCodePoint
.
apply
(
null
,
codepoints
);
this
.
cursor_
=
cursor
;
return
result
;
};
...
...
js/binary/encoder.js
View file @
23f108d4
...
...
@@ -412,16 +412,22 @@ jspb.BinaryEncoder.prototype.writeString = function(value) {
// UTF-16 to UTF-8 conversion loop swiped from goog.crypt.stringToUtf8ByteArray.
for
(
var
i
=
0
;
i
<
value
.
length
;
i
++
)
{
var
c
=
value
.
c
harCode
At
(
i
);
var
c
=
value
.
c
odePoint
At
(
i
);
if
(
c
<
128
)
{
this
.
buffer_
.
push
(
c
);
}
else
if
(
c
<
2048
)
{
this
.
buffer_
.
push
((
c
>>
6
)
|
192
);
this
.
buffer_
.
push
((
c
&
63
)
|
128
);
}
else
{
}
else
if
(
c
<
65536
)
{
this
.
buffer_
.
push
((
c
>>
12
)
|
224
);
this
.
buffer_
.
push
(((
c
>>
6
)
&
63
)
|
128
);
this
.
buffer_
.
push
((
c
&
63
)
|
128
);
}
else
{
this
.
buffer_
.
push
((
c
>>
18
)
|
240
);
this
.
buffer_
.
push
(((
c
>>
12
)
&
63
)
|
128
);
this
.
buffer_
.
push
(((
c
>>
6
)
&
63
)
|
128
);
this
.
buffer_
.
push
((
c
&
63
)
|
128
);
i
++
;
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment