Commit c6f3d700, authored by Joshua Haberman, committed by GitHub

Merge pull request #1907 from evokly/js-utf8-fix

JS: Fix for high utf-8 codepoints.
parents 6e93fa41 bd850a25
...@@ -895,11 +895,9 @@ jspb.BinaryDecoder.prototype.readEnum = function() { ...@@ -895,11 +895,9 @@ jspb.BinaryDecoder.prototype.readEnum = function() {
/** /**
* Reads and parses a UTF-8 encoded unicode string from the stream. * Reads and parses a UTF-8 encoded unicode string from the stream.
* The code is inspired by maps.vectortown.parse.StreamedDataViewReader, with * The code is inspired by maps.vectortown.parse.StreamedDataViewReader.
* the exception that the implementation here does not get confused if it * Supports codepoints from U+0000 up to U+10FFFF.
* encounters characters longer than three bytes. These characters are ignored * (http://en.wikipedia.org/wiki/UTF-8).
* though, as they are extremely rare: three UTF-8 bytes cover virtually all
* characters in common use (http://en.wikipedia.org/wiki/UTF-8).
* @param {number} length The length of the string to read. * @param {number} length The length of the string to read.
* @return {string} The decoded string. * @return {string} The decoded string.
*/ */
...@@ -907,30 +905,45 @@ jspb.BinaryDecoder.prototype.readString = function(length) { ...@@ -907,30 +905,45 @@ jspb.BinaryDecoder.prototype.readString = function(length) {
var bytes = this.bytes_; var bytes = this.bytes_;
var cursor = this.cursor_; var cursor = this.cursor_;
var end = cursor + length; var end = cursor + length;
var chars = []; var codeUnits = [];
while (cursor < end) { while (cursor < end) {
var c = bytes[cursor++]; var c = bytes[cursor++];
if (c < 128) { // Regular 7-bit ASCII. if (c < 128) { // Regular 7-bit ASCII.
chars.push(c); codeUnits.push(c);
} else if (c < 192) { } else if (c < 192) {
// UTF-8 continuation mark. We are out of sync. This // UTF-8 continuation mark. We are out of sync. This
// might happen if we attempted to read a character // might happen if we attempted to read a character
// with more than three bytes. // with more than four bytes.
continue; continue;
} else if (c < 224) { // UTF-8 with two bytes. } else if (c < 224) { // UTF-8 with two bytes.
var c2 = bytes[cursor++]; var c2 = bytes[cursor++];
chars.push(((c & 31) << 6) | (c2 & 63)); codeUnits.push(((c & 31) << 6) | (c2 & 63));
} else if (c < 240) { // UTF-8 with three bytes. } else if (c < 240) { // UTF-8 with three bytes.
var c2 = bytes[cursor++]; var c2 = bytes[cursor++];
var c3 = bytes[cursor++]; var c3 = bytes[cursor++];
chars.push(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63)); codeUnits.push(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
} else if (c < 248) { // UTF-8 with 4 bytes.
var c2 = bytes[cursor++];
var c3 = bytes[cursor++];
var c4 = bytes[cursor++];
// Characters written on 4 bytes have 21 bits for a codepoint.
// We can't fit that on 16bit characters, so we use surrogates.
var codepoint = ((c & 7) << 18) | ((c2 & 63) << 12) | ((c3 & 63) << 6) | (c4 & 63);
// Surrogates formula from wikipedia.
// 1. Subtract 0x10000 from codepoint
codepoint -= 0x10000;
// 2. Split this into the high 10-bit value and the low 10-bit value
// 3. Add 0xD800 to the high value to form the high surrogate
// 4. Add 0xDC00 to the low value to form the low surrogate:
var low = (codepoint & 1023) + 0xDC00;
var high = ((codepoint >> 10) & 1023) + 0xD800;
codeUnits.push(high, low)
} }
} }
// String.fromCharCode.apply is faster than manually appending characters on // String.fromCharCode.apply is faster than manually appending characters on
// Chrome 25+, and generates no additional cons string garbage. // Chrome 25+, and generates no additional cons string garbage.
var result = String.fromCharCode.apply(null, chars); var result = String.fromCharCode.apply(null, codeUnits);
this.cursor_ = cursor; this.cursor_ = cursor;
return result; return result;
}; };
......
...@@ -209,7 +209,30 @@ describe('binaryDecoderTest', function() { ...@@ -209,7 +209,30 @@ describe('binaryDecoderTest', function() {
assertEquals(hashC, decoder.readFixedHash64()); assertEquals(hashC, decoder.readFixedHash64());
assertEquals(hashD, decoder.readFixedHash64()); assertEquals(hashD, decoder.readFixedHash64());
}); });
/**
 * Round-trips strings whose characters need one, two, three and four UTF-8
 * bytes through BinaryEncoder.writeString and BinaryDecoder.readString.
 *
 * readString consumes a number of *bytes* (it advances a cursor over the
 * byte buffer up to cursor + length), whereas String.prototype.length counts
 * UTF-16 code units. The two only agree for ASCII, so the encoded byte count
 * of each sample is spelled out explicitly instead of being derived from
 * `.length`; otherwise the test would only pass because the decoder reads
 * continuation bytes past the requested end.
 */
it('testUtf8', function() {
  var encoder = new jspb.BinaryEncoder();

  var ascii = "ASCII should work in 3, 2, 1...";
  var utf8_two_bytes = "©";    // U+00A9  -> C2 A9
  var utf8_three_bytes = "❄";  // U+2744  -> E2 9D 84
  var utf8_four_bytes = "😁";   // U+1F601 -> F0 9F 98 81 (surrogate pair in JS)

  // Number of bytes each sample occupies once UTF-8 encoded.
  var ascii_byte_length = ascii.length;  // One byte per ASCII character.
  var two_bytes_length = 2;
  var three_bytes_length = 3;
  var four_bytes_length = 4;

  encoder.writeString(ascii);
  encoder.writeString(utf8_two_bytes);
  encoder.writeString(utf8_three_bytes);
  encoder.writeString(utf8_four_bytes);

  var decoder = jspb.BinaryDecoder.alloc(encoder.end());

  assertEquals(ascii, decoder.readString(ascii_byte_length));
  assertEquals(utf8_two_bytes, decoder.readString(two_bytes_length));
  assertEquals(utf8_three_bytes, decoder.readString(three_bytes_length));
  assertEquals(utf8_four_bytes, decoder.readString(four_bytes_length));
});
/** /**
* Verifies that misuse of the decoder class triggers assertions. * Verifies that misuse of the decoder class triggers assertions.
......
...@@ -409,19 +409,36 @@ jspb.BinaryEncoder.prototype.writeFixedHash64 = function(hash) { ...@@ -409,19 +409,36 @@ jspb.BinaryEncoder.prototype.writeFixedHash64 = function(hash) {
*/ */
jspb.BinaryEncoder.prototype.writeString = function(value) { jspb.BinaryEncoder.prototype.writeString = function(value) {
var oldLength = this.buffer_.length; var oldLength = this.buffer_.length;
// UTF16 to UTF8 conversion loop swiped from goog.crypt.stringToUtf8ByteArray.
for (var i = 0; i < value.length; i++) { for (var i = 0; i < value.length; i++) {
var c = value.charCodeAt(i); var c = value.charCodeAt(i);
if (c < 128) { if (c < 128) {
this.buffer_.push(c); this.buffer_.push(c);
} else if (c < 2048) { } else if (c < 2048) {
this.buffer_.push((c >> 6) | 192); this.buffer_.push((c >> 6) | 192);
this.buffer_.push((c & 63) | 128); this.buffer_.push((c & 63) | 128);
} else { } else if (c < 65536) {
this.buffer_.push((c >> 12) | 224); // Look for surrogates
this.buffer_.push(((c >> 6) & 63) | 128); if (c >= 0xD800 && c <= 0xDBFF && i + 1 < value.length) {
this.buffer_.push((c & 63) | 128); var second = value.charCodeAt(i + 1);
if (second >= 0xDC00 && second <= 0xDFFF) { // low surrogate
// http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
c = (c - 0xD800) * 0x400 + second - 0xDC00 + 0x10000;
this.buffer_.push((c >> 18) | 240);
this.buffer_.push(((c >> 12) & 63 ) | 128);
this.buffer_.push(((c >> 6) & 63) | 128);
this.buffer_.push((c & 63) | 128);
i++;
}
}
else {
this.buffer_.push((c >> 12) | 224);
this.buffer_.push(((c >> 6) & 63) | 128);
this.buffer_.push((c & 63) | 128);
}
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment