perf: use String#getBytes(Charset) instead of String#getBytes(String)

e84893f6 · Viktor Szathmáry · Tamir Duberstein · 7139d1ef · e84893f6 · e84893f6
Commit e84893f6, authored Sep 09, 2014 by Viktor Szathmáry; committed by Tamir Duberstein on Apr 02, 2015
7 changed files
--- a/java/src/main/java/com/google/protobuf/ByteString.java
+++ b/java/src/main/java/com/google/protobuf/ByteString.java
@@ -37,6 +37,8 @@ import java.io.OutputStream;
 import java.io.Serializable;
 import java.io.UnsupportedEncodingException;
 import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.UnsupportedCharsetException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Iterator;
@@ -76,8 +78,7 @@ public abstract class ByteString implements Iterable<Byte>, Serializable {
  static final int MIN_READ_FROM_CHUNK_SIZE = 0x100;  // 256b
  static final int MAX_READ_FROM_CHUNK_SIZE = 0x2000;  // 8k
-  // Defined by java.nio.charset.Charset
+  protected static final Charset UTF_8 = Charset.forName("UTF-8");
-  protected static final String UTF_8 = "UTF-8";
  /**
   * Empty {@code ByteString}.
@@ -269,11 +270,7 @@ public abstract class ByteString implements Iterable<Byte>, Serializable {
   * @return new {@code ByteString}
   */
  public static ByteString copyFromUtf8(String text) {
-    try {
+    return new LiteralByteString(text.getBytes(UTF_8));
-      return new LiteralByteString(text.getBytes(UTF_8));
-    } catch (UnsupportedEncodingException e) {
-      throw new RuntimeException("UTF-8 not supported?", e);
-    }
  }
  // =================================================================
@@ -612,8 +609,36 @@ public abstract class ByteString implements Iterable<Byte>, Serializable {
   * @return new string
   * @throws UnsupportedEncodingException if charset isn't recognized
   */
-  public abstract String toString(String charsetName)
+  public String toString(String charsetName)
-      throws UnsupportedEncodingException;
+      throws UnsupportedEncodingException {
+    try {
+      return toString(Charset.forName(charsetName));
+    } catch (UnsupportedCharsetException e) {
+      UnsupportedEncodingException exception = new UnsupportedEncodingException(charsetName);
+      exception.initCause(e);
+      throw exception;
+    }
+  }
+  /**
+   * Constructs a new {@code String} by decoding the bytes using the
+   * specified charset. Returns the canonical empty string if this byte string is empty.
+   *
+   * @param charset decode using this charset
+   * @return new string
+   */
+  public String toString(Charset charset) {
+    return size() == 0 ? "" : toStringInternal(charset);
+  }
+  /**
+   * Constructs a new {@code String} by decoding the bytes using the
+   * specified charset.
+   *
+   * @param charset decode using this charset
+   * @return new string
+   */
+  protected abstract String toStringInternal(Charset charset);
  // =================================================================
  // UTF-8 decoding
@@ -624,11 +649,7 @@ public abstract class ByteString implements Iterable<Byte>, Serializable {
   * @return new string using UTF-8 encoding
   */
  public String toStringUtf8() {
-    try {
+    return toString(UTF_8);
-      return toString(UTF_8);
-    } catch (UnsupportedEncodingException e) {
-      throw new RuntimeException("UTF-8 not supported?", e);
-    }
  }
  /**

--- a/java/src/main/java/com/google/protobuf/LiteralByteString.java
+++ b/java/src/main/java/com/google/protobuf/LiteralByteString.java
@@ -36,6 +36,7 @@ import java.io.InputStream;
 import java.io.OutputStream;
 import java.io.UnsupportedEncodingException;
 import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.NoSuchElementException;
@@ -152,13 +153,8 @@ class LiteralByteString extends ByteString {
  }
  @Override
-  public String toString(String charsetName)
+  protected String toStringInternal(Charset charset) {
-      throws UnsupportedEncodingException {
+    return new String(bytes, getOffsetIntoBytes(), size(), charset);
-    // Optimize for empty strings, but ensure we don't silently ignore invalid
-    // encodings.
-    return size() == 0 && UTF_8.equals(charsetName)
-        ? ""
-        : new String(bytes, getOffsetIntoBytes(), size(), charsetName);
  }
  // =================================================================

--- a/java/src/main/java/com/google/protobuf/RopeByteString.java
+++ b/java/src/main/java/com/google/protobuf/RopeByteString.java
@@ -38,6 +38,7 @@ import java.io.OutputStream;
 import java.io.UnsupportedEncodingException;
 import java.io.ByteArrayInputStream;
 import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Iterator;
@@ -418,13 +419,8 @@ class RopeByteString extends ByteString {
  }
  @Override
-  public String toString(String charsetName)
+  protected String toStringInternal(Charset charset) {
-      throws UnsupportedEncodingException {
+    return new String(toByteArray(), charset);
-    // Optimize for empty strings, but ensure we don't silently ignore invalid
-    // encodings.
-    return size() == 0 && UTF_8.equals(charsetName)
-        ? ""
-        : new String(toByteArray(), charsetName);
  }
  // =================================================================

--- a/java/src/test/java/com/google/protobuf/BoundedByteStringTest.java
+++ b/java/src/test/java/com/google/protobuf/BoundedByteStringTest.java
@@ -72,6 +72,19 @@ public class BoundedByteStringTest extends LiteralByteStringTest {
        testString.substring(2, testString.length() - 6), roundTripString);
  }
+  @Override
+  public void testCharsetToString() throws UnsupportedEncodingException {
+    String testString = "I love unicode \u1234\u5678 characters";
+    LiteralByteString unicode = new LiteralByteString(testString.getBytes(ByteString.UTF_8));
+    ByteString chopped = unicode.substring(2, unicode.size() - 6);
+    assertEquals(classUnderTest + ".substring() must have the expected type",
+        classUnderTest, getActualClassName(chopped));
+    String roundTripString = chopped.toString(ByteString.UTF_8);
+    assertEquals(classUnderTest + " unicode bytes must match",
+        testString.substring(2, testString.length() - 6), roundTripString);
+  }
  public void testJavaSerialization() throws Exception {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    ObjectOutputStream oos = new ObjectOutputStream(out);

--- a/java/src/test/java/com/google/protobuf/LiteralByteStringTest.java
+++ b/java/src/test/java/com/google/protobuf/LiteralByteStringTest.java
@@ -298,6 +298,13 @@ public class LiteralByteStringTest extends TestCase {
    assertEquals(classUnderTest + " unicode must match", testString, roundTripString);
  }
+  public void testCharsetToString() throws UnsupportedEncodingException {
+    String testString = "I love unicode \u1234\u5678 characters";
+    LiteralByteString unicode = new LiteralByteString(testString.getBytes(ByteString.UTF_8));
+    String roundTripString = unicode.toString(ByteString.UTF_8);
+    assertEquals(classUnderTest + " unicode must match", testString, roundTripString);
+  }
  public void testToString_returnsCanonicalEmptyString() throws UnsupportedEncodingException{
    assertSame(classUnderTest + " must be the same string references",
        ByteString.EMPTY.toString(UTF_8), new LiteralByteString(new byte[]{}).toString(UTF_8));

--- a/java/src/test/java/com/google/protobuf/RopeByteStringSubstringTest.java
+++ b/java/src/test/java/com/google/protobuf/RopeByteStringSubstringTest.java
@@ -94,4 +94,34 @@ public class RopeByteStringSubstringTest extends LiteralByteStringTest {
    assertEquals(classUnderTest + " string must must have same hashCode as the flat string",
        flatString.hashCode(), unicode.hashCode());
  }
+  @Override
+  public void testCharsetToString() throws UnsupportedEncodingException {
+    String sourceString = "I love unicode \u1234\u5678 characters";
+    ByteString sourceByteString = ByteString.copyFromUtf8(sourceString);
+    int copies = 250;
+    // By building the RopeByteString by concatenating, this is actually a fairly strenuous test.
+    StringBuilder builder = new StringBuilder(copies * sourceString.length());
+    ByteString unicode = ByteString.EMPTY;
+    for (int i = 0; i < copies; ++i) {
+      builder.append(sourceString);
+      unicode = RopeByteString.concatenate(unicode, sourceByteString);
+    }
+    String testString = builder.toString();
+    // Do the substring part
+    testString = testString.substring(2, testString.length() - 6);
+    unicode = unicode.substring(2, unicode.size() - 6);
+    assertEquals(classUnderTest + " from string must have the expected type",
+        classUnderTest, getActualClassName(unicode));
+    String roundTripString = unicode.toString(ByteString.UTF_8);
+    assertEquals(classUnderTest + " unicode bytes must match",
+        testString, roundTripString);
+    ByteString flatString = ByteString.copyFromUtf8(testString);
+    assertEquals(classUnderTest + " string must equal the flat string", flatString, unicode);
+    assertEquals(classUnderTest + " string must must have same hashCode as the flat string",
+        flatString.hashCode(), unicode.hashCode());
+  }
 }
--- a/java/src/test/java/com/google/protobuf/RopeByteStringTest.java
+++ b/java/src/test/java/com/google/protobuf/RopeByteStringTest.java
@@ -118,6 +118,32 @@ public class RopeByteStringTest extends LiteralByteStringTest {
        flatString.hashCode(), unicode.hashCode());
  }
+  @Override
+  public void testCharsetToString() throws UnsupportedEncodingException {
+    String sourceString = "I love unicode \u1234\u5678 characters";
+    ByteString sourceByteString = ByteString.copyFromUtf8(sourceString);
+    int copies = 250;
+    // By building the RopeByteString by concatenating, this is actually a fairly strenuous test.
+    StringBuilder builder = new StringBuilder(copies * sourceString.length());
+    ByteString unicode = ByteString.EMPTY;
+    for (int i = 0; i < copies; ++i) {
+      builder.append(sourceString);
+      unicode = RopeByteString.concatenate(unicode, sourceByteString);
+    }
+    String testString = builder.toString();
+    assertEquals(classUnderTest + " from string must have the expected type",
+        classUnderTest, getActualClassName(unicode));
+    String roundTripString = unicode.toString(ByteString.UTF_8);
+    assertEquals(classUnderTest + " unicode bytes must match",
+        testString, roundTripString);
+    ByteString flatString = ByteString.copyFromUtf8(testString);
+    assertEquals(classUnderTest + " string must equal the flat string", flatString, unicode);
+    assertEquals(classUnderTest + " string must must have same hashCode as the flat string",
+        flatString.hashCode(), unicode.hashCode());
+  }
  @Override
  public void testToString_returnsCanonicalEmptyString() throws UnsupportedEncodingException {
    RopeByteString ropeByteString =