Commit 162b656f authored by Ayende Rahien

Implementing string interning

parent e3aff478
using System;
namespace Google.ProtocolBuffers
{
/// <summary>
/// A segment of a byte array: <see cref="Length"/> bytes of <see cref="Buffer"/> starting at
/// <see cref="Offset"/>, together with a cached hash of those bytes.
/// </summary>
/// <remarks>
/// The fields are public and mutable. The cached hash is recomputed only by the constructor and by
/// <see cref="ResetHash"/>, so code that changes a field (or the bytes inside the segment) must call
/// <see cref="ResetHash"/> before the instance is hashed again.
/// </remarks>
public class ByteBuffer
{
    /// <summary>The array that holds the segment's bytes.</summary>
    public byte[] Buffer;

    /// <summary>Index in <see cref="Buffer"/> of the first byte of the segment.</summary>
    public int Offset;

    /// <summary>Number of bytes in the segment.</summary>
    public int Length;

    // Hash of the segment's bytes as of the last ResetHash() call.
    private int hash;

    /// <summary>
    /// Recomputes the cached hash from the bytes currently addressed by
    /// <see cref="Offset"/> and <see cref="Length"/>.
    /// </summary>
    public void ResetHash()
    {
        // The multiplication is meant to wrap around; 'unchecked' keeps that true even when the
        // assembly is compiled with overflow checking enabled.
        unchecked
        {
            int computed = 23;
            int end = Offset + Length;
            for (int i = Offset; i < end; i++)
            {
                computed = (computed * 23) ^ Buffer[i];
            }
            hash = computed;
        }
    }

    /// <summary>
    /// Creates a segment over <paramref name="buffer"/> and computes its hash.
    /// </summary>
    /// <param name="buffer">The array holding the bytes; it is referenced, not copied.</param>
    /// <param name="offset">Index of the first byte of the segment.</param>
    /// <param name="length">Number of bytes in the segment.</param>
    public ByteBuffer(byte[] buffer, int offset, int length)
    {
        Buffer = buffer;
        Offset = offset;
        Length = length;
        ResetHash();
    }

    /// <summary>
    /// Copies the segment into a new <see cref="ByteString"/> via
    /// <c>ByteString.CopyFrom(Buffer, Offset, Length)</c>.
    /// </summary>
    public ByteString ToByteString()
    {
        return ByteString.CopyFrom(Buffer, Offset, Length);
    }

    /// <summary>Returns the hash cached by the last <see cref="ResetHash"/> call.</summary>
    public override int GetHashCode()
    {
        return hash;
    }

    /// <summary>
    /// Two segments are equal when they have the same length and the same bytes, regardless of
    /// where each segment starts inside its backing array.
    /// </summary>
    /// <param name="obj">The object to compare with.</param>
    /// <returns><c>true</c> if <paramref name="obj"/> is a <see cref="ByteBuffer"/> with identical content.</returns>
    public override bool Equals(object obj)
    {
        var other = obj as ByteBuffer;
        if (ReferenceEquals(other, null))
        {
            return false;
        }
        if (other.Length != Length)
        {
            return false;
        }

        // Index each array relative to its own offset: the hash depends only on the bytes, so
        // equality must not depend on the position of the segment either.
        for (int i = 0; i < Length; i++)
        {
            if (Buffer[Offset + i] != other.Buffer[other.Offset + i])
            {
                return false;
            }
        }
        return true;
    }
}
}
\ No newline at end of file
using System;
using System.Collections.Generic;
using System.Text;
using System.Threading;
namespace Google.ProtocolBuffers
{
/// <summary>
/// This class tries hard to allow us to generate strings directly from buffer outputs without having to
/// decode the same bytes into a new string every time they are read: the string produced for a byte
/// sequence is cached and handed back when an equal sequence is interned again.
///
/// Note, non thread safe
/// </summary>
public class ByteStringStringInterning
{
    /// <summary>
    /// Dictionary key that wraps either a <see cref="ByteString"/> (used for stored entries) or a
    /// <see cref="ByteBuffer"/> (used for lookups), and compares the two kinds byte by byte.
    /// </summary>
    private class ByteStringOrByteBuffer : IEquatable<ByteStringOrByteBuffer>
    {
        // Exactly one of these is set, depending on which constructor was used.
        private readonly ByteString str;
        private readonly ByteBuffer buffer;

        public ByteStringOrByteBuffer(ByteString str)
        {
            this.str = str;
        }

        public ByteStringOrByteBuffer(ByteBuffer buffer)
        {
            this.buffer = buffer;
        }

        /// <summary>
        /// Compares the wrapped values; a string and a buffer are equal when they hold the same bytes.
        /// </summary>
        public bool Equals(ByteStringOrByteBuffer other)
        {
            if (ReferenceEquals(null, other))
            {
                return false;
            }
            if (ReferenceEquals(this, other))
            {
                return true;
            }

            // Same kind on both sides: defer to that type's own equality.
            if (other.str != null && str != null)
            {
                return object.Equals(other.str, str);
            }
            if (other.buffer != null && buffer != null)
            {
                return object.Equals(other.buffer, buffer);
            }

            // Mixed kinds: compare the string's bytes against the buffer segment.
            if (other.str != null && str == null)
            {
                return StringEqualsToBuffer(other.str, buffer);
            }
            return StringEqualsToBuffer(str, other.buffer);
        }

        // Byte-wise comparison of a ByteString with the segment described by a ByteBuffer.
        private static bool StringEqualsToBuffer(ByteString byteString, ByteBuffer byteBuffer)
        {
            var strLen = byteString.Length;
            if (strLen != byteBuffer.Length)
            {
                return false;
            }
            for (int i = 0; i < strLen; i++)
            {
                if (byteString.bytes[i] != byteBuffer.Buffer[byteBuffer.Offset + i])
                {
                    return false;
                }
            }
            return true;
        }

        public override bool Equals(object obj)
        {
            if (ReferenceEquals(null, obj))
            {
                return false;
            }
            if (ReferenceEquals(this, obj))
            {
                return true;
            }
            return Equals(obj as ByteStringOrByteBuffer);
        }

        // NOTE(review): entries are stored under a ByteString key and looked up with a ByteBuffer
        // key, so a lookup can only hit if ByteString.GetHashCode() yields the same value as
        // ByteBuffer.GetHashCode() for the same bytes. ByteString is not visible here - confirm.
        public override int GetHashCode()
        {
            return str != null ? str.GetHashCode() : buffer.GetHashCode();
        }
    }

    // Maximum number of entries tolerated before a cleanup pass is triggered.
    private readonly int limit;

    // Counter advanced on every Intern call; used to order entries by recency of use.
    // 64-bit so that it cannot realistically wrap around and corrupt the eviction order.
    private long timestamp;

    private readonly IDictionary<ByteStringOrByteBuffer, Data> strings = new Dictionary<ByteStringOrByteBuffer, Data>();

    /// <summary>Creates an interning pool with a limit of 65536 entries.</summary>
    public static ByteStringStringInterning CreateInstance()
    {
        return new ByteStringStringInterning(65536);
    }

    /// <summary>A cached string and the counter value of its most recent use.</summary>
    [Serializable]
    private class Data
    {
        public string Value;
        public long Timestamp;
    }

    private ByteStringStringInterning(int limit)
    {
        this.limit = limit;
    }

    /// <summary>Removes every cached string.</summary>
    public void Clear()
    {
        strings.Clear();
    }

    /// <summary>
    /// Returns the string for the bytes described by <paramref name="str"/>. If an equal byte
    /// sequence was interned before, the cached string is returned; otherwise the bytes are copied
    /// into a <see cref="ByteString"/>, converted with <c>ToStringUtf8()</c>, cached and returned.
    /// </summary>
    /// <param name="str">The byte segment to convert.</param>
    /// <returns>The cached or newly created string.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
    public string Intern(ByteBuffer str)
    {
        if (str == null)
        {
            throw new ArgumentNullException("str");
        }

        long currentTimestamp = Interlocked.Increment(ref timestamp);

        Data val;
        if (strings.TryGetValue(new ByteStringOrByteBuffer(str), out val))
        {
            // Mark the entry as recently used so cleanup keeps it.
            Interlocked.Exchange(ref val.Timestamp, currentTimestamp);
            return val.Value;
        }

        // The key must own its bytes: the caller's buffer may be reused after this call returns,
        // so the entry is stored under a copy.
        var byteString = str.ToByteString();
        val = new Data { Timestamp = currentTimestamp, Value = byteString.ToStringUtf8() };
        strings.Add(new ByteStringOrByteBuffer(byteString), val);
        DoCleanupIfNeeded();
        return val.Value;
    }

    // Evicts the least recently used entries once the pool has grown past the limit.
    private void DoCleanupIfNeeded()
    {
        if (strings.Count <= limit)
        {
            return;
        }

        // to avoid frequent thrashing, we will remove limit/10 of the oldest entries in one go
        // that means that we will hit the limit fairly infrequently
        var list = new List<KeyValuePair<ByteStringOrByteBuffer, Data>>(strings);

        // CompareTo instead of subtraction: a subtracting comparer can overflow and report the
        // wrong order.
        list.Sort((x, y) => x.Value.Timestamp.CompareTo(y.Value.Timestamp));

        int toRemove = limit / 10;
        for (int i = 0; i < toRemove; i++)
        {
            strings.Remove(list[i].Key);
        }
    }
}
}
\ No newline at end of file
......@@ -62,7 +62,10 @@ namespace Google.ProtocolBuffers {
private int bufferSizeAfterLimit = 0;
private int bufferPos = 0;
private readonly Stream input;
private uint lastTag = 0;
private uint lastTag = 0;
private readonly ByteBuffer rawBytesBuffer = new ByteBuffer(new byte[BufferSize], 0, 0);
private readonly ByteStringStringInterning byteStringStringInterning = ByteStringStringInterning.CreateInstance();
internal const int DefaultRecursionLimit = 64;
internal const int DefaultSizeLimit = 64 << 20; // 64MB
......@@ -237,13 +240,13 @@ namespace Google.ProtocolBuffers {
}
if (size <= bufferSize - bufferPos) {
// Fast path: We already have the bytes in a contiguous buffer, so
// just copy directly from it.
String result = Encoding.UTF8.GetString(buffer, bufferPos, size);
// just copy directly from it.
String result = byteStringStringInterning.Intern(new ByteBuffer(buffer, bufferPos, size));
bufferPos += size;
return result;
}
// Slow path: Build a byte array first then copy it.
return Encoding.UTF8.GetString(ReadRawBytes(size), 0, size);
// Slow path: Build a byte array first then copy it.
return byteStringStringInterning.Intern(ReadRawBytes(size));
}
/// <summary>
......@@ -302,8 +305,9 @@ namespace Google.ProtocolBuffers {
bufferPos += size;
return result;
} else {
// Slow path: Build a byte array first then copy it.
return ByteString.CopyFrom(ReadRawBytes(size));
// Slow path: Build a byte array first then copy it.
ByteBuffer rawBytes = ReadRawBytes(size);
return ByteString.CopyFrom(rawBytes.Buffer, rawBytes.Offset, rawBytes.Length);
}
}
......@@ -763,7 +767,7 @@ namespace Google.ProtocolBuffers {
/// <exception cref="InvalidProtocolBufferException">
/// the end of the stream or the current limit was reached
/// </exception>
public byte[] ReadRawBytes(int size) {
public ByteBuffer ReadRawBytes(int size) {
if (size < 0) {
throw InvalidProtocolBufferException.NegativeSize();
}
......@@ -776,19 +780,19 @@ namespace Google.ProtocolBuffers {
}
if (size <= bufferSize - bufferPos) {
// We have all the bytes we need already.
byte[] bytes = new byte[size];
Array.Copy(buffer, bufferPos, bytes, 0, size);
bufferPos += size;
return bytes;
// We have all the bytes we need already.
var result = new ByteBuffer(buffer, bufferPos, size);
bufferPos += size;
return result;
} else if (size < BufferSize) {
// Reading more bytes than are in the buffer, but not an excessive number
// of bytes. We can safely allocate the resulting array ahead of time.
// First copy what we have.
byte[] bytes = new byte[size];
// First copy what we have.
rawBytesBuffer.Length = size;
rawBytesBuffer.Offset = 0;
int pos = bufferSize - bufferPos;
Array.Copy(buffer, bufferPos, bytes, 0, pos);
Array.Copy(buffer, bufferPos, rawBytesBuffer.Buffer, 0, pos);
bufferPos = bufferSize;
// We want to use RefillBuffer() and then copy from the buffer into our
......@@ -797,16 +801,16 @@ namespace Google.ProtocolBuffers {
RefillBuffer(true);
while (size - pos > bufferSize) {
Array.Copy(buffer, 0, bytes, pos, bufferSize);
Array.Copy(buffer, 0, rawBytesBuffer.Buffer, pos, bufferSize);
pos += bufferSize;
bufferPos = bufferSize;
RefillBuffer(true);
}
Array.Copy(buffer, 0, bytes, pos, size - pos);
bufferPos = size - pos;
return bytes;
Array.Copy(buffer, 0, rawBytesBuffer.Buffer, pos, size - pos);
bufferPos = size - pos;
rawBytesBuffer.ResetHash();
return rawBytesBuffer;
} else {
// The size is very large. For security reasons, we can't allocate the
// entire byte array yet. The size comes directly from the input, so a
......@@ -859,7 +863,7 @@ namespace Google.ProtocolBuffers {
}
// Done.
return bytes;
return new ByteBuffer(buffer, 0, size);
}
}
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment