Commit 162b656f, authored by Ayende Rahien

Implementing string interning

parent e3aff478
using System;
namespace Google.ProtocolBuffers
{
/// <summary>
/// A view over a segment of a byte array: <see cref="Length"/> bytes of
/// <see cref="Buffer"/> starting at <see cref="Offset"/>, together with a cached
/// hash code computed from those bytes.
/// </summary>
/// <remarks>
/// The fields are public and mutable. The cached hash is only recomputed by the
/// constructor and by <see cref="ResetHash"/>, so code that changes the fields or
/// the underlying bytes must call <see cref="ResetHash"/> before the instance is
/// hashed or used as a dictionary key.
/// </remarks>
public class ByteBuffer
{
    public byte[] Buffer;
    public int Offset;
    public int Length;

    // Hash of the bytes in [Offset, Offset + Length) as of the last ResetHash() call.
    private int hash;

    /// <summary>
    /// Recomputes the cached hash code from the bytes currently covered by
    /// <see cref="Offset"/> and <see cref="Length"/>.
    /// </summary>
    public void ResetHash()
    {
        // The multiplication is expected to wrap around; make that explicit so the
        // method does not throw OverflowException when compiled with /checked.
        unchecked
        {
            int computed = 23;
            int end = Offset + Length;
            for (int i = Offset; i < end; i++)
            {
                computed = (computed * 23) ^ Buffer[i];
            }
            hash = computed;
        }
    }

    /// <summary>
    /// Creates a view over <paramref name="length"/> bytes of
    /// <paramref name="buffer"/> starting at <paramref name="offset"/> and
    /// computes its hash. The array is referenced, not copied.
    /// </summary>
    /// <param name="buffer">The backing array.</param>
    /// <param name="offset">Index of the first byte of the segment.</param>
    /// <param name="length">Number of bytes in the segment.</param>
    public ByteBuffer(byte[] buffer, int offset, int length)
    {
        Buffer = buffer;
        Offset = offset;
        Length = length;
        ResetHash();
    }

    /// <summary>
    /// Copies the segment into a new <see cref="ByteString"/>.
    /// </summary>
    /// <returns>A <see cref="ByteString"/> created from the segment's bytes.</returns>
    public ByteString ToByteString()
    {
        return ByteString.CopyFrom(Buffer, Offset, Length);
    }

    /// <summary>
    /// Returns the hash cached by the last call to <see cref="ResetHash"/>.
    /// </summary>
    public override int GetHashCode()
    {
        return hash;
    }

    /// <summary>
    /// Compares two segments by content: they are equal when they have the same
    /// length and the same bytes, regardless of where each segment starts in its
    /// backing array. This matches <see cref="GetHashCode"/>, which also depends
    /// only on the bytes.
    /// </summary>
    /// <param name="obj">The object to compare with.</param>
    /// <returns><c>true</c> if <paramref name="obj"/> is a <see cref="ByteBuffer"/>
    /// covering an identical byte sequence; otherwise <c>false</c>.</returns>
    public override bool Equals(object obj)
    {
        var other = obj as ByteBuffer;
        if (ReferenceEquals(other, null))
        {
            return false;
        }
        if (ReferenceEquals(other, this))
        {
            return true;
        }
        if (other.Length != Length)
        {
            return false;
        }

        // Index each side relative to its own offset so that equal content stored
        // at different positions still compares equal.
        for (int i = 0; i < Length; i++)
        {
            if (Buffer[Offset + i] != other.Buffer[other.Offset + i])
            {
                return false;
            }
        }
        return true;
    }
}
}
\ No newline at end of file
using System;
using System.Collections.Generic;
using System.Text;
using System.Threading;
namespace Google.ProtocolBuffers
{
/// <summary>
/// Caches the strings decoded from UTF-8 byte sequences so that a string can be
/// returned directly for a buffer segment without having to copy the bytes and
/// decode a new string each time the same byte sequence is seen.
///
/// Note: not thread safe.
/// </summary>
public class ByteStringStringInterning
{
    /// <summary>
    /// Dictionary key that wraps either a <see cref="ByteString"/> (the form stored
    /// in the pool) or a <see cref="ByteBuffer"/> (the form used for lookups, which
    /// avoids copying the bytes). Two keys are equal when they hold the same bytes,
    /// whichever representation each one uses.
    /// </summary>
    private class ByteStringOrByteBuffer : IEquatable<ByteStringOrByteBuffer>
    {
        // Exactly one of these two fields is non-null.
        private readonly ByteString str;
        private readonly ByteBuffer buffer;
        private readonly int hashCode;

        public ByteStringOrByteBuffer(ByteString str)
        {
            if (str == null)
            {
                throw new ArgumentNullException("str");
            }
            this.str = str;
            hashCode = ComputeHash(str);
        }

        public ByteStringOrByteBuffer(ByteBuffer buffer)
        {
            if (buffer == null)
            {
                throw new ArgumentNullException("buffer");
            }
            this.buffer = buffer;
            hashCode = buffer.GetHashCode();
        }

        /// <summary>
        /// Hashes the bytes of a <see cref="ByteString"/> with the same algorithm
        /// as <see cref="ByteBuffer.ResetHash"/>. A buffer-backed lookup key must
        /// land in the same bucket as the string-backed key stored in the pool,
        /// so the two hashes have to agree for equal content.
        /// </summary>
        private static int ComputeHash(ByteString byteString)
        {
            unchecked
            {
                int computed = 23;
                int len = byteString.Length;
                for (int i = 0; i < len; i++)
                {
                    computed = (computed * 23) ^ byteString.bytes[i];
                }
                return computed;
            }
        }

        public bool Equals(ByteStringOrByteBuffer other)
        {
            if (ReferenceEquals(null, other)) return false;
            if (ReferenceEquals(this, other)) return true;

            if (str != null)
            {
                return other.str != null
                    ? Equals(other.str, str)
                    : StringEqualsToBuffer(str, other.buffer);
            }
            return other.str != null
                ? StringEqualsToBuffer(other.str, buffer)
                : Equals(other.buffer, buffer);
        }

        /// <summary>
        /// Byte-wise comparison of a <see cref="ByteString"/> with the segment
        /// described by a <see cref="ByteBuffer"/>.
        /// </summary>
        private static bool StringEqualsToBuffer(ByteString byteString, ByteBuffer byteBuffer)
        {
            var strLen = byteString.Length;
            if (strLen != byteBuffer.Length)
            {
                return false;
            }
            for (int i = 0; i < strLen; i++)
            {
                if (byteString.bytes[i] != byteBuffer.Buffer[byteBuffer.Offset + i])
                {
                    return false;
                }
            }
            return true;
        }

        public override bool Equals(object obj)
        {
            if (ReferenceEquals(null, obj)) return false;
            if (ReferenceEquals(this, obj)) return true;
            return Equals(obj as ByteStringOrByteBuffer);
        }

        public override int GetHashCode()
        {
            return hashCode;
        }
    }

    // Maximum number of entries kept before the oldest ones are evicted.
    private readonly int limit;

    // Counter incremented on every Intern call; records how recently an entry was used.
    private int timestamp;

    private readonly IDictionary<ByteStringOrByteBuffer, Data> strings = new Dictionary<ByteStringOrByteBuffer, Data>();

    /// <summary>
    /// Creates a pool with a limit of 65536 entries.
    /// </summary>
    public static ByteStringStringInterning CreateInstance()
    {
        return new ByteStringStringInterning(65536);
    }

    /// <summary>
    /// A pooled string together with the timestamp of its last use.
    /// </summary>
    [Serializable]
    private class Data
    {
        public string Value;
        public int Timestamp;
    }

    private ByteStringStringInterning(int limit)
    {
        this.limit = limit;
    }

    /// <summary>
    /// Removes every entry from the pool.
    /// </summary>
    public void Clear()
    {
        strings.Clear();
    }

    /// <summary>
    /// Returns the string for the UTF-8 bytes described by <paramref name="str"/>.
    /// If the same bytes were interned before, the pooled string is returned;
    /// otherwise the bytes are copied, decoded, added to the pool and returned.
    /// </summary>
    /// <param name="str">The buffer segment holding the UTF-8 bytes.</param>
    /// <returns>The decoded string.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
    public string Intern(ByteBuffer str)
    {
        if (str == null)
        {
            throw new ArgumentNullException("str");
        }

        Data val;
        int currentTimestamp = Interlocked.Increment(ref timestamp);
        if (strings.TryGetValue(new ByteStringOrByteBuffer(str), out val))
        {
            // Mark the entry as recently used so cleanup evicts it last.
            Interlocked.Exchange(ref val.Timestamp, currentTimestamp);
            return val.Value;
        }

        // The pool must own its key bytes, so copy them out of the caller's buffer.
        var byteString = str.ToByteString();
        val = new Data { Timestamp = currentTimestamp, Value = byteString.ToStringUtf8() };
        strings.Add(new ByteStringOrByteBuffer(byteString), val);
        DoCleanupIfNeeded();
        return val.Value;
    }

    /// <summary>
    /// Evicts the least recently used entries once the pool exceeds its limit.
    /// </summary>
    private void DoCleanupIfNeeded()
    {
        if (strings.Count <= limit)
            return;

        // to avoid frequent thrashing, we will remove the bottom 10% of the current pool in one go
        // that means that we will hit the limit fairly infrequently
        var list = new List<KeyValuePair<ByteStringOrByteBuffer, Data>>(strings);

        // CompareTo instead of subtraction: a difference of two ints can overflow
        // and report the wrong order.
        list.Sort((x, y) => x.Value.Timestamp.CompareTo(y.Value.Timestamp));

        // Always evict at least one entry so a small limit cannot leave the pool
        // growing without bound.
        int toRemove = Math.Min(list.Count, Math.Max(1, limit / 10));
        for (int i = 0; i < toRemove; i++)
        {
            strings.Remove(list[i].Key);
        }
    }
}
}
\ No newline at end of file
...@@ -63,6 +63,9 @@ namespace Google.ProtocolBuffers { ...@@ -63,6 +63,9 @@ namespace Google.ProtocolBuffers {
private int bufferPos = 0; private int bufferPos = 0;
private readonly Stream input; private readonly Stream input;
private uint lastTag = 0; private uint lastTag = 0;
private readonly ByteBuffer rawBytesBuffer = new ByteBuffer(new byte[BufferSize], 0, 0);
private readonly ByteStringStringInterning byteStringStringInterning = ByteStringStringInterning.CreateInstance();
internal const int DefaultRecursionLimit = 64; internal const int DefaultRecursionLimit = 64;
internal const int DefaultSizeLimit = 64 << 20; // 64MB internal const int DefaultSizeLimit = 64 << 20; // 64MB
...@@ -238,12 +241,12 @@ namespace Google.ProtocolBuffers { ...@@ -238,12 +241,12 @@ namespace Google.ProtocolBuffers {
if (size <= bufferSize - bufferPos) { if (size <= bufferSize - bufferPos) {
// Fast path: We already have the bytes in a contiguous buffer, so // Fast path: We already have the bytes in a contiguous buffer, so
// just copy directly from it. // just copy directly from it.
String result = Encoding.UTF8.GetString(buffer, bufferPos, size); String result = byteStringStringInterning.Intern(new ByteBuffer(buffer, bufferPos, size));
bufferPos += size; bufferPos += size;
return result; return result;
} }
// Slow path: Build a byte array first then copy it. // Slow path: Build a byte array first then copy it.
return Encoding.UTF8.GetString(ReadRawBytes(size), 0, size); return byteStringStringInterning.Intern(ReadRawBytes(size));
} }
/// <summary> /// <summary>
...@@ -303,7 +306,8 @@ namespace Google.ProtocolBuffers { ...@@ -303,7 +306,8 @@ namespace Google.ProtocolBuffers {
return result; return result;
} else { } else {
// Slow path: Build a byte array first then copy it. // Slow path: Build a byte array first then copy it.
return ByteString.CopyFrom(ReadRawBytes(size)); ByteBuffer rawBytes = ReadRawBytes(size);
return ByteString.CopyFrom(rawBytes.Buffer, rawBytes.Offset, rawBytes.Length);
} }
} }
...@@ -763,7 +767,7 @@ namespace Google.ProtocolBuffers { ...@@ -763,7 +767,7 @@ namespace Google.ProtocolBuffers {
/// <exception cref="InvalidProtocolBufferException"> /// <exception cref="InvalidProtocolBufferException">
/// the end of the stream or the current limit was reached /// the end of the stream or the current limit was reached
/// </exception> /// </exception>
public byte[] ReadRawBytes(int size) { public ByteBuffer ReadRawBytes(int size) {
if (size < 0) { if (size < 0) {
throw InvalidProtocolBufferException.NegativeSize(); throw InvalidProtocolBufferException.NegativeSize();
} }
...@@ -777,18 +781,18 @@ namespace Google.ProtocolBuffers { ...@@ -777,18 +781,18 @@ namespace Google.ProtocolBuffers {
if (size <= bufferSize - bufferPos) { if (size <= bufferSize - bufferPos) {
// We have all the bytes we need already. // We have all the bytes we need already.
byte[] bytes = new byte[size]; var result = new ByteBuffer(buffer, bufferPos, size);
Array.Copy(buffer, bufferPos, bytes, 0, size);
bufferPos += size; bufferPos += size;
return bytes; return result;
} else if (size < BufferSize) { } else if (size < BufferSize) {
// Reading more bytes than are in the buffer, but not an excessive number // Reading more bytes than are in the buffer, but not an excessive number
// of bytes. We can safely allocate the resulting array ahead of time. // of bytes. We can safely allocate the resulting array ahead of time.
// First copy what we have. // First copy what we have.
byte[] bytes = new byte[size]; rawBytesBuffer.Length = size;
rawBytesBuffer.Offset = 0;
int pos = bufferSize - bufferPos; int pos = bufferSize - bufferPos;
Array.Copy(buffer, bufferPos, bytes, 0, pos); Array.Copy(buffer, bufferPos, rawBytesBuffer.Buffer, 0, pos);
bufferPos = bufferSize; bufferPos = bufferSize;
// We want to use RefillBuffer() and then copy from the buffer into our // We want to use RefillBuffer() and then copy from the buffer into our
...@@ -797,16 +801,16 @@ namespace Google.ProtocolBuffers { ...@@ -797,16 +801,16 @@ namespace Google.ProtocolBuffers {
RefillBuffer(true); RefillBuffer(true);
while (size - pos > bufferSize) { while (size - pos > bufferSize) {
Array.Copy(buffer, 0, bytes, pos, bufferSize); Array.Copy(buffer, 0, rawBytesBuffer.Buffer, pos, bufferSize);
pos += bufferSize; pos += bufferSize;
bufferPos = bufferSize; bufferPos = bufferSize;
RefillBuffer(true); RefillBuffer(true);
} }
Array.Copy(buffer, 0, bytes, pos, size - pos); Array.Copy(buffer, 0, rawBytesBuffer.Buffer, pos, size - pos);
bufferPos = size - pos; bufferPos = size - pos;
rawBytesBuffer.ResetHash();
return bytes; return rawBytesBuffer;
} else { } else {
// The size is very large. For security reasons, we can't allocate the // The size is very large. For security reasons, we can't allocate the
// entire byte array yet. The size comes directly from the input, so a // entire byte array yet. The size comes directly from the input, so a
...@@ -859,7 +863,7 @@ namespace Google.ProtocolBuffers { ...@@ -859,7 +863,7 @@ namespace Google.ProtocolBuffers {
} }
// Done. // Done.
return bytes; return new ByteBuffer(buffer, 0, size);
} }
} }
......
...@@ -50,7 +50,9 @@ ...@@ -50,7 +50,9 @@
<ItemGroup> <ItemGroup>
<Compile Include="AbstractBuilder.cs" /> <Compile Include="AbstractBuilder.cs" />
<Compile Include="AbstractMessage.cs" /> <Compile Include="AbstractMessage.cs" />
<Compile Include="ByteBuffer.cs" />
<Compile Include="ByteString.cs" /> <Compile Include="ByteString.cs" />
<Compile Include="ByteStringStringInterning.cs" />
<Compile Include="Collections\Enumerables.cs" /> <Compile Include="Collections\Enumerables.cs" />
<Compile Include="Collections\IPopsicleList.cs" /> <Compile Include="Collections\IPopsicleList.cs" />
<Compile Include="Collections\PopsicleList.cs" /> <Compile Include="Collections\PopsicleList.cs" />
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment