Commit a33af83e authored by Milo Yip

Optimization for Regex and Schema

parent a0066483
...@@ -71,13 +71,17 @@ class GenericRegex { ...@@ -71,13 +71,17 @@ class GenericRegex {
public: public:
typedef typename Encoding::Ch Ch; typedef typename Encoding::Ch Ch;
GenericRegex(const Ch* source, Allocator* allocator = 0) : states_(allocator, 256), ranges_(allocator, 256), root_(kRegexInvalidState), stateCount_(), rangeCount_(), anchorBegin_(), anchorEnd_() { GenericRegex(const Ch* source, Allocator* allocator = 0) :
states_(allocator, 256), ranges_(allocator, 256), root_(kRegexInvalidState), stateCount_(), rangeCount_(),
stateSet_(), state0_(allocator, 0), state1_(allocator, 0), anchorBegin_(), anchorEnd_()
{
GenericStringStream<Encoding> ss(source); GenericStringStream<Encoding> ss(source);
DecodedStream<GenericStringStream<Encoding> > ds(ss); DecodedStream<GenericStringStream<Encoding> > ds(ss);
Parse(ds); Parse(ds);
} }
~GenericRegex() { ~GenericRegex() {
Allocator::Free(stateSet_);
} }
bool IsValid() const { bool IsValid() const {
...@@ -308,6 +312,14 @@ private: ...@@ -308,6 +312,14 @@ private:
printf("\n"); printf("\n");
#endif #endif
} }
// Preallocate buffer for SearchWithAnchoring()
RAPIDJSON_ASSERT(stateSet_ == 0);
if (stateCount_ > 0) {
stateSet_ = static_cast<unsigned*>(states_.GetAllocator().Malloc(GetStateSetSize()));
state0_.Reserve<SizeType>(stateCount_);
state1_.Reserve<SizeType>(stateCount_);
}
} }
SizeType NewState(SizeType out, SizeType out1, unsigned codepoint) { SizeType NewState(SizeType out, SizeType out1, unsigned codepoint) {
...@@ -568,21 +580,15 @@ private: ...@@ -568,21 +580,15 @@ private:
RAPIDJSON_ASSERT(IsValid()); RAPIDJSON_ASSERT(IsValid());
DecodedStream<InputStream> ds(is); DecodedStream<InputStream> ds(is);
Allocator allocator; state0_.Clear();
Stack<Allocator> state0(&allocator, stateCount_ * sizeof(SizeType)); Stack<Allocator> *current = &state0_, *next = &state1_;
Stack<Allocator> state1(&allocator, stateCount_ * sizeof(SizeType)); const size_t stateSetSize = GetStateSetSize();
Stack<Allocator> *current = &state0, *next = &state1; std::memset(stateSet_, 0, stateSetSize);
const size_t stateSetSize = (stateCount_ + 31) / 32 * 4;
unsigned* stateSet = static_cast<unsigned*>(allocator.Malloc(stateSetSize));
std::memset(stateSet, 0, stateSetSize);
bool matched = false;
matched = AddState(stateSet, *current, root_);
bool matched = AddState(*current, root_);
unsigned codepoint; unsigned codepoint;
while (!current->Empty() && (codepoint = ds.Take()) != 0) { while (!current->Empty() && (codepoint = ds.Take()) != 0) {
std::memset(stateSet, 0, stateSetSize); std::memset(stateSet_, 0, stateSetSize);
next->Clear(); next->Clear();
matched = false; matched = false;
for (const SizeType* s = current->template Bottom<SizeType>(); s != current->template End<SizeType>(); ++s) { for (const SizeType* s = current->template Bottom<SizeType>(); s != current->template End<SizeType>(); ++s) {
...@@ -591,39 +597,38 @@ private: ...@@ -591,39 +597,38 @@ private:
sr.codepoint == kAnyCharacterClass || sr.codepoint == kAnyCharacterClass ||
(sr.codepoint == kRangeCharacterClass && MatchRange(sr.rangeStart, codepoint))) (sr.codepoint == kRangeCharacterClass && MatchRange(sr.rangeStart, codepoint)))
{ {
matched = AddState(stateSet, *next, sr.out) || matched; matched = AddState(*next, sr.out) || matched;
if (!anchorEnd && matched) if (!anchorEnd && matched)
goto exit; return true;
} }
if (!anchorBegin) if (!anchorBegin)
AddState(stateSet, *next, root_); AddState(*next, root_);
} }
Stack<Allocator>* temp = current; internal::Swap(current, next);
current = next;
next = temp;
} }
exit:
Allocator::Free(stateSet);
return matched; return matched;
} }
size_t GetStateSetSize() const {
return (stateCount_ + 31) / 32 * 4;
}
// Return whether the added states is a match state // Return whether the added states is a match state
bool AddState(unsigned* stateSet, Stack<Allocator>& l, SizeType index) const { bool AddState(Stack<Allocator>& l, SizeType index) const {
if (index == kRegexInvalidState) if (index == kRegexInvalidState)
return true; return true;
const State& s = GetState(index); const State& s = GetState(index);
if (s.out1 != kRegexInvalidState) { // Split if (s.out1 != kRegexInvalidState) { // Split
bool matched = AddState(stateSet, l, s.out); bool matched = AddState(l, s.out);
matched = AddState(stateSet, l, s.out1) || matched; return AddState(l, s.out1) || matched;
return matched;
} }
else if (!(stateSet[index >> 5] & (1 << (index & 31)))) { else if (!(stateSet_[index >> 5] & (1 << (index & 31)))) {
stateSet[index >> 5] |= (1 << (index & 31)); stateSet_[index >> 5] |= (1 << (index & 31));
*l.template Push<SizeType>() = index; *l.template PushUnsafe<SizeType>() = index;
} }
return GetState(index).out == kRegexInvalidState; return s.out == kRegexInvalidState; // by using PushUnsafe() above, we can ensure s is not validated due to reallocation.
} }
bool MatchRange(SizeType rangeIndex, unsigned codepoint) const { bool MatchRange(SizeType rangeIndex, unsigned codepoint) const {
...@@ -642,6 +647,11 @@ private: ...@@ -642,6 +647,11 @@ private:
SizeType root_; SizeType root_;
SizeType stateCount_; SizeType stateCount_;
SizeType rangeCount_; SizeType rangeCount_;
// For SearchWithAnchoring()
uint32_t* stateSet_; // allocated by states_.GetAllocator()
mutable Stack<Allocator> state0_;
mutable Stack<Allocator> state1_;
bool anchorBegin_; bool anchorBegin_;
bool anchorEnd_; bool anchorEnd_;
}; };
......
...@@ -38,7 +38,6 @@ public: ...@@ -38,7 +38,6 @@ public:
// Optimization note: Do not allocate memory for stack_ in constructor. // Optimization note: Do not allocate memory for stack_ in constructor.
// Do it lazily when first Push() -> Expand() -> Resize(). // Do it lazily when first Push() -> Expand() -> Resize().
Stack(Allocator* allocator, size_t stackCapacity) : allocator_(allocator), ownAllocator_(0), stack_(0), stackTop_(0), stackEnd_(0), initialCapacity_(stackCapacity) { Stack(Allocator* allocator, size_t stackCapacity) : allocator_(allocator), ownAllocator_(0), stack_(0), stackTop_(0), stackEnd_(0), initialCapacity_(stackCapacity) {
RAPIDJSON_ASSERT(stackCapacity > 0);
} }
#if RAPIDJSON_HAS_CXX11_RVALUE_REFS #if RAPIDJSON_HAS_CXX11_RVALUE_REFS
......
...@@ -300,15 +300,17 @@ struct SchemaValidationContext { ...@@ -300,15 +300,17 @@ struct SchemaValidationContext {
factory.DestroySchemaValidator(patternPropertiesValidators[i]); factory.DestroySchemaValidator(patternPropertiesValidators[i]);
factory.FreeState(patternPropertiesValidators); factory.FreeState(patternPropertiesValidators);
} }
factory.FreeState(patternPropertiesSchemas); if (patternPropertiesSchemas)
factory.FreeState(objectDependencies); factory.FreeState(patternPropertiesSchemas);
if (objectDependencies)
factory.FreeState(objectDependencies);
} }
SchemaValidatorFactoryType& factory; SchemaValidatorFactoryType& factory;
const SchemaType* schema; const SchemaType* schema;
const SchemaType* valueSchema; const SchemaType* valueSchema;
const Ch* invalidKeyword; const Ch* invalidKeyword;
void* hasher; // Only calidator access void* hasher; // Only validator access
void* arrayElementHashCodes; // Only validator access this void* arrayElementHashCodes; // Only validator access this
ISchemaValidator** validators; ISchemaValidator** validators;
SizeType validatorCount; SizeType validatorCount;
...@@ -613,7 +615,7 @@ public: ...@@ -613,7 +615,7 @@ public:
return true; return true;
} }
bool EndValue(Context& context) const { RAPIDJSON_FORCEINLINE bool EndValue(Context& context) const {
if (context.patternPropertiesValidatorCount > 0) { if (context.patternPropertiesValidatorCount > 0) {
bool otherValid = false; bool otherValid = false;
SizeType count = context.patternPropertiesValidatorCount; SizeType count = context.patternPropertiesValidatorCount;
...@@ -1080,8 +1082,12 @@ private: ...@@ -1080,8 +1082,12 @@ private:
// O(n) // O(n)
template <typename ValueType> template <typename ValueType>
bool FindPropertyIndex(const ValueType& name, SizeType* outIndex) const { bool FindPropertyIndex(const ValueType& name, SizeType* outIndex) const {
SizeType len = name.GetStringLength();
const Ch* str = name.GetString();
for (SizeType index = 0; index < propertyCount_; index++) for (SizeType index = 0; index < propertyCount_; index++)
if (properties_[index].name == name) { if (properties_[index].name.GetStringLength() == len &&
(std::memcmp(properties_[index].name.GetString(), str, sizeof(Ch) * len) == 0))
{
*outIndex = index; *outIndex = index;
return true; return true;
} }
...@@ -1703,7 +1709,7 @@ private: ...@@ -1703,7 +1709,7 @@ private:
PushSchema(root_); PushSchema(root_);
else { else {
if (CurrentContext().inArray) if (CurrentContext().inArray)
AppendToken(CurrentContext().arrayElementIndex); AppendToken<Ch>(CurrentContext().arrayElementIndex);
if (!CurrentSchema().BeginValue(CurrentContext())) if (!CurrentSchema().BeginValue(CurrentContext()))
return false; return false;
...@@ -1767,21 +1773,23 @@ private: ...@@ -1767,21 +1773,23 @@ private:
} }
void AppendToken(const Ch* str, SizeType len) { void AppendToken(const Ch* str, SizeType len) {
*documentStack_.template Push<Ch>() = '/'; documentStack_.template Reserve<Ch>(1 + len * 2); // worst case all characters are escaped as two characters
*documentStack_.template PushUnsafe<Ch>() = '/';
for (SizeType i = 0; i < len; i++) { for (SizeType i = 0; i < len; i++) {
if (str[i] == '~') { if (str[i] == '~') {
*documentStack_.template Push<Ch>() = '~'; *documentStack_.template PushUnsafe<Ch>() = '~';
*documentStack_.template Push<Ch>() = '0'; *documentStack_.template PushUnsafe<Ch>() = '0';
} }
else if (str[i] == '/') { else if (str[i] == '/') {
*documentStack_.template Push<Ch>() = '~'; *documentStack_.template PushUnsafe<Ch>() = '~';
*documentStack_.template Push<Ch>() = '1'; *documentStack_.template PushUnsafe<Ch>() = '1';
} }
else else
*documentStack_.template Push<Ch>() = str[i]; *documentStack_.template PushUnsafe<Ch>() = str[i];
} }
} }
template<typename Ch>
void AppendToken(SizeType index) { void AppendToken(SizeType index) {
*documentStack_.template Push<Ch>() = '/'; *documentStack_.template Push<Ch>() = '/';
char buffer[21]; char buffer[21];
...@@ -1790,9 +1798,27 @@ private: ...@@ -1790,9 +1798,27 @@ private:
*documentStack_.template Push<Ch>() = buffer[i]; *documentStack_.template Push<Ch>() = buffer[i];
} }
void PushSchema(const SchemaType& schema) { new (schemaStack_.template Push<Context>()) Context(*this, &schema); } // Specialized version for char to prevent buffer copying.
template <>
void AppendToken<char>(SizeType index) {
if (sizeof(SizeType) == 4) {
char *buffer = documentStack_.template Push<Ch>(1 + 10); // '/' + uint
*buffer++ = '/';
const char* end = internal::u32toa(index, buffer);
documentStack_.template Pop<Ch>(static_cast<size_t>(10 - (end - buffer)));
}
else {
char *buffer = documentStack_.template Push<Ch>(1 + 20); // '/' + uint64
*buffer++ = '/';
const char* end = internal::u64toa(index, buffer);
documentStack_.template Pop<Ch>(static_cast<size_t>(20 - (end - buffer)));
}
}
RAPIDJSON_FORCEINLINE void PushSchema(const SchemaType& schema) { new (schemaStack_.template Push<Context>()) Context(*this, &schema); }
void PopSchema() { RAPIDJSON_FORCEINLINE void PopSchema() {
Context* c = schemaStack_.template Pop<Context>(1); Context* c = schemaStack_.template Pop<Context>(1);
if (HashCodeArray* a = static_cast<HashCodeArray*>(c->arrayElementHashCodes)) { if (HashCodeArray* a = static_cast<HashCodeArray*>(c->arrayElementHashCodes)) {
a->~HashCodeArray(); a->~HashCodeArray();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment