Added RFC4627's encoding detection method, with refactored BOM detection.

git-svn-id: https://rapidjson.googlecode.com/svn/trunk@45 c5894555-1306-4e8d-425f-1f6f381ee07c

Added RFC4627's encoding detection method, with refactored BOM detection.
git-svn-id: https://rapidjson.googlecode.com/svn/trunk@45 c5894555-1306-4e8d-425f-1f6f381ee07c
c51c90b2 · miloyip@gmail.com · 8c15b1d5 · c51c90b2 · c51c90b2 · c51c90b2
Commit c51c90b2 authored Dec 01, 2011 by miloyip@gmail.com
4 changed files
--- a/include/rapidjson/encodedstream.h
+++ b/include/rapidjson/encodedstream.h
@@ -63,12 +63,16 @@ public:
 	typedef CharType Ch;

 	// Wraps the input stream `is`. `type` (default kUTF8) seeds type_.
 	// After this change the constructor calls DetectType(is), which may overwrite type_,
 	// then caches the entry of the ENCODINGS_FUNC(Take) table selected by type_ in takeFunc_
 	// and pre-reads the first character into current_.
 	// NOTE(review): ENCODINGS_FUNC is defined outside this hunk; its entry order presumably matches UTFType — confirm.
 	AutoUTFInputStream(InputStream& is, UTFType type = kUTF8) : is_(is), type_(type) {
-		TakeBOM(is);
-		Read();
+		DetectType(is);
+		static const TakeFunc f[] = { ENCODINGS_FUNC(Take) };
+		takeFunc_ = f[type_];
+		current_ = takeFunc_(is_);
 	}

+	// Returns the value held in type_ (seeded by the constructor argument; DetectType() may overwrite it).
+	UTFType GetType() const { return type_; }
+
 	Ch Peek() const { return current_; }
-	Ch Take() { Ch c = current_; Read(); return c; }
+	Ch Take() { Ch c = current_; current_ = takeFunc_(is_); return c; }
 	// Forwards to the wrapped stream's Tell() and returns its result.
 	// (The previous body dropped the value and fell off the end of a non-void function.)
 	size_t Tell() const { return is_.Tell(); }

 	// Not implemented
@@ -78,21 +82,47 @@ public:
 	size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }

 private:
-	friend struct AutoUTF<Ch>;
-
-	void TakeBOM(InputStream& is) {
-#define ASSUME(x) if ((unsigned char)is.Peek() != x) break; is.Take()
-		switch ((unsigned char)is.Peek()) {
-		case 0x00: is.Take(); ASSUME(0x00); ASSUME(0xFE); ASSUME(0xFF); type_ = kUTF32BE; break;
-		case 0xEF: is.Take(); ASSUME(0xBB); ASSUME(0xBF); type_ = kUTF8; break;
-		case 0xFE: is.Take(); ASSUME(0xFF); type_ = kUTF16BE; break;
-		case 0xFF: is.Take(); ASSUME(0xFE); 
-			if (is.Peek() == 0x00) {
-				is.Take(); ASSUME(0x00); type_ = kUTF32LE; break;
-			}
-			type_ = kUTF16LE;
+	// Detect encoding type with BOM or RFC 4627
+	void DetectType(InputStream& is) {
+		// BOM (Byte Order Mark):
+		// 00 00 FE FF  UTF-32BE
+		// FF FE 00 00  UTF-32LE
+		// FE FF		UTF-16BE
+		// FF FE		UTF-16LE
+		// EF BB BF		UTF-8
+
+		const unsigned char* c = (const unsigned char *)is.Peek4();
+		if (!c)
+			return;
+
+		unsigned bom = c[0] | (c[1] << 8) | (c[2] << 16) | (c[3] << 24);
+		if (bom == 0xFFFE0000)					{ type_ = kUTF32BE; is.Take(); is.Take(); is.Take(); is.Take(); goto sizecheck; }
+		else if (bom == 0x0000FEFF)				{ type_ = kUTF32LE;	is.Take(); is.Take(); is.Take(); is.Take();	goto sizecheck;	}
+		else if ((bom & 0xFFFF) == 0xFFFE)		{ type_ = kUTF16BE; is.Take(); is.Take();						goto sizecheck; }
+		else if ((bom & 0xFFFF) == 0xFEFF)		{ type_ = kUTF16LE; is.Take(); is.Take();						goto sizecheck; }
+		else if ((bom & 0xFFFFFF) == 0xBFBBEF)	{ type_ = kUTF8;	is.Take(); is.Take(); is.Take();			goto sizecheck; }
+
+		// RFC 4627: Section 3
+		// "Since the first two characters of a JSON text will always be ASCII
+		// characters [RFC0020], it is possible to determine whether an octet
+		// stream is UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE) by looking
+		// at the pattern of nulls in the first four octets."
+		// 00 00 00 xx  UTF-32BE
+		// 00 xx 00 xx  UTF-16BE
+		// xx 00 00 00  UTF-32LE
+		// xx 00 xx 00  UTF-16LE
+		// xx xx xx xx  UTF-8
+
+		unsigned pattern = (c[0] ? 1 : 0) | (c[1] ? 2 : 0) | (c[2] ? 4 : 0) | (c[3] ? 8 : 0);
+		switch (pattern) {
+		case 0x08: type_ = kUTF32BE; break;
+		case 0x0A: type_ = kUTF16BE; break;
+		case 0x01: type_ = kUTF32LE; break;
+		case 0x05: type_ = kUTF16LE; break;
+		case 0x0F: type_ = kUTF8;    break;
 		}
-#undef ASSUME
+
+	sizecheck:
 		// Runtime check whether the size of the character type is sufficient. It only performs the check with assertions.
 		switch (type_) {
 		case kUTF16LE:
@@ -106,15 +136,11 @@ private:
 		}
 	}

-	void Read() {
-		typedef Ch (*TakeFunc)(InputStream& is);
-		static const TakeFunc f[] = { ENCODINGS_FUNC(Take) };
-		current_ = f[type_](is_);
-	}
-
+	typedef Ch (*TakeFunc)(InputStream& is);
 	InputStream& is_;
 	UTFType type_;
 	Ch current_;
+	TakeFunc takeFunc_;
 };

 template <typename CharType, typename OutputStream>
@@ -135,14 +161,17 @@ public:
 			break;
 		}

+		static const PutFunc f[] = { ENCODINGS_FUNC(Put) };
+		putFunc_ = f[type_];
+
 		if (putBOM)
 			PutBOM();
 	}

+	// Returns the value held in type_.
+	UTFType GetType() const { return type_; }
+
 	// Writes one character to os_ through the encoder function.
 	// This change replaces the per-call lookup in the ENCODINGS_FUNC(Put) table
 	// with a call through the cached putFunc_ pointer.
 	void Put(Ch c) { 
-		typedef void (*PutFunc)(OutputStream&, Ch);
-		static const PutFunc f[] = { ENCODINGS_FUNC(Put) };
-		f[type_](os_, c);
+		putFunc_(os_, c);
 	}

 	void Flush() { os_.Flush(); } 
@@ -155,17 +184,17 @@ public:
 	size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }

 private:
-	friend struct AutoUTF<Ch>;
-
 	// Calls the entry of the ENCODINGS_FUNC(PutBOM) table selected by type_, passing os_.
 	// NOTE(review): ENCODINGS_FUNC is defined outside this hunk; its entry order presumably matches UTFType — confirm.
 	void PutBOM() { 
 		typedef void (*PutBOMFunc)(OutputStream&);
 		static const PutBOMFunc f[] = { ENCODINGS_FUNC(PutBOM) };
 		f[type_](os_);
 	}

+	typedef void (*PutFunc)(OutputStream&, Ch);

 	OutputStream& os_;
 	UTFType type_;
+	PutFunc putFunc_;
 };

 #undef ENCODINGS_FUNC

--- a/include/rapidjson/encodings.h
+++ b/include/rapidjson/encodings.h
@@ -99,7 +99,7 @@ struct UTF8 {
 	template <typename InputStream, typename OutputStream>
 	RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) {
 #define COPY() os.Put(c = is.Take())
-#define TRANS(mask) result &= ((GetType(c) & mask) != 0)
+#define TRANS(mask) result &= ((GetType((unsigned char)c) & mask) != 0)
 #define TAIL() COPY(); TRANS(0x70)
 		Ch c;
 		COPY();
@@ -107,7 +107,7 @@ struct UTF8 {
 			return true;

 		bool result = true;
-		switch (GetType(c)) {
+		switch (GetType((unsigned char)c)) {
 		case 2:	TAIL(); return result;
 		case 3:	TAIL(); TAIL(); return result;
 		case 4:	COPY(); TRANS(0x50); TAIL(); return result;
@@ -397,21 +397,21 @@ struct AutoUTF {
 	RAPIDJSON_FORCEINLINE static void Encode(OutputStream& os, unsigned codepoint) {
 		typedef void (*EncodeFunc)(OutputStream&, unsigned);
 		static const EncodeFunc f[] = { ENCODINGS_FUNC(Encode) };
-		(*f[os.type_])(os, codepoint);
+		(*f[os.GetType()])(os, codepoint);
 	}

 	// Runtime-dispatched decode: indexes a static table built from ENCODINGS_FUNC(Decode)
 	// with the input stream's UTF type and forwards (is, codepoint) to that entry, returning its result.
 	// This change reads the type through is.GetType() instead of the member is.type_.
 	// NOTE(review): ENCODINGS_FUNC is defined outside this hunk; its entry order presumably matches UTFType — confirm.
 	template <typename InputStream>
 	RAPIDJSON_FORCEINLINE static bool Decode(InputStream& is, unsigned* codepoint) {
 		typedef bool (*DecodeFunc)(InputStream&, unsigned*);
 		static const DecodeFunc f[] = { ENCODINGS_FUNC(Decode) };
-		return (*f[is.type_])(is, codepoint);
+		return (*f[is.GetType()])(is, codepoint);
 	}

 	// Runtime-dispatched validation: indexes a static table built from ENCODINGS_FUNC(Validate)
 	// with the input stream's UTF type and forwards (is, os) to that entry, returning its result.
 	// The table's element type must match Validate(InputStream&, OutputStream&); the former
 	// (InputStream&, unsigned*) signature was Decode's and cannot accept the (is, os) call below.
 	template <typename InputStream, typename OutputStream>
 	RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) {
 		typedef bool (*ValidateFunc)(InputStream&, OutputStream&);
 		static const ValidateFunc f[] = { ENCODINGS_FUNC(Validate) };
-		return (*f[is.type_])(is, os);
+		return (*f[is.GetType()])(is, os);
 	}

 #undef ENCODINGS_FUNC

--- a/include/rapidjson/filereadstream.h
+++ b/include/rapidjson/filereadstream.h
@@ -16,6 +16,7 @@ public:

 	// Initialises the members from fp / buffer / bufferSize, asserts that fp is non-null and
 	// (added by this change) that bufferSize is at least 4, then calls Read().
 	// NOTE(review): the 4-byte minimum presumably exists so Peek4() can expose four bytes — confirm.
 	FileReadStream(FILE* fp, char* buffer, size_t bufferSize) : fp_(fp), buffer_(buffer), bufferSize_(bufferSize), bufferLast_(0), current_(buffer_), readCount_(0), count_(0), eof_(false) { 
 		RAPIDJSON_ASSERT(fp_ != 0);
+		RAPIDJSON_ASSERT(bufferSize >= 4);
 		Read();
 	}

@@ -29,6 +30,11 @@ public:
 	char* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
 	size_t PutEnd(char*) { RAPIDJSON_ASSERT(false); return 0; }

+	// For encoding detection only.
+	// Returns current_ when (current_ + 4 <= bufferLast_) holds, otherwise 0 (null).
+	// Const member: it does not advance the read position.
+	// NOTE(review): whether bufferLast_ points at the last valid byte or one past it is not
+	// visible in this hunk — confirm the bound is not off by one.
+	const char* Peek4() const {
+		return (current_ + 4 <= bufferLast_) ? current_ : 0;
+	}
+
 private:
 	void Read() {
 		if (current_ < bufferLast_)

--- a/test/unittest/encodingstest.cpp
+++ b/test/unittest/encodingstest.cpp
@@ -151,23 +151,27 @@ TEST_F(EncodingsTest, EncodedInputStream) {
 // Calls TestAutoUTFInputStream once per fixture file: UTF-8, UTF-16 LE/BE and UTF-32 LE/BE,
 // each with and without a BOM. The BOM-less UTF-16/UTF-32 fixtures are the calls added by this change.
 TEST_F(EncodingsTest, AutoUTFInputStream) {
 	TestAutoUTFInputStream("utf8.json");
 	TestAutoUTFInputStream("utf8bom.json");
+	TestAutoUTFInputStream("utf16le.json");
 	TestAutoUTFInputStream("utf16lebom.json");
+	TestAutoUTFInputStream("utf16be.json");
 	TestAutoUTFInputStream("utf16bebom.json");
+	TestAutoUTFInputStream("utf32le.json");
 	TestAutoUTFInputStream("utf32lebom.json");
+	TestAutoUTFInputStream("utf32be.json");
 	TestAutoUTFInputStream("utf32bebom.json");
 }

 TEST_F(EncodingsTest, EncodedOutputStream) {
 	TestEncodedOutputStream<UTF8<>,		UTF8<>	>("utf8.json",		false);
-	TestEncodedOutputStream<UTF8<>,		UTF8<>	>("utf8bom.json",		true);
+	TestEncodedOutputStream<UTF8<>,		UTF8<>	>("utf8bom.json",	true);
 	TestEncodedOutputStream<UTF16LE<>,	UTF16<> >("utf16le.json",	false);
-	TestEncodedOutputStream<UTF16LE<>,	UTF16<> >("utf16lebom.json",	true);
+	TestEncodedOutputStream<UTF16LE<>,	UTF16<> >("utf16lebom.json",true);
 	TestEncodedOutputStream<UTF16BE<>,	UTF16<> >("utf16be.json",	false);
-	TestEncodedOutputStream<UTF16BE<>,	UTF16<> >("utf16bebom.json",	true);
+	TestEncodedOutputStream<UTF16BE<>,	UTF16<> >("utf16bebom.json",true);
 	TestEncodedOutputStream<UTF32LE<>,	UTF32<> >("utf32le.json",	false);
-	TestEncodedOutputStream<UTF32LE<>,	UTF32<> >("utf32lebom.json",	true);
+	TestEncodedOutputStream<UTF32LE<>,	UTF32<> >("utf32lebom.json",true);
 	TestEncodedOutputStream<UTF32BE<>,	UTF32<> >("utf32be.json",	false);
-	TestEncodedOutputStream<UTF32BE<>,	UTF32<> >("utf32bebom.json",	true);
+	TestEncodedOutputStream<UTF32BE<>,	UTF32<> >("utf32bebom.json",true);
 }

 TEST_F(EncodingsTest, AutoUTFOutputStream) {