Commit c51c90b2 authored by miloyip@gmail.com

Added RFC4627's encoding detection method, with refactored BOM detection.

git-svn-id: https://rapidjson.googlecode.com/svn/trunk@45 c5894555-1306-4e8d-425f-1f6f381ee07c
parent 8c15b1d5
...@@ -63,12 +63,16 @@ public: ...@@ -63,12 +63,16 @@ public:
typedef CharType Ch; typedef CharType Ch;
AutoUTFInputStream(InputStream& is, UTFType type = kUTF8) : is_(is), type_(type) { AutoUTFInputStream(InputStream& is, UTFType type = kUTF8) : is_(is), type_(type) {
TakeBOM(is); DetectType(is);
Read(); static const TakeFunc f[] = { ENCODINGS_FUNC(Take) };
takeFunc_ = f[type_];
current_ = takeFunc_(is_);
} }
UTFType GetType() const { return type_; }
Ch Peek() const { return current_; } Ch Peek() const { return current_; }
Ch Take() { Ch c = current_; Read(); return c; } Ch Take() { Ch c = current_; current_ = takeFunc_(is_); return c; }
size_t Tell() const { is_.Tell(); } size_t Tell() const { is_.Tell(); }
// Not implemented // Not implemented
...@@ -78,21 +82,47 @@ public: ...@@ -78,21 +82,47 @@ public:
size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
private: private:
friend struct AutoUTF<Ch>; // Detect encoding type with BOM or RFC 4627
void DetectType(InputStream& is) {
void TakeBOM(InputStream& is) { // BOM (Byte Order Mark):
#define ASSUME(x) if ((unsigned char)is.Peek() != x) break; is.Take() // 00 00 FE FF UTF-32BE
switch ((unsigned char)is.Peek()) { // FF FE 00 00 UTF-32LE
case 0x00: is.Take(); ASSUME(0x00); ASSUME(0xFE); ASSUME(0xFF); type_ = kUTF32BE; break; // FE FF UTF-16BE
case 0xEF: is.Take(); ASSUME(0xBB); ASSUME(0xBF); type_ = kUTF8; break; // FF FE UTF-16LE
case 0xFE: is.Take(); ASSUME(0xFF); type_ = kUTF16BE; break; // EF BB BF UTF-8
case 0xFF: is.Take(); ASSUME(0xFE);
if (is.Peek() == 0x00) { const unsigned char* c = (const unsigned char *)is.Peek4();
is.Take(); ASSUME(0x00); type_ = kUTF32LE; break; if (!c)
} return;
type_ = kUTF16LE;
unsigned bom = c[0] | (c[1] << 8) | (c[2] << 16) | (c[3] << 24);
if (bom == 0xFFFE0000) { type_ = kUTF32BE; is.Take(); is.Take(); is.Take(); is.Take(); goto sizecheck; }
else if (bom == 0x0000FEFF) { type_ = kUTF32LE; is.Take(); is.Take(); is.Take(); is.Take(); goto sizecheck; }
else if ((bom & 0xFFFF) == 0xFFFE) { type_ = kUTF16BE; is.Take(); is.Take(); goto sizecheck; }
else if ((bom & 0xFFFF) == 0xFEFF) { type_ = kUTF16LE; is.Take(); is.Take(); goto sizecheck; }
else if ((bom & 0xFFFFFF) == 0xBFBBEF) { type_ = kUTF8; is.Take(); is.Take(); is.Take(); goto sizecheck; }
// RFC 4627: Section 3
// "Since the first two characters of a JSON text will always be ASCII
// characters [RFC0020], it is possible to determine whether an octet
// stream is UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE) by looking
// at the pattern of nulls in the first four octets."
// 00 00 00 xx UTF-32BE
// 00 xx 00 xx UTF-16BE
// xx 00 00 00 UTF-32LE
// xx 00 xx 00 UTF-16LE
// xx xx xx xx UTF-8
unsigned pattern = (c[0] ? 1 : 0) | (c[1] ? 2 : 0) | (c[2] ? 4 : 0) | (c[3] ? 8 : 0);
switch (pattern) {
case 0x08: type_ = kUTF32BE; break;
case 0x0A: type_ = kUTF16BE; break;
case 0x01: type_ = kUTF32LE; break;
case 0x05: type_ = kUTF16LE; break;
case 0x0F: type_ = kUTF8; break;
} }
#undef ASSUME
sizecheck:
// RUntime check whether the size of character type is sufficient. It only perform checks with assertion. // RUntime check whether the size of character type is sufficient. It only perform checks with assertion.
switch (type_) { switch (type_) {
case kUTF16LE: case kUTF16LE:
...@@ -106,15 +136,11 @@ private: ...@@ -106,15 +136,11 @@ private:
} }
} }
void Read() { typedef Ch (*TakeFunc)(InputStream& is);
typedef Ch (*TakeFunc)(InputStream& is);
static const TakeFunc f[] = { ENCODINGS_FUNC(Take) };
current_ = f[type_](is_);
}
InputStream& is_; InputStream& is_;
UTFType type_; UTFType type_;
Ch current_; Ch current_;
TakeFunc takeFunc_;
}; };
template <typename CharType, typename OutputStream> template <typename CharType, typename OutputStream>
...@@ -135,14 +161,17 @@ public: ...@@ -135,14 +161,17 @@ public:
break; break;
} }
static const PutFunc f[] = { ENCODINGS_FUNC(Put) };
putFunc_ = f[type_];
if (putBOM) if (putBOM)
PutBOM(); PutBOM();
} }
UTFType GetType() const { return type_; }
void Put(Ch c) { void Put(Ch c) {
typedef void (*PutFunc)(OutputStream&, Ch); putFunc_(os_, c);
static const PutFunc f[] = { ENCODINGS_FUNC(Put) };
f[type_](os_, c);
} }
void Flush() { os_.Flush(); } void Flush() { os_.Flush(); }
...@@ -155,17 +184,17 @@ public: ...@@ -155,17 +184,17 @@ public:
size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
private: private:
friend struct AutoUTF<Ch>;
void PutBOM() { void PutBOM() {
typedef void (*PutBOMFunc)(OutputStream&); typedef void (*PutBOMFunc)(OutputStream&);
static const PutBOMFunc f[] = { ENCODINGS_FUNC(PutBOM) }; static const PutBOMFunc f[] = { ENCODINGS_FUNC(PutBOM) };
f[type_](os_); f[type_](os_);
} }
typedef void (*PutFunc)(OutputStream&, Ch);
OutputStream& os_; OutputStream& os_;
UTFType type_; UTFType type_;
PutFunc putFunc_;
}; };
#undef ENCODINGS_FUNC #undef ENCODINGS_FUNC
......
...@@ -99,7 +99,7 @@ struct UTF8 { ...@@ -99,7 +99,7 @@ struct UTF8 {
template <typename InputStream, typename OutputStream> template <typename InputStream, typename OutputStream>
RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) { RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) {
#define COPY() os.Put(c = is.Take()) #define COPY() os.Put(c = is.Take())
#define TRANS(mask) result &= ((GetType(c) & mask) != 0) #define TRANS(mask) result &= ((GetType((unsigned char)c) & mask) != 0)
#define TAIL() COPY(); TRANS(0x70) #define TAIL() COPY(); TRANS(0x70)
Ch c; Ch c;
COPY(); COPY();
...@@ -107,7 +107,7 @@ struct UTF8 { ...@@ -107,7 +107,7 @@ struct UTF8 {
return true; return true;
bool result = true; bool result = true;
switch (GetType(c)) { switch (GetType((unsigned char)c)) {
case 2: TAIL(); return result; case 2: TAIL(); return result;
case 3: TAIL(); TAIL(); return result; case 3: TAIL(); TAIL(); return result;
case 4: COPY(); TRANS(0x50); TAIL(); return result; case 4: COPY(); TRANS(0x50); TAIL(); return result;
...@@ -397,21 +397,21 @@ struct AutoUTF { ...@@ -397,21 +397,21 @@ struct AutoUTF {
RAPIDJSON_FORCEINLINE static void Encode(OutputStream& os, unsigned codepoint) { RAPIDJSON_FORCEINLINE static void Encode(OutputStream& os, unsigned codepoint) {
typedef void (*EncodeFunc)(OutputStream&, unsigned); typedef void (*EncodeFunc)(OutputStream&, unsigned);
static const EncodeFunc f[] = { ENCODINGS_FUNC(Encode) }; static const EncodeFunc f[] = { ENCODINGS_FUNC(Encode) };
(*f[os.type_])(os, codepoint); (*f[os.GetType()])(os, codepoint);
} }
template <typename InputStream> template <typename InputStream>
RAPIDJSON_FORCEINLINE static bool Decode(InputStream& is, unsigned* codepoint) { RAPIDJSON_FORCEINLINE static bool Decode(InputStream& is, unsigned* codepoint) {
typedef bool (*DecodeFunc)(InputStream&, unsigned*); typedef bool (*DecodeFunc)(InputStream&, unsigned*);
static const DecodeFunc f[] = { ENCODINGS_FUNC(Decode) }; static const DecodeFunc f[] = { ENCODINGS_FUNC(Decode) };
return (*f[is.type_])(is, codepoint); return (*f[is.GetType()])(is, codepoint);
} }
template <typename InputStream, typename OutputStream> template <typename InputStream, typename OutputStream>
RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) { RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) {
typedef bool (*ValidateFunc)(InputStream&, unsigned*); typedef bool (*ValidateFunc)(InputStream&, unsigned*);
static const ValidateFunc f[] = { ENCODINGS_FUNC(Validate) }; static const ValidateFunc f[] = { ENCODINGS_FUNC(Validate) };
return (*f[is.type_])(is, os); return (*f[is.GetType()])(is, os);
} }
#undef ENCODINGS_FUNC #undef ENCODINGS_FUNC
......
...@@ -16,6 +16,7 @@ public: ...@@ -16,6 +16,7 @@ public:
FileReadStream(FILE* fp, char* buffer, size_t bufferSize) : fp_(fp), buffer_(buffer), bufferSize_(bufferSize), bufferLast_(0), current_(buffer_), readCount_(0), count_(0), eof_(false) { FileReadStream(FILE* fp, char* buffer, size_t bufferSize) : fp_(fp), buffer_(buffer), bufferSize_(bufferSize), bufferLast_(0), current_(buffer_), readCount_(0), count_(0), eof_(false) {
RAPIDJSON_ASSERT(fp_ != 0); RAPIDJSON_ASSERT(fp_ != 0);
RAPIDJSON_ASSERT(bufferSize >= 4);
Read(); Read();
} }
...@@ -29,6 +30,11 @@ public: ...@@ -29,6 +30,11 @@ public:
char* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } char* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
size_t PutEnd(char*) { RAPIDJSON_ASSERT(false); return 0; } size_t PutEnd(char*) { RAPIDJSON_ASSERT(false); return 0; }
// For encoding detection only.
const char* Peek4() const {
return (current_ + 4 <= bufferLast_) ? current_ : 0;
}
private: private:
void Read() { void Read() {
if (current_ < bufferLast_) if (current_ < bufferLast_)
......
...@@ -151,23 +151,27 @@ TEST_F(EncodingsTest, EncodedInputStream) { ...@@ -151,23 +151,27 @@ TEST_F(EncodingsTest, EncodedInputStream) {
TEST_F(EncodingsTest, AutoUTFInputStream) { TEST_F(EncodingsTest, AutoUTFInputStream) {
TestAutoUTFInputStream("utf8.json"); TestAutoUTFInputStream("utf8.json");
TestAutoUTFInputStream("utf8bom.json"); TestAutoUTFInputStream("utf8bom.json");
TestAutoUTFInputStream("utf16le.json");
TestAutoUTFInputStream("utf16lebom.json"); TestAutoUTFInputStream("utf16lebom.json");
TestAutoUTFInputStream("utf16be.json");
TestAutoUTFInputStream("utf16bebom.json"); TestAutoUTFInputStream("utf16bebom.json");
TestAutoUTFInputStream("utf32le.json");
TestAutoUTFInputStream("utf32lebom.json"); TestAutoUTFInputStream("utf32lebom.json");
TestAutoUTFInputStream("utf32be.json");
TestAutoUTFInputStream("utf32bebom.json"); TestAutoUTFInputStream("utf32bebom.json");
} }
TEST_F(EncodingsTest, EncodedOutputStream) { TEST_F(EncodingsTest, EncodedOutputStream) {
TestEncodedOutputStream<UTF8<>, UTF8<> >("utf8.json", false); TestEncodedOutputStream<UTF8<>, UTF8<> >("utf8.json", false);
TestEncodedOutputStream<UTF8<>, UTF8<> >("utf8bom.json", true); TestEncodedOutputStream<UTF8<>, UTF8<> >("utf8bom.json", true);
TestEncodedOutputStream<UTF16LE<>, UTF16<> >("utf16le.json", false); TestEncodedOutputStream<UTF16LE<>, UTF16<> >("utf16le.json", false);
TestEncodedOutputStream<UTF16LE<>, UTF16<> >("utf16lebom.json", true); TestEncodedOutputStream<UTF16LE<>, UTF16<> >("utf16lebom.json",true);
TestEncodedOutputStream<UTF16BE<>, UTF16<> >("utf16be.json", false); TestEncodedOutputStream<UTF16BE<>, UTF16<> >("utf16be.json", false);
TestEncodedOutputStream<UTF16BE<>, UTF16<> >("utf16bebom.json", true); TestEncodedOutputStream<UTF16BE<>, UTF16<> >("utf16bebom.json",true);
TestEncodedOutputStream<UTF32LE<>, UTF32<> >("utf32le.json", false); TestEncodedOutputStream<UTF32LE<>, UTF32<> >("utf32le.json", false);
TestEncodedOutputStream<UTF32LE<>, UTF32<> >("utf32lebom.json", true); TestEncodedOutputStream<UTF32LE<>, UTF32<> >("utf32lebom.json",true);
TestEncodedOutputStream<UTF32BE<>, UTF32<> >("utf32be.json", false); TestEncodedOutputStream<UTF32BE<>, UTF32<> >("utf32be.json", false);
TestEncodedOutputStream<UTF32BE<>, UTF32<> >("utf32bebom.json", true); TestEncodedOutputStream<UTF32BE<>, UTF32<> >("utf32bebom.json",true);
} }
TEST_F(EncodingsTest, AutoUTFOutputStream) { TEST_F(EncodingsTest, AutoUTFOutputStream) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment