#include "unittest.h"
#include "rapidjson/filereadstream.h"
#include "rapidjson/filewritestream.h"
#include "rapidjson/encodedstream.h"
#include "rapidjson/stringbuffer.h"

using namespace rapidjson;

// Verification of encoders/decoders with Hoehrmann's UTF8 decoder

// http://www.unicode.org/Public/UNIDATA/Blocks.txt
static const unsigned kCodepointRanges[] = {
	0x0000,		0x007F,		// Basic Latin
	0x0080,		0x00FF,		// Latin-1 Supplement
	0x0100,		0x017F,		// Latin Extended-A
	0x0180,		0x024F,		// Latin Extended-B
	0x0250,		0x02AF,		// IPA Extensions
	0x02B0,		0x02FF,		// Spacing Modifier Letters
	0x0300,		0x036F,		// Combining Diacritical Marks
	0x0370,		0x03FF,		// Greek and Coptic
	0x0400,		0x04FF,		// Cyrillic
	0x0500,		0x052F,		// Cyrillic Supplement
	0x0530,		0x058F,		// Armenian
	0x0590,		0x05FF,		// Hebrew
	0x0600,		0x06FF,		// Arabic
	0x0700,		0x074F,		// Syriac
	0x0750,		0x077F,		// Arabic Supplement
	0x0780,		0x07BF,		// Thaana
	0x07C0,		0x07FF,		// NKo
	0x0800,		0x083F,		// Samaritan
	0x0840,		0x085F,		// Mandaic
	0x0900,		0x097F,		// Devanagari
	0x0980,		0x09FF,		// Bengali
	0x0A00,		0x0A7F,		// Gurmukhi
	0x0A80,		0x0AFF,		// Gujarati
	0x0B00,		0x0B7F,		// Oriya
	0x0B80,		0x0BFF,		// Tamil
	0x0C00,		0x0C7F,		// Telugu
	0x0C80,		0x0CFF,		// Kannada
	0x0D00,		0x0D7F,		// Malayalam
	0x0D80,		0x0DFF,		// Sinhala
	0x0E00,		0x0E7F,		// Thai
	0x0E80,		0x0EFF,		// Lao
	0x0F00,		0x0FFF,		// Tibetan
	0x1000,		0x109F,		// Myanmar
	0x10A0,		0x10FF,		// Georgian
	0x1100,		0x11FF,		// Hangul Jamo
	0x1200,		0x137F,		// Ethiopic
	0x1380,		0x139F,		// Ethiopic Supplement
	0x13A0,		0x13FF,		// Cherokee
	0x1400,		0x167F,		// Unified Canadian Aboriginal Syllabics
	0x1680,		0x169F,		// Ogham
	0x16A0,		0x16FF,		// Runic
	0x1700,		0x171F,		// Tagalog
	0x1720,		0x173F,		// Hanunoo
	0x1740,		0x175F,		// Buhid
	0x1760,		0x177F,		// Tagbanwa
	0x1780,		0x17FF,		// Khmer
	0x1800,		0x18AF,		// Mongolian
	0x18B0,		0x18FF,		// Unified Canadian Aboriginal Syllabics Extended
	0x1900,		0x194F,		// Limbu
	0x1950,		0x197F,		// Tai Le
	0x1980,		0x19DF,		// New Tai Lue
	0x19E0,		0x19FF,		// Khmer Symbols
	0x1A00,		0x1A1F,		// Buginese
	0x1A20,		0x1AAF,		// Tai Tham
	0x1B00,		0x1B7F,		// Balinese
	0x1B80,		0x1BBF,		// Sundanese
	0x1BC0,		0x1BFF,		// Batak
	0x1C00,		0x1C4F,		// Lepcha
	0x1C50,		0x1C7F,		// Ol Chiki
	0x1CD0,		0x1CFF,		// Vedic Extensions
	0x1D00,		0x1D7F,		// Phonetic Extensions
	0x1D80,		0x1DBF,		// Phonetic Extensions Supplement
	0x1DC0,		0x1DFF,		// Combining Diacritical Marks Supplement
	0x1E00,		0x1EFF,		// Latin Extended Additional
	0x1F00,		0x1FFF,		// Greek Extended
	0x2000,		0x206F,		// General Punctuation
	0x2070,		0x209F,		// Superscripts and Subscripts
	0x20A0,		0x20CF,		// Currency Symbols
	0x20D0,		0x20FF,		// Combining Diacritical Marks for Symbols
	0x2100,		0x214F,		// Letterlike Symbols
	0x2150,		0x218F,		// Number Forms
	0x2190,		0x21FF,		// Arrows
	0x2200,		0x22FF,		// Mathematical Operators
	0x2300,		0x23FF,		// Miscellaneous Technical
	0x2400,		0x243F,		// Control Pictures
	0x2440,		0x245F,		// Optical Character Recognition
	0x2460,		0x24FF,		// Enclosed Alphanumerics
	0x2500,		0x257F,		// Box Drawing
	0x2580,		0x259F,		// Block Elements
	0x25A0,		0x25FF,		// Geometric Shapes
	0x2600,		0x26FF,		// Miscellaneous Symbols
	0x2700,		0x27BF,		// Dingbats
	0x27C0,		0x27EF,		// Miscellaneous Mathematical Symbols-A
	0x27F0,		0x27FF,		// Supplemental Arrows-A
	0x2800,		0x28FF,		// Braille Patterns
	0x2900,		0x297F,		// Supplemental Arrows-B
	0x2980,		0x29FF,		// Miscellaneous Mathematical Symbols-B
	0x2A00,		0x2AFF,		// Supplemental Mathematical Operators
	0x2B00,		0x2BFF,		// Miscellaneous Symbols and Arrows
	0x2C00,		0x2C5F,		// Glagolitic
	0x2C60,		0x2C7F,		// Latin Extended-C
	0x2C80,		0x2CFF,		// Coptic
	0x2D00,		0x2D2F,		// Georgian Supplement
	0x2D30,		0x2D7F,		// Tifinagh
	0x2D80,		0x2DDF,		// Ethiopic Extended
	0x2DE0,		0x2DFF,		// Cyrillic Extended-A
	0x2E00,		0x2E7F,		// Supplemental Punctuation
	0x2E80,		0x2EFF,		// CJK Radicals Supplement
	0x2F00,		0x2FDF,		// Kangxi Radicals
	0x2FF0,		0x2FFF,		// Ideographic Description Characters
	0x3000,		0x303F,		// CJK Symbols and Punctuation
	0x3040,		0x309F,		// Hiragana
	0x30A0,		0x30FF,		// Katakana
	0x3100,		0x312F,		// Bopomofo
	0x3130,		0x318F,		// Hangul Compatibility Jamo
	0x3190,		0x319F,		// Kanbun
	0x31A0,		0x31BF,		// Bopomofo Extended
	0x31C0,		0x31EF,		// CJK Strokes
	0x31F0,		0x31FF,		// Katakana Phonetic Extensions
	0x3200,		0x32FF,		// Enclosed CJK Letters and Months
	0x3300,		0x33FF,		// CJK Compatibility
	0x3400,		0x4DBF,		// CJK Unified Ideographs Extension A
	0x4DC0,		0x4DFF,		// Yijing Hexagram Symbols
	0x4E00,		0x9FFF,		// CJK Unified Ideographs
	0xA000,		0xA48F,		// Yi Syllables
	0xA490,		0xA4CF,		// Yi Radicals
	0xA4D0,		0xA4FF,		// Lisu
	0xA500,		0xA63F,		// Vai
	0xA640,		0xA69F,		// Cyrillic Extended-B
	0xA6A0,		0xA6FF,		// Bamum
	0xA700,		0xA71F,		// Modifier Tone Letters
	0xA720,		0xA7FF,		// Latin Extended-D
	0xA800,		0xA82F,		// Syloti Nagri
	0xA830,		0xA83F,		// Common Indic Number Forms
	0xA840,		0xA87F,		// Phags-pa
	0xA880,		0xA8DF,		// Saurashtra
	0xA8E0,		0xA8FF,		// Devanagari Extended
	0xA900,		0xA92F,		// Kayah Li
	0xA930,		0xA95F,		// Rejang
	0xA960,		0xA97F,		// Hangul Jamo Extended-A
	0xA980,		0xA9DF,		// Javanese
	0xAA00,		0xAA5F,		// Cham
	0xAA60,		0xAA7F,		// Myanmar Extended-A
	0xAA80,		0xAADF,		// Tai Viet
	0xAB00,		0xAB2F,		// Ethiopic Extended-A
	0xABC0,		0xABFF,		// Meetei Mayek
	0xAC00,		0xD7AF,		// Hangul Syllables
	0xD7B0,		0xD7FF,		// Hangul Jamo Extended-B
	//0xD800,		0xDB7F,		// High Surrogates
	//0xDB80,		0xDBFF,		// High Private Use Surrogates
	//0xDC00,		0xDFFF,		// Low Surrogates
	0xE000,		0xF8FF,		// Private Use Area
	0xF900,		0xFAFF,		// CJK Compatibility Ideographs
	0xFB00,		0xFB4F,		// Alphabetic Presentation Forms
	0xFB50,		0xFDFF,		// Arabic Presentation Forms-A
	0xFE00,		0xFE0F,		// Variation Selectors
	0xFE10,		0xFE1F,		// Vertical Forms
	0xFE20,		0xFE2F,		// Combining Half Marks
	0xFE30,		0xFE4F,		// CJK Compatibility Forms
	0xFE50,		0xFE6F,		// Small Form Variants
	0xFE70,		0xFEFF,		// Arabic Presentation Forms-B
	0xFF00,		0xFFEF,		// Halfwidth and Fullwidth Forms
	0xFFF0,		0xFFFF,		// Specials
	0x10000,	0x1007F,	// Linear B Syllabary
	0x10080,	0x100FF,	// Linear B Ideograms
	0x10100,	0x1013F,	// Aegean Numbers
	0x10140,	0x1018F,	// Ancient Greek Numbers
	0x10190,	0x101CF,	// Ancient Symbols
	0x101D0,	0x101FF,	// Phaistos Disc
	0x10280,	0x1029F,	// Lycian
	0x102A0,	0x102DF,	// Carian
	0x10300,	0x1032F,	// Old Italic
	0x10330,	0x1034F,	// Gothic
	0x10380,	0x1039F,	// Ugaritic
	0x103A0,	0x103DF,	// Old Persian
	0x10400,	0x1044F,	// Deseret
	0x10450,	0x1047F,	// Shavian
	0x10480,	0x104AF,	// Osmanya
	0x10800,	0x1083F,	// Cypriot Syllabary
	0x10840,	0x1085F,	// Imperial Aramaic
	0x10900,	0x1091F,	// Phoenician
	0x10920,	0x1093F,	// Lydian
	0x10A00,	0x10A5F,	// Kharoshthi
	0x10A60,	0x10A7F,	// Old South Arabian
	0x10B00,	0x10B3F,	// Avestan
	0x10B40,	0x10B5F,	// Inscriptional Parthian
	0x10B60,	0x10B7F,	// Inscriptional Pahlavi
	0x10C00,	0x10C4F,	// Old Turkic
	0x10E60,	0x10E7F,	// Rumi Numeral Symbols
	0x11000,	0x1107F,	// Brahmi
	0x11080,	0x110CF,	// Kaithi
	0x12000,	0x123FF,	// Cuneiform
	0x12400,	0x1247F,	// Cuneiform Numbers and Punctuation
	0x13000,	0x1342F,	// Egyptian Hieroglyphs
	0x16800,	0x16A3F,	// Bamum Supplement
	0x1B000,	0x1B0FF,	// Kana Supplement
	0x1D000,	0x1D0FF,	// Byzantine Musical Symbols
	0x1D100,	0x1D1FF,	// Musical Symbols
	0x1D200,	0x1D24F,	// Ancient Greek Musical Notation
	0x1D300,	0x1D35F,	// Tai Xuan Jing Symbols
	0x1D360,	0x1D37F,	// Counting Rod Numerals
	0x1D400,	0x1D7FF,	// Mathematical Alphanumeric Symbols
	0x1F000,	0x1F02F,	// Mahjong Tiles
	0x1F030,	0x1F09F,	// Domino Tiles
	0x1F0A0,	0x1F0FF,	// Playing Cards
	0x1F100,	0x1F1FF,	// Enclosed Alphanumeric Supplement
	0x1F200,	0x1F2FF,	// Enclosed Ideographic Supplement
	0x1F300,	0x1F5FF,	// Miscellaneous Symbols And Pictographs
	0x1F600,	0x1F64F,	// Emoticons
	0x1F680,	0x1F6FF,	// Transport And Map Symbols
	0x1F700,	0x1F77F,	// Alchemical Symbols
	0x20000,	0x2A6DF,	// CJK Unified Ideographs Extension B
	0x2A700,	0x2B73F,	// CJK Unified Ideographs Extension C
	0x2B740,	0x2B81F,	// CJK Unified Ideographs Extension D
	0x2F800,	0x2FA1F,	// CJK Compatibility Ideographs Supplement
	0xE0000,	0xE007F,	// Tags
	0xE0100,	0xE01EF,	// Variation Selectors Supplement
	0xF0000,	0xFFFFF,	// Supplementary Private Use Area-A
	0x100000,	0x10FFFF,	// Supplementary Private Use Area-B
	0xFFFFFFFF
};

// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.

#define UTF8_ACCEPT 0u
#define UTF8_REJECT 12u

static const unsigned char utf8d[] = {
	// The first part of the table maps bytes to character classes that
	// to reduce the size of the transition table and create bitmasks.
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

	// The second part is a transition table that maps a combination
	// of a state of the automaton and a character class to a state.
	0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
	12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
	12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
	12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
	12,36,12,12,12,12,12,12,12,12,12,12, 
};

static unsigned inline decode(unsigned* state, unsigned* codep, unsigned byte) {
	unsigned type = utf8d[byte];

	*codep = (*state != UTF8_ACCEPT) ?
		(byte & 0x3fu) | (*codep << 6) :
	(0xff >> type) & (byte);

	*state = utf8d[256 + *state + type];
	return *state;
}

//static bool IsUTF8(unsigned char* s) {
//	unsigned codepoint, state = 0;
//
//	while (*s)
//		decode(&state, &codepoint, *s++);
//
//	return state == UTF8_ACCEPT;
//}

TEST(EncodingsTest, UTF8) {
	StringBuffer os, os2;
	for (const unsigned* range = kCodepointRanges; *range != 0xFFFFFFFF; range += 2) {
		for (unsigned codepoint = range[0]; codepoint <= range[1]; ++codepoint) {
			os.Clear();
			UTF8<>::Encode(os, codepoint);
			const char* encodedStr = os.GetString();

			// Decode with Hoehrmann
			{
				unsigned decodedCodepoint = 0;
				unsigned state = 0;

				unsigned decodedCount = 0;
				for (const char* s = encodedStr; *s; ++s)
					if (!decode(&state, &decodedCodepoint, (unsigned char)*s)) {
						EXPECT_EQ(codepoint, decodedCodepoint);
						decodedCount++;
					}

				if (*encodedStr)				// This decoder cannot handle U+0000
					EXPECT_EQ(1u, decodedCount);	// Should only contain one code point

				EXPECT_EQ(UTF8_ACCEPT, state);
				if (UTF8_ACCEPT != state)
					std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
			}

			// Decode
			{
				StringStream is(encodedStr);
				unsigned decodedCodepoint;
				bool result = UTF8<>::Decode(is, &decodedCodepoint);
				EXPECT_TRUE(result);
				EXPECT_EQ(codepoint, decodedCodepoint);
				if (!result || codepoint != decodedCodepoint)
					std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
			}

			// Validate
			{
				StringStream is(encodedStr);
				os2.Clear();
				bool result = UTF8<>::Validate(is, os2);
				EXPECT_TRUE(result);
				EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString()));
			}
		}
	}
}

TEST(EncodingsTest, UTF16) {
	GenericStringBuffer<UTF16<> > os, os2;
	GenericStringBuffer<UTF8<> > utf8os;
	for (const unsigned* range = kCodepointRanges; *range != 0xFFFFFFFF; range += 2) {
		for (unsigned codepoint = range[0]; codepoint <= range[1]; ++codepoint) {
			os.Clear();
			UTF16<>::Encode(os, codepoint);
			const UTF16<>::Ch* encodedStr = os.GetString();

			// Encode with Hoehrmann's code
			if (codepoint != 0)	// cannot handle U+0000
			{
				// encode with UTF8<> first
				utf8os.Clear();
				UTF8<>::Encode(utf8os, codepoint);

				// transcode from UTF8 to UTF16 with Hoehrmann's code
				unsigned decodedCodepoint = 0;
				unsigned state = 0;
				UTF16<>::Ch buffer[3], *p = &buffer[0];
				for (const char* s = utf8os.GetString(); *s; ++s) {
					if (!decode(&state, &decodedCodepoint, (unsigned char)*s))
						break;
				}

				if (codepoint <= 0xFFFF)
					*p++ = static_cast<UTF16<>::Ch>(decodedCodepoint);
				else {
					// Encode code points above U+FFFF as surrogate pair.
					*p++ = static_cast<UTF16<>::Ch>(0xD7C0 + (decodedCodepoint >> 10));
					*p++ = static_cast<UTF16<>::Ch>(0xDC00 + (decodedCodepoint & 0x3FF));
				}
				*p++ = '\0';

				EXPECT_EQ(0, StrCmp(buffer, encodedStr));
			}

			// Decode
			{
				GenericStringStream<UTF16<> > is(encodedStr);
				unsigned decodedCodepoint;
				bool result = UTF16<>::Decode(is, &decodedCodepoint);
				EXPECT_TRUE(result);
				EXPECT_EQ(codepoint, decodedCodepoint);			
				if (!result || codepoint != decodedCodepoint)
					std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
			}

			// Validate
			{
				GenericStringStream<UTF16<> > is(encodedStr);
				os2.Clear();
				bool result = UTF16<>::Validate(is, os2);
				EXPECT_TRUE(result);
				EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString()));
			}
		}
	}
}

TEST(EncodingsTest, UTF32) {
	GenericStringBuffer<UTF32<> > os, os2;
	for (const unsigned* range = kCodepointRanges; *range != 0xFFFFFFFF; range += 2) {
		for (unsigned codepoint = range[0]; codepoint <= range[1]; ++codepoint) {
			os.Clear();
			UTF32<>::Encode(os, codepoint);
			const UTF32<>::Ch* encodedStr = os.GetString();

			// Decode
			{
				GenericStringStream<UTF32<> > is(encodedStr);
				unsigned decodedCodepoint;
				bool result = UTF32<>::Decode(is, &decodedCodepoint);
				EXPECT_TRUE(result);
				EXPECT_EQ(codepoint, decodedCodepoint);			
				if (!result || codepoint != decodedCodepoint)
					std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
			}

			// Validate
			{
				GenericStringStream<UTF32<> > is(encodedStr);
				os2.Clear();
				bool result = UTF32<>::Validate(is, os2);
				EXPECT_TRUE(result);
				EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString()));
			}
		}
	}
}