Commit f4f7fc39 authored by wanglun's avatar wanglun

StringSplitter supports input containing embedded '\0' bytes, and the separator can be '\0'

parent 2685cd8c
...@@ -400,7 +400,7 @@ static void LoadSymbols() { ...@@ -400,7 +400,7 @@ static void LoadSymbols() {
size_t line_len = 0; size_t line_len = 0;
ssize_t nr = 0; ssize_t nr = 0;
while ((nr = getline(&line, &line_len, fp.get())) != -1) { while ((nr = getline(&line, &line_len, fp.get())) != -1) {
butil::StringSplitter sp(line, line + line_len, ' '); butil::StringSplitter sp(line, line + nr, ' ');
if (sp == NULL) { if (sp == NULL) {
continue; continue;
} }
......
...@@ -61,6 +61,8 @@ public: ...@@ -61,6 +61,8 @@ public:
// length() field() will be skipped. // length() field() will be skipped.
inline StringSplitter(const char* input, char separator, inline StringSplitter(const char* input, char separator,
EmptyFieldAction action = SKIP_EMPTY_FIELD); EmptyFieldAction action = SKIP_EMPTY_FIELD);
// If str_end is not NULL, the input may contain embedded '\0' characters
// and the separator itself may be '\0'.
inline StringSplitter(const char* str_begin, const char* str_end, inline StringSplitter(const char* str_begin, const char* str_end,
char separator, char separator,
EmptyFieldAction = SKIP_EMPTY_FIELD); EmptyFieldAction = SKIP_EMPTY_FIELD);
...@@ -113,9 +115,15 @@ public: ...@@ -113,9 +115,15 @@ public:
// longer than this utility. // longer than this utility.
inline StringMultiSplitter(const char* input, const char* separators, inline StringMultiSplitter(const char* input, const char* separators,
EmptyFieldAction action = SKIP_EMPTY_FIELD); EmptyFieldAction action = SKIP_EMPTY_FIELD);
// If str_end is not NULL, the input may contain embedded '\0' characters.
inline StringMultiSplitter(const char* str_begin, const char* str_end, inline StringMultiSplitter(const char* str_begin, const char* str_end,
const char* separators, const char* separators,
EmptyFieldAction action = SKIP_EMPTY_FIELD); EmptyFieldAction action = SKIP_EMPTY_FIELD);
// If str_end and seps_end are both not NULL, the separators may contain
// '\0'.
inline StringMultiSplitter(const char* str_begin, const char* str_end,
const char* seps_begin, const char* seps_end,
EmptyFieldAction action = SKIP_EMPTY_FIELD);
// Move splitter forward. // Move splitter forward.
inline StringMultiSplitter& operator++(); inline StringMultiSplitter& operator++();
...@@ -152,6 +160,7 @@ private: ...@@ -152,6 +160,7 @@ private:
const char* _tail; const char* _tail;
const char* _str_tail; const char* _str_tail;
const char* const _seps; const char* const _seps;
const char* const _seps_tail;
const EmptyFieldAction _empty_field_action; const EmptyFieldAction _empty_field_action;
}; };
......
...@@ -87,7 +87,7 @@ size_t StringSplitter::length() const { ...@@ -87,7 +87,7 @@ size_t StringSplitter::length() const {
} }
bool StringSplitter::not_end(const char* p) const { bool StringSplitter::not_end(const char* p) const {
return *p && p != _str_tail; return (_str_tail == NULL) ? *p : (p != _str_tail);
} }
int StringSplitter::to_int8(int8_t* pv) const { int StringSplitter::to_int8(int8_t* pv) const {
...@@ -167,6 +167,7 @@ StringMultiSplitter::StringMultiSplitter ( ...@@ -167,6 +167,7 @@ StringMultiSplitter::StringMultiSplitter (
: _head(str) : _head(str)
, _str_tail(NULL) , _str_tail(NULL)
, _seps(seps) , _seps(seps)
, _seps_tail(NULL)
, _empty_field_action(action) { , _empty_field_action(action) {
init(); init();
} }
...@@ -177,6 +178,18 @@ StringMultiSplitter::StringMultiSplitter ( ...@@ -177,6 +178,18 @@ StringMultiSplitter::StringMultiSplitter (
: _head(str_begin) : _head(str_begin)
, _str_tail(str_end) , _str_tail(str_end)
, _seps(seps) , _seps(seps)
, _seps_tail(NULL)
, _empty_field_action(action) {
init();
}
StringMultiSplitter::StringMultiSplitter (
const char* str_begin, const char* str_end,
const char* seps_begin, const char* seps_end, EmptyFieldAction action)
: _head(str_begin)
, _str_tail(str_end)
, _seps(seps_begin)
, _seps_tail(seps_end)
, _empty_field_action(action) { , _empty_field_action(action) {
init(); init();
} }
...@@ -213,7 +226,8 @@ StringMultiSplitter StringMultiSplitter::operator++(int) { ...@@ -213,7 +226,8 @@ StringMultiSplitter StringMultiSplitter::operator++(int) {
} }
bool StringMultiSplitter::is_sep(char c) const { bool StringMultiSplitter::is_sep(char c) const {
for (const char* p = _seps; *p != '\0'; ++p) { for (const char* p = _seps;
(_seps_tail == NULL) ? (*p != '\0') : (p != _seps_tail); ++p) {
if (c == *p) { if (c == *p) {
return true; return true;
} }
...@@ -234,7 +248,7 @@ size_t StringMultiSplitter::length() const { ...@@ -234,7 +248,7 @@ size_t StringMultiSplitter::length() const {
} }
bool StringMultiSplitter::not_end(const char* p) const { bool StringMultiSplitter::not_end(const char* p) const {
return *p && p != _str_tail; return (_str_tail == NULL) ? *p : (p != _str_tail);
} }
int StringMultiSplitter::to_int8(int8_t* pv) const { int StringMultiSplitter::to_int8(int8_t* pv) const {
......
...@@ -165,7 +165,7 @@ TEST_F(StringSplitterTest, site_id_as_example) { ...@@ -165,7 +165,7 @@ TEST_F(StringSplitterTest, site_id_as_example) {
} }
TEST_F(StringSplitterTest, number_list) { TEST_F(StringSplitterTest, number_list) {
const char* str = " 123,,12,1, 21 4321"; const char* str = " 123,,12,1, 21 4321\00056";
butil::StringMultiSplitter ss(str, ", "); butil::StringMultiSplitter ss(str, ", ");
ASSERT_TRUE(ss); ASSERT_TRUE(ss);
ASSERT_EQ(3ul, ss.length()); ASSERT_EQ(3ul, ss.length());
...@@ -195,6 +195,76 @@ TEST_F(StringSplitterTest, number_list) { ...@@ -195,6 +195,76 @@ TEST_F(StringSplitterTest, number_list) {
ASSERT_FALSE(ss); ASSERT_FALSE(ss);
ASSERT_EQ(0ul, ss.length()); ASSERT_EQ(0ul, ss.length());
ASSERT_EQ(ss.field(), str + strlen(str)); ASSERT_EQ(ss.field(), str + strlen(str));
// contains embedded '\0'
const size_t str_len = 23;
butil::StringMultiSplitter ss2(str, str + str_len, ", ");
ASSERT_TRUE(ss2);
ASSERT_EQ(3ul, ss2.length());
ASSERT_FALSE(strncmp(ss2.field(), "123", ss2.length()));
ss2++;
ASSERT_TRUE(ss2);
ASSERT_EQ(2ul, ss2.length());
ASSERT_FALSE(strncmp(ss2.field(), "12", ss2.length()));
ss2++;
ASSERT_TRUE(ss2);
ASSERT_EQ(1ul, ss2.length());
ASSERT_FALSE(strncmp(ss2.field(), "1", ss2.length()));
ss2++;
ASSERT_TRUE(ss2);
ASSERT_EQ(2ul, ss2.length());
ASSERT_FALSE(strncmp(ss2.field(), "21", ss2.length()));
ss2++;
ASSERT_TRUE(ss2);
ASSERT_EQ(7ul, ss2.length());
ASSERT_FALSE(strncmp(ss2.field(), "4321\00056", ss2.length()));
++ss2;
ASSERT_FALSE(ss2);
ASSERT_EQ(0ul, ss2.length());
ASSERT_EQ(ss2.field(), str + str_len);
// separators contain '\0'
const char* seps = ", \0";
const size_t seps_len = 3;
butil::StringMultiSplitter ss3(str, str + str_len, seps, seps + seps_len);
ASSERT_TRUE(ss3);
ASSERT_EQ(3ul, ss3.length());
ASSERT_FALSE(strncmp(ss3.field(), "123", ss3.length()));
ss3++;
ASSERT_TRUE(ss3);
ASSERT_EQ(2ul, ss3.length());
ASSERT_FALSE(strncmp(ss3.field(), "12", ss3.length()));
ss3++;
ASSERT_TRUE(ss3);
ASSERT_EQ(1ul, ss3.length());
ASSERT_FALSE(strncmp(ss3.field(), "1", ss3.length()));
ss3++;
ASSERT_TRUE(ss3);
ASSERT_EQ(2ul, ss3.length());
ASSERT_FALSE(strncmp(ss3.field(), "21", ss3.length()));
ss3++;
ASSERT_TRUE(ss3);
ASSERT_EQ(4ul, ss3.length());
ASSERT_FALSE(strncmp(ss3.field(), "4321", ss3.length()));
ss3++;
ASSERT_TRUE(ss3);
ASSERT_EQ(2ul, ss3.length());
ASSERT_FALSE(strncmp(ss3.field(), "56", ss3.length()));
++ss3;
ASSERT_FALSE(ss3);
ASSERT_EQ(0ul, ss3.length());
ASSERT_EQ(ss3.field(), str + str_len);
} }
TEST_F(StringSplitterTest, cast_type) { TEST_F(StringSplitterTest, cast_type) {
...@@ -258,7 +328,7 @@ TEST_F(StringSplitterTest, cast_type) { ...@@ -258,7 +328,7 @@ TEST_F(StringSplitterTest, cast_type) {
} }
TEST_F(StringSplitterTest, split_limit_len) { TEST_F(StringSplitterTest, split_limit_len) {
const char* str = "1\t123\t111\t1\t10\t11\t1.3\t3.1415926"; const char* str = "1\t1\0003\t111\t1\t10\t11\t1.3\t3.1415926";
butil::StringSplitter ss(str, str + 5, '\t'); butil::StringSplitter ss(str, str + 5, '\t');
ASSERT_TRUE(ss); ASSERT_TRUE(ss);
...@@ -268,10 +338,25 @@ TEST_F(StringSplitterTest, split_limit_len) { ...@@ -268,10 +338,25 @@ TEST_F(StringSplitterTest, split_limit_len) {
++ss; ++ss;
ASSERT_TRUE(ss); ASSERT_TRUE(ss);
ASSERT_EQ(3ul, ss.length()); ASSERT_EQ(3ul, ss.length());
ASSERT_FALSE(strncmp(ss.field(), "123", ss.length())); ASSERT_FALSE(strncmp(ss.field(), "1\0003", ss.length()));
++ss; ++ss;
ASSERT_FALSE(ss); ASSERT_FALSE(ss);
// Allows using '\0' as separator
butil::StringSplitter ss2(str, str + 5, '\0');
ASSERT_TRUE(ss2);
ASSERT_EQ(3ul, ss2.length());
ASSERT_FALSE(strncmp(ss2.field(), "1\t1", ss2.length()));
++ss2;
ASSERT_TRUE(ss2);
ASSERT_EQ(1ul, ss2.length());
ASSERT_FALSE(strncmp(ss2.field(), "3", ss2.length()));
++ss2;
ASSERT_FALSE(ss2);
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment